From: Pat Schloss Date: Mon, 22 Oct 2012 13:18:06 +0000 (-0400) Subject: Merge remote-tracking branch 'mothur/master' X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=90708fe9701e3827e477c82fb3652539c3bf2a0d;hp=f320651dfb5359e6bba597280753553de28a2154 Merge remote-tracking branch 'mothur/master' --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 979b1e6..ecb0619 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -19,11 +19,21 @@ A71CB160130B04A2001E7287 /* anosimcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A71CB15E130B04A2001E7287 /* anosimcommand.cpp */; }; A71FE12C12EDF72400963CA7 /* mergegroupscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A71FE12B12EDF72400963CA7 /* mergegroupscommand.cpp */; }; A721765713BB9F7D0014DAAE /* referencedb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721765613BB9F7D0014DAAE /* referencedb.cpp */; }; + A721AB6A161C570F009860A1 /* alignnode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB66161C570F009860A1 /* alignnode.cpp */; }; + A721AB6B161C570F009860A1 /* aligntree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB68161C570F009860A1 /* aligntree.cpp */; }; + A721AB71161C572A009860A1 /* kmernode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB6D161C572A009860A1 /* kmernode.cpp */; }; + A721AB72161C572A009860A1 /* kmertree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB6F161C572A009860A1 /* kmertree.cpp */; }; + A721AB77161C573B009860A1 /* taxonomynode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB73161C573B009860A1 /* taxonomynode.cpp */; }; A724D2B7153C8628000A826F /* makebiomcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A724D2B6153C8628000A826F /* makebiomcommand.cpp */; }; A727864412E9E28C00F86ABA /* removerarecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A727864312E9E28C00F86ABA /* removerarecommand.cpp */; }; + 
A7386C231619CCE600651424 /* classifysharedcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7386C211619CCE600651424 /* classifysharedcommand.cpp */; }; + A7386C251619E52300651424 /* abstractdecisiontree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7386C241619E52200651424 /* abstractdecisiontree.cpp */; }; + A7386C27161A0F9D00651424 /* abstractrandomforest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7386C26161A0F9C00651424 /* abstractrandomforest.cpp */; }; + A7386C29161A110800651424 /* decisiontree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7386C28161A110700651424 /* decisiontree.cpp */; }; A73901081588C40900ED2ED6 /* loadlogfilecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73901071588C40900ED2ED6 /* loadlogfilecommand.cpp */; }; A73DDBBA13C4A0D1006AAE38 /* clearmemorycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73DDBB913C4A0D1006AAE38 /* clearmemorycommand.cpp */; }; A73DDC3813C4BF64006AAE38 /* mothurmetastats.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73DDC3713C4BF64006AAE38 /* mothurmetastats.cpp */; }; + A741FAD215D1688E0067BCC5 /* sequencecountparser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A741FAD115D1688E0067BCC5 /* sequencecountparser.cpp */; }; A74A9A9F148E881E00AB5E3E /* spline.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74A9A9E148E881E00AB5E3E /* spline.cpp */; }; A74D36B8137DAFAA00332B0C /* chimerauchimecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D36B7137DAFAA00332B0C /* chimerauchimecommand.cpp */; }; A74D59A4159A1E2000043046 /* counttable.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D59A3159A1E2000043046 /* counttable.cpp */; }; @@ -36,6 +46,8 @@ A77410F614697C300098E6AC /* seqnoise.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77410F414697C300098E6AC /* seqnoise.cpp */; }; A778FE6B134CA6CA00C0BA33 /* getcommandinfocommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A778FE6A134CA6CA00C0BA33 /* getcommandinfocommand.cpp */; 
}; A77A221F139001B600B0BE70 /* deuniquetreecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77A221E139001B600B0BE70 /* deuniquetreecommand.cpp */; }; + A77E1938161B201E00DB1A2A /* randomforest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77E1937161B201E00DB1A2A /* randomforest.cpp */; }; + A77E193B161B289600DB1A2A /* rftreenode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77E193A161B289600DB1A2A /* rftreenode.cpp */; }; A77EBD2F1523709100ED407C /* createdatabasecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77EBD2E1523709100ED407C /* createdatabasecommand.cpp */; }; A7876A26152A017C00A0AE86 /* subsample.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7876A25152A017C00A0AE86 /* subsample.cpp */; }; A79234D713C74BF6002B08E2 /* mothurfisher.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A79234D613C74BF6002B08E2 /* mothurfisher.cpp */; }; @@ -50,6 +62,7 @@ A7BF2232145879B2000AD524 /* chimeraperseuscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7BF2231145879B2000AD524 /* chimeraperseuscommand.cpp */; }; A7C3DC0B14FE457500FE1924 /* cooccurrencecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7C3DC0914FE457500FE1924 /* cooccurrencecommand.cpp */; }; A7C3DC0F14FE469500FE1924 /* trialSwap2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7C3DC0D14FE469500FE1924 /* trialSwap2.cpp */; }; + A7C7DAB915DA758B0059B0CF /* sffmultiplecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7C7DAB815DA758B0059B0CF /* sffmultiplecommand.cpp */; }; A7D755DA1535F679009BF21A /* treereader.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7D755D91535F679009BF21A /* treereader.cpp */; }; A7E0243D15B4520A00A5F046 /* sparsedistancematrix.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E0243C15B4520A00A5F046 /* sparsedistancematrix.cpp */; }; A7E9B88112D37EC400DA6239 /* ace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B64F12D37EC300DA6239 /* ace.cpp */; }; @@ -387,16 +400,39 @@ 
A71FE12B12EDF72400963CA7 /* mergegroupscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mergegroupscommand.cpp; sourceTree = ""; }; A721765513BB9F7D0014DAAE /* referencedb.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = referencedb.h; sourceTree = ""; }; A721765613BB9F7D0014DAAE /* referencedb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = referencedb.cpp; sourceTree = ""; }; + A721AB66161C570F009860A1 /* alignnode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignnode.cpp; sourceTree = ""; }; + A721AB67161C570F009860A1 /* alignnode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = alignnode.h; sourceTree = ""; }; + A721AB68161C570F009860A1 /* aligntree.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = aligntree.cpp; sourceTree = ""; }; + A721AB69161C570F009860A1 /* aligntree.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = aligntree.h; sourceTree = ""; }; + A721AB6D161C572A009860A1 /* kmernode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = kmernode.cpp; sourceTree = ""; }; + A721AB6E161C572A009860A1 /* kmernode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kmernode.h; sourceTree = ""; }; + A721AB6F161C572A009860A1 /* kmertree.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = kmertree.cpp; sourceTree = ""; }; + A721AB70161C572A009860A1 /* kmertree.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kmertree.h; sourceTree = ""; }; + A721AB73161C573B009860A1 /* taxonomynode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType 
= sourcecode.cpp.cpp; path = taxonomynode.cpp; sourceTree = ""; }; + A721AB74161C573B009860A1 /* taxonomynode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = taxonomynode.h; sourceTree = ""; }; A724D2B4153C8600000A826F /* makebiomcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = makebiomcommand.h; sourceTree = ""; }; A724D2B6153C8628000A826F /* makebiomcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = makebiomcommand.cpp; sourceTree = ""; }; A727864212E9E28C00F86ABA /* removerarecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = removerarecommand.h; sourceTree = ""; }; A727864312E9E28C00F86ABA /* removerarecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = removerarecommand.cpp; sourceTree = ""; }; + A7386C1B1619CACB00651424 /* abstractdecisiontree.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = abstractdecisiontree.hpp; sourceTree = ""; }; + A7386C1C1619CACB00651424 /* abstractrandomforest.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = abstractrandomforest.hpp; sourceTree = ""; }; + A7386C1D1619CACB00651424 /* decisiontree.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = decisiontree.hpp; sourceTree = ""; }; + A7386C1E1619CACB00651424 /* macros.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = macros.h; sourceTree = ""; }; + A7386C1F1619CACB00651424 /* randomforest.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = randomforest.hpp; sourceTree = ""; }; + A7386C201619CACB00651424 /* rftreenode.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = 
sourcecode.cpp.h; path = rftreenode.hpp; sourceTree = ""; }; + A7386C211619CCE600651424 /* classifysharedcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = classifysharedcommand.cpp; sourceTree = ""; }; + A7386C221619CCE600651424 /* classifysharedcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = classifysharedcommand.h; sourceTree = ""; }; + A7386C241619E52200651424 /* abstractdecisiontree.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = abstractdecisiontree.cpp; sourceTree = ""; }; + A7386C26161A0F9C00651424 /* abstractrandomforest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = abstractrandomforest.cpp; sourceTree = ""; }; + A7386C28161A110700651424 /* decisiontree.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decisiontree.cpp; sourceTree = ""; }; A73901051588C3EF00ED2ED6 /* loadlogfilecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = loadlogfilecommand.h; sourceTree = ""; }; A73901071588C40900ED2ED6 /* loadlogfilecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = loadlogfilecommand.cpp; sourceTree = ""; }; A73DDBB813C4A0D1006AAE38 /* clearmemorycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = clearmemorycommand.h; sourceTree = ""; }; A73DDBB913C4A0D1006AAE38 /* clearmemorycommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = clearmemorycommand.cpp; sourceTree = ""; }; A73DDC3613C4BF64006AAE38 /* mothurmetastats.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mothurmetastats.h; sourceTree = ""; }; A73DDC3713C4BF64006AAE38 /* mothurmetastats.cpp */ = {isa 
= PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mothurmetastats.cpp; sourceTree = ""; }; + A741FAD115D1688E0067BCC5 /* sequencecountparser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sequencecountparser.cpp; sourceTree = ""; }; + A741FAD415D168A00067BCC5 /* sequencecountparser.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sequencecountparser.h; sourceTree = ""; }; A74A9A9D148E881E00AB5E3E /* spline.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = spline.h; sourceTree = ""; }; A74A9A9E148E881E00AB5E3E /* spline.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = spline.cpp; sourceTree = ""; }; A74D36B6137DAFAA00332B0C /* chimerauchimecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerauchimecommand.h; sourceTree = ""; }; @@ -421,6 +457,8 @@ A778FE6A134CA6CA00C0BA33 /* getcommandinfocommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getcommandinfocommand.cpp; sourceTree = ""; }; A77A221D139001B600B0BE70 /* deuniquetreecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = deuniquetreecommand.h; sourceTree = ""; }; A77A221E139001B600B0BE70 /* deuniquetreecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = deuniquetreecommand.cpp; sourceTree = ""; }; + A77E1937161B201E00DB1A2A /* randomforest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = randomforest.cpp; sourceTree = ""; }; + A77E193A161B289600DB1A2A /* rftreenode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = rftreenode.cpp; sourceTree = ""; }; A77EBD2C1523707F00ED407C /* 
createdatabasecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = createdatabasecommand.h; sourceTree = ""; }; A77EBD2E1523709100ED407C /* createdatabasecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = createdatabasecommand.cpp; sourceTree = ""; }; A7876A25152A017C00A0AE86 /* subsample.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = subsample.cpp; sourceTree = ""; }; @@ -451,6 +489,8 @@ A7C3DC0A14FE457500FE1924 /* cooccurrencecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = cooccurrencecommand.h; sourceTree = ""; }; A7C3DC0D14FE469500FE1924 /* trialSwap2.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = trialSwap2.cpp; sourceTree = ""; }; A7C3DC0E14FE469500FE1924 /* trialswap2.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = trialswap2.h; sourceTree = ""; }; + A7C7DAB615DA75760059B0CF /* sffmultiplecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sffmultiplecommand.h; sourceTree = ""; }; + A7C7DAB815DA758B0059B0CF /* sffmultiplecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sffmultiplecommand.cpp; sourceTree = ""; }; A7D755D71535F665009BF21A /* treereader.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = treereader.h; sourceTree = ""; }; A7D755D91535F679009BF21A /* treereader.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = treereader.cpp; sourceTree = ""; }; A7DAAFA3133A254E003956EB /* commandparameter.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = commandparameter.h; sourceTree = ""; }; @@ -1126,6 +1166,7 @@ 
A7E9B79B12D37EC400DA6239 /* progress.cpp */, A7E9B79C12D37EC400DA6239 /* progress.hpp */, A7E9B7A512D37EC400DA6239 /* rarecalc.cpp */, + A7386C191619C9FB00651424 /* randomforest */, A7E9B7A612D37EC400DA6239 /* rarecalc.h */, A7E9B7A712D37EC400DA6239 /* raredisplay.cpp */, A7E9B7A812D37EC400DA6239 /* raredisplay.h */, @@ -1167,6 +1208,24 @@ name = Products; sourceTree = ""; }; + A7386C191619C9FB00651424 /* randomforest */ = { + isa = PBXGroup; + children = ( + A7386C1B1619CACB00651424 /* abstractdecisiontree.hpp */, + A7386C241619E52200651424 /* abstractdecisiontree.cpp */, + A7386C1C1619CACB00651424 /* abstractrandomforest.hpp */, + A7386C26161A0F9C00651424 /* abstractrandomforest.cpp */, + A7386C1D1619CACB00651424 /* decisiontree.hpp */, + A7386C28161A110700651424 /* decisiontree.cpp */, + A7386C1E1619CACB00651424 /* macros.h */, + A7386C1F1619CACB00651424 /* randomforest.hpp */, + A77E1937161B201E00DB1A2A /* randomforest.cpp */, + A7386C201619CACB00651424 /* rftreenode.hpp */, + A77E193A161B289600DB1A2A /* rftreenode.cpp */, + ); + name = randomforest; + sourceTree = ""; + }; A7D161E7149F7F50000523E8 /* fortran */ = { isa = PBXGroup; children = ( @@ -1223,6 +1282,8 @@ A7E9B69012D37EC400DA6239 /* classifyotucommand.cpp */, A7E9B69312D37EC400DA6239 /* classifyseqscommand.h */, A7E9B69212D37EC400DA6239 /* classifyseqscommand.cpp */, + A7386C221619CCE600651424 /* classifysharedcommand.h */, + A7386C211619CCE600651424 /* classifysharedcommand.cpp */, A7EEB0F714F29C1B00344B83 /* classifytreecommand.h */, A7EEB0F414F29BFD00344B83 /* classifytreecommand.cpp */, A7E9B69712D37EC400DA6239 /* clearcutcommand.h */, @@ -1407,6 +1468,8 @@ A7E9B7E112D37EC400DA6239 /* setlogfilecommand.cpp */, A7E9B7E412D37EC400DA6239 /* sffinfocommand.h */, A7E9B7E312D37EC400DA6239 /* sffinfocommand.cpp */, + A7C7DAB615DA75760059B0CF /* sffmultiplecommand.h */, + A7C7DAB815DA758B0059B0CF /* sffmultiplecommand.cpp */, A7E9B7F312D37EC400DA6239 /* sharedcommand.h */, A7E9B7F212D37EC400DA6239 /* 
sharedcommand.cpp */, A7E9B82812D37EC400DA6239 /* shhhercommand.h */, @@ -1659,6 +1722,8 @@ A7E9B7D012D37EC400DA6239 /* sabundvector.hpp */, A7E9B7DB12D37EC400DA6239 /* sequence.cpp */, A7E9B7DC12D37EC400DA6239 /* sequence.hpp */, + A741FAD415D168A00067BCC5 /* sequencecountparser.h */, + A741FAD115D1688E0067BCC5 /* sequencecountparser.cpp */, A7E9B7DD12D37EC400DA6239 /* sequencedb.cpp */, A7E9B7DE12D37EC400DA6239 /* sequencedb.h */, A7F9F5CD141A5E500032F693 /* sequenceparser.h */, @@ -1725,10 +1790,18 @@ A7E9BA4B12D3966900DA6239 /* classifier */ = { isa = PBXGroup; children = ( - A7E9B65A12D37EC300DA6239 /* bayesian.cpp */, + A721AB67161C570F009860A1 /* alignnode.h */, + A721AB66161C570F009860A1 /* alignnode.cpp */, + A721AB69161C570F009860A1 /* aligntree.h */, + A721AB68161C570F009860A1 /* aligntree.cpp */, A7E9B65B12D37EC300DA6239 /* bayesian.h */, + A7E9B65A12D37EC300DA6239 /* bayesian.cpp */, A7E9B68E12D37EC400DA6239 /* classify.cpp */, A7E9B68F12D37EC400DA6239 /* classify.h */, + A721AB6E161C572A009860A1 /* kmernode.h */, + A721AB6D161C572A009860A1 /* kmernode.cpp */, + A721AB70161C572A009860A1 /* kmertree.h */, + A721AB6F161C572A009860A1 /* kmertree.cpp */, A7E9B73812D37EC400DA6239 /* knn.h */, A7E9B73712D37EC400DA6239 /* knn.cpp */, A7E9B78D12D37EC400DA6239 /* phylosummary.cpp */, @@ -1737,6 +1810,8 @@ A7E9B79012D37EC400DA6239 /* phylotree.h */, A7E9B85D12D37EC400DA6239 /* taxonomyequalizer.cpp */, A7E9B85E12D37EC400DA6239 /* taxonomyequalizer.h */, + A721AB74161C573B009860A1 /* taxonomynode.h */, + A721AB73161C573B009860A1 /* taxonomynode.cpp */, ); name = classifier; sourceTree = ""; @@ -2192,6 +2267,19 @@ A73901081588C40900ED2ED6 /* loadlogfilecommand.cpp in Sources */, A74D59A4159A1E2000043046 /* counttable.cpp in Sources */, A7E0243D15B4520A00A5F046 /* sparsedistancematrix.cpp in Sources */, + A741FAD215D1688E0067BCC5 /* sequencecountparser.cpp in Sources */, + A7C7DAB915DA758B0059B0CF /* sffmultiplecommand.cpp in Sources */, + A7386C231619CCE600651424 
/* classifysharedcommand.cpp in Sources */, + A7386C251619E52300651424 /* abstractdecisiontree.cpp in Sources */, + A7386C27161A0F9D00651424 /* abstractrandomforest.cpp in Sources */, + A7386C29161A110800651424 /* decisiontree.cpp in Sources */, + A77E1938161B201E00DB1A2A /* randomforest.cpp in Sources */, + A77E193B161B289600DB1A2A /* rftreenode.cpp in Sources */, + A721AB6A161C570F009860A1 /* alignnode.cpp in Sources */, + A721AB6B161C570F009860A1 /* aligntree.cpp in Sources */, + A721AB71161C572A009860A1 /* kmernode.cpp in Sources */, + A721AB72161C572A009860A1 /* kmertree.cpp in Sources */, + A721AB77161C573B009860A1 /* taxonomynode.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -2275,8 +2363,8 @@ GCC_MODEL_TUNING = ""; GCC_OPTIMIZATION_LEVEL = 3; GCC_PREPROCESSOR_DEFINITIONS = ( - "VERSION=\"\\\"1.26.0\\\"\"", - "RELEASE_DATE=\"\\\"7/9/2012\\\"\"", + "VERSION=\"\\\"1.27.0\\\"\"", + "RELEASE_DATE=\"\\\"8/8/2012\\\"\"", ); GCC_WARN_ABOUT_MISSING_NEWLINE = YES; GCC_WARN_ABOUT_RETURN_TYPE = YES; diff --git a/abstractdecisiontree.cpp b/abstractdecisiontree.cpp new file mode 100644 index 0000000..085cd31 --- /dev/null +++ b/abstractdecisiontree.cpp @@ -0,0 +1,285 @@ +// +// abstractdecisiontree.cpp +// Mothur +// +// Created by Sarah Westcott on 10/1/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. 
+// + +#include "abstractdecisiontree.hpp" + +/**************************************************************************************************/ + +AbstractDecisionTree::AbstractDecisionTree(vector >baseDataSet, + vector globalDiscardedFeatureIndices, + OptimumFeatureSubsetSelector optimumFeatureSubsetSelector, + string treeSplitCriterion) : baseDataSet(baseDataSet), +numSamples((int)baseDataSet.size()), +numFeatures((int)(baseDataSet[0].size() - 1)), +numOutputClasses(0), +rootNode(NULL), +globalDiscardedFeatureIndices(globalDiscardedFeatureIndices), +optimumFeatureSubsetSize(optimumFeatureSubsetSelector.getOptimumFeatureSubsetSize(numFeatures)), +treeSplitCriterion(treeSplitCriterion) { + + try { + // TODO: istead of calculating this for every DecisionTree + // clacualte this once in the RandomForest class and pass the values + m = MothurOut::getInstance(); + for (int i = 0; i < numSamples; i++) { + if (m->control_pressed) { break; } + int outcome = baseDataSet[i][numFeatures]; + vector::iterator it = find(outputClasses.begin(), outputClasses.end(), outcome); + if (it == outputClasses.end()){ // find() will return classes.end() if the element is not found + outputClasses.push_back(outcome); + numOutputClasses++; + } + } + + if (m->debug) { + //m->mothurOut("outputClasses = " + toStringVectorInt(outputClasses)); + m->mothurOut("numOutputClasses = " + toString(numOutputClasses) + '\n'); + } + + } + catch(exception& e) { + m->errorOut(e, "AbstractDecisionTree", "AbstractDecisionTree"); + exit(1); + } +} +/**************************************************************************************************/ +int AbstractDecisionTree::createBootStrappedSamples(){ + try { + vector isInTrainingSamples(numSamples, false); + + for (int i = 0; i < numSamples; i++) { + if (m->control_pressed) { return 0; } + // TODO: optimize the rand() function call + double check if it's working properly + int randomIndex = rand() % numSamples; + 
bootstrappedTrainingSamples.push_back(baseDataSet[randomIndex]); + isInTrainingSamples[randomIndex] = true; + } + + for (int i = 0; i < numSamples; i++) { + if (m->control_pressed) { return 0; } + if (isInTrainingSamples[i]){ bootstrappedTrainingSampleIndices.push_back(i); } + else{ + bootstrappedTestSamples.push_back(baseDataSet[i]); + bootstrappedTestSampleIndices.push_back(i); + } + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "AbstractDecisionTree", "createBootStrappedSamples"); + exit(1); + } +} +/**************************************************************************************************/ +int AbstractDecisionTree::getMinEntropyOfFeature(vector featureVector, vector outputVector, double& minEntropy, int& featureSplitValue, double& intrinsicValue){ + try { + + vector< vector > featureOutputPair(featureVector.size(), vector(2, 0)); + for (int i = 0; i < featureVector.size(); i++) { + if (m->control_pressed) { return 0; } + featureOutputPair[i][0] = featureVector[i]; + featureOutputPair[i][1] = outputVector[i]; + } + // TODO: using default behavior to sort(), need to specify the comparator for added safety and compiler portability + sort(featureOutputPair.begin(), featureOutputPair.end()); + + + vector splitPoints; + vector uniqueFeatureValues(1, featureOutputPair[0][0]); + + for (int i = 0; i < featureOutputPair.size(); i++) { + if (m->control_pressed) { return 0; } + int featureValue = featureOutputPair[i][0]; + vector::iterator it = find(uniqueFeatureValues.begin(), uniqueFeatureValues.end(), featureValue); + if (it == uniqueFeatureValues.end()){ // NOT FOUND + uniqueFeatureValues.push_back(featureValue); + splitPoints.push_back(i); + } + } + + + + int bestSplitIndex = -1; + if (splitPoints.size() == 0){ + // TODO: trying out C++'s infitinity, don't know if this will work properly + // TODO: check the caller function of this function, there check the value if minEntropy and comapre to inf + // so that no wrong calculation is done + 
minEntropy = numeric_limits::infinity(); // OUTPUT + intrinsicValue = numeric_limits::infinity(); // OUTPUT + featureSplitValue = -1; // OUTPUT + }else{ + getBestSplitAndMinEntropy(featureOutputPair, splitPoints, minEntropy, bestSplitIndex, intrinsicValue); // OUTPUT + featureSplitValue = featureOutputPair[splitPoints[bestSplitIndex]][0]; // OUTPUT + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "AbstractDecisionTree", "getMinEntropyOfFeature"); + exit(1); + } +} +/**************************************************************************************************/ +double AbstractDecisionTree::calcIntrinsicValue(int numLessThanValueAtSplitPoint, int numGreaterThanValueAtSplitPoint, int numSamples) { + try { + double upperSplitEntropy = 0.0, lowerSplitEntropy = 0.0; + if (numLessThanValueAtSplitPoint > 0) { + upperSplitEntropy = numLessThanValueAtSplitPoint * log2((double) numLessThanValueAtSplitPoint / (double) numSamples); + } + + if (numGreaterThanValueAtSplitPoint > 0) { + lowerSplitEntropy = numGreaterThanValueAtSplitPoint * log2((double) numGreaterThanValueAtSplitPoint / (double) numSamples); + } + + double intrinsicValue = - ((double)(upperSplitEntropy + lowerSplitEntropy) / (double)numSamples); + return intrinsicValue; + } + catch(exception& e) { + m->errorOut(e, "AbstractDecisionTree", "calcIntrinsicValue"); + exit(1); + } +} +/**************************************************************************************************/ +int AbstractDecisionTree::getBestSplitAndMinEntropy(vector< vector > featureOutputPairs, vector splitPoints, + double& minEntropy, int& minEntropyIndex, double& relatedIntrinsicValue){ + try { + + int numSamples = (int)featureOutputPairs.size(); + vector entropies; + vector intrinsicValues; + + for (int i = 0; i < splitPoints.size(); i++) { + if (m->control_pressed) { return 0; } + int index = splitPoints[i]; + int valueAtSplitPoint = featureOutputPairs[index][0]; + int numLessThanValueAtSplitPoint = 0; + int 
numGreaterThanValueAtSplitPoint = 0; + + for (int j = 0; j < featureOutputPairs.size(); j++) { + if (m->control_pressed) { return 0; } + vector record = featureOutputPairs[j]; + if (record[0] < valueAtSplitPoint){ numLessThanValueAtSplitPoint++; } + else{ numGreaterThanValueAtSplitPoint++; } + } + + double upperEntropyOfSplit = calcSplitEntropy(featureOutputPairs, index, numOutputClasses, true); + double lowerEntropyOfSplit = calcSplitEntropy(featureOutputPairs, index, numOutputClasses, false); + + double totalEntropy = (numLessThanValueAtSplitPoint * upperEntropyOfSplit + numGreaterThanValueAtSplitPoint * lowerEntropyOfSplit) / (double)numSamples; + double intrinsicValue = calcIntrinsicValue(numLessThanValueAtSplitPoint, numGreaterThanValueAtSplitPoint, numSamples); + entropies.push_back(totalEntropy); + intrinsicValues.push_back(intrinsicValue); + + } + + // set output values + vector::iterator it = min_element(entropies.begin(), entropies.end()); + minEntropy = *it; // OUTPUT + minEntropyIndex = (int)(it - entropies.begin()); // OUTPUT + relatedIntrinsicValue = intrinsicValues[minEntropyIndex]; // OUTPUT + + return 0; + } + catch(exception& e) { + m->errorOut(e, "AbstractDecisionTree", "getBestSplitAndMinEntropy"); + exit(1); + } +} +/**************************************************************************************************/ + +double AbstractDecisionTree::calcSplitEntropy(vector< vector > featureOutputPairs, int splitIndex, int numOutputClasses, bool isUpperSplit = true) { + try { + vector classCounts(numOutputClasses, 0); + + if (isUpperSplit) { + for (int i = 0; i < splitIndex; i++) { + if (m->control_pressed) { return 0; } + classCounts[featureOutputPairs[i][1]]++; + } + } else { + for (int i = splitIndex; i < featureOutputPairs.size(); i++) { + if (m->control_pressed) { return 0; } + classCounts[featureOutputPairs[i][1]]++; + } + } + + int totalClassCounts = accumulate(classCounts.begin(), classCounts.end(), 0); + + double splitEntropy = 0.0; + + 
for (int i = 0; i < classCounts.size(); i++) { + if (m->control_pressed) { return 0; } + if (classCounts[i] == 0) { continue; } + double probability = (double) classCounts[i] / (double) totalClassCounts; + splitEntropy += -(probability * log2(probability)); + } + + return splitEntropy; + } + catch(exception& e) { + m->errorOut(e, "AbstractDecisionTree", "calcSplitEntropy"); + exit(1); + } +} + +/**************************************************************************************************/ + +int AbstractDecisionTree::getSplitPopulation(RFTreeNode* node, vector< vector >& leftChildSamples, vector< vector >& rightChildSamples){ + try { + // TODO: there is a possibility of optimization if we can recycle the samples in each nodes + // we just need to pointers to the samples i.e. vector and use it everywhere and not create the sample + // sample over and over again + // we need to make this const so that it is not modified by all the function calling + // currently purgeTreeNodesDataRecursively() is used for the same purpose, but this can be avoided altogher + // if re-using the same data over the classes + + int splitFeatureGlobalIndex = node->getSplitFeatureIndex(); + + for (int i = 0; i < node->getBootstrappedTrainingSamples().size(); i++) { + if (m->control_pressed) { return 0; } + vector sample = node->getBootstrappedTrainingSamples()[i]; + if (m->control_pressed) { return 0; } + if (sample[splitFeatureGlobalIndex] < node->getSplitFeatureValue()){ leftChildSamples.push_back(sample); } + else{ rightChildSamples.push_back(sample); } + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "AbstractDecisionTree", "getSplitPopulation"); + exit(1); + } +} +/**************************************************************************************************/ +// TODO: checkIfAlreadyClassified() verify code +// TODO: use bootstrappedOutputVector for easier calculation instead of using getBootstrappedTrainingSamples() +bool 
AbstractDecisionTree::checkIfAlreadyClassified(RFTreeNode* treeNode, int& outputClass) { + try { + + vector tempOutputClasses; + for (int i = 0; i < treeNode->getBootstrappedTrainingSamples().size(); i++) { + if (m->control_pressed) { return 0; } + int sampleOutputClass = treeNode->getBootstrappedTrainingSamples()[i][numFeatures]; + vector::iterator it = find(tempOutputClasses.begin(), tempOutputClasses.end(), sampleOutputClass); + if (it == tempOutputClasses.end()) { // NOT FOUND + tempOutputClasses.push_back(sampleOutputClass); + } + } + + if (tempOutputClasses.size() < 2) { outputClass = tempOutputClasses[0]; return true; } + else { outputClass = -1; return false; } + + } + catch(exception& e) { + m->errorOut(e, "AbstractDecisionTree", "checkIfAlreadyClassified"); + exit(1); + } +} + +/**************************************************************************************************/ diff --git a/abstractdecisiontree.hpp b/abstractdecisiontree.hpp new file mode 100755 index 0000000..3445db4 --- /dev/null +++ b/abstractdecisiontree.hpp @@ -0,0 +1,63 @@ +// +// abstractdecisiontree.hpp +// rrf-fs-prototype +// +// Created by Abu Zaher Faridee on 7/22/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. 
+// + +#ifndef rrf_fs_prototype_abstractdecisiontree_hpp +#define rrf_fs_prototype_abstractdecisiontree_hpp + +#include "mothurout.h" +#include "macros.h" +#include "rftreenode.hpp" + +#define DEBUG_MODE + +/**************************************************************************************************/ + +class AbstractDecisionTree{ + +public: + + AbstractDecisionTree(vector >baseDataSet, + vector globalDiscardedFeatureIndices, + OptimumFeatureSubsetSelector optimumFeatureSubsetSelector, + string treeSplitCriterion); + virtual ~AbstractDecisionTree(){} + + +protected: + + virtual int createBootStrappedSamples(); + virtual int getMinEntropyOfFeature(vector featureVector, vector outputVector, double& minEntropy, int& featureSplitValue, double& intrinsicValue); + virtual int getBestSplitAndMinEntropy(vector< vector > featureOutputPairs, vector splitPoints, double& minEntropy, int& minEntropyIndex, double& relatedIntrinsicValue); + virtual double calcIntrinsicValue(int numLessThanValueAtSplitPoint, int numGreaterThanValueAtSplitPoint, int numSamples); + virtual double calcSplitEntropy(vector< vector > featureOutputPairs, int splitIndex, int numOutputClasses, bool); + virtual int getSplitPopulation(RFTreeNode* node, vector< vector >& leftChildSamples, vector< vector >& rightChildSamples); + virtual bool checkIfAlreadyClassified(RFTreeNode* treeNode, int& outputClass); + + vector< vector > baseDataSet; + int numSamples; + int numFeatures; + int numOutputClasses; + vector outputClasses; + vector< vector > bootstrappedTrainingSamples; + vector bootstrappedTrainingSampleIndices; + vector< vector > bootstrappedTestSamples; + vector bootstrappedTestSampleIndices; + + RFTreeNode* rootNode; + vector globalDiscardedFeatureIndices; + int optimumFeatureSubsetSize; + string treeSplitCriterion; + MothurOut* m; + +private: + + +}; +/**************************************************************************************************/ + +#endif diff --git a/abstractrandomforest.cpp 
b/abstractrandomforest.cpp new file mode 100644 index 0000000..ae60b77 --- /dev/null +++ b/abstractrandomforest.cpp @@ -0,0 +1,58 @@ +// +// abstractrandomforest.cpp +// Mothur +// +// Created by Sarah Westcott on 10/1/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. +// + +#include "abstractrandomforest.hpp" + +/***********************************************************************/ +AbstractRandomForest::AbstractRandomForest(const std::vector < std::vector > dataSet, + const int numDecisionTrees, + const string treeSplitCriterion = "informationGain") +: dataSet(dataSet), +numDecisionTrees(numDecisionTrees), +numSamples((int)dataSet.size()), +numFeatures((int)(dataSet[0].size() - 1)), +globalDiscardedFeatureIndices(getGlobalDiscardedFeatureIndices()), +globalVariableImportanceList(numFeatures, 0), +treeSplitCriterion(treeSplitCriterion) { + m = MothurOut::getInstance(); + // TODO: double check if the implemenatation of 'globalOutOfBagEstimates' is correct +} + +/***********************************************************************/ + +vector AbstractRandomForest::getGlobalDiscardedFeatureIndices() { + try { + vector globalDiscardedFeatureIndices; + + // calculate feature vectors + vector< vector > featureVectors(numFeatures, vector(numSamples, 0)); + for (int i = 0; i < numSamples; i++) { + if (m->control_pressed) { return globalDiscardedFeatureIndices; } + for (int j = 0; j < numFeatures; j++) { featureVectors[j][i] = dataSet[i][j]; } + } + + for (int i = 0; i < featureVectors.size(); i++) { + if (m->control_pressed) { return globalDiscardedFeatureIndices; } + double standardDeviation = m->getStandardDeviation(featureVectors[i]); + if (standardDeviation <= 0){ globalDiscardedFeatureIndices.push_back(i); } + } + + if (m->debug) { + m->mothurOut("number of global discarded features: " + toString(globalDiscardedFeatureIndices.size())+ "\n"); + m->mothurOut("total features: " + toString(featureVectors.size())+ "\n"); + } + + return 
globalDiscardedFeatureIndices; + } + catch(exception& e) { + m->errorOut(e, "AbstractRandomForest", "getGlobalDiscardedFeatureIndices"); + exit(1); + } +} + +/***********************************************************************/ \ No newline at end of file diff --git a/abstractrandomforest.hpp b/abstractrandomforest.hpp new file mode 100755 index 0000000..3be91b9 --- /dev/null +++ b/abstractrandomforest.hpp @@ -0,0 +1,67 @@ +// +// abstractrandomforest.hpp +// rrf-fs-prototype +// +// Created by Abu Zaher Faridee on 7/20/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. +// + +#ifndef rrf_fs_prototype_abstractrandomforest_hpp +#define rrf_fs_prototype_abstractrandomforest_hpp + +#include "mothurout.h" +#include "macros.h" +#include "abstractdecisiontree.hpp" + +#define DEBUG_MODE + +/***********************************************************************/ + +class AbstractRandomForest{ +public: + // intialization with vectors + AbstractRandomForest(const std::vector < std::vector > dataSet, + const int numDecisionTrees, + const string); + virtual ~AbstractRandomForest(){ } + virtual int populateDecisionTrees() = 0; + virtual int calcForrestErrorRate() = 0; + virtual int calcForrestVariableImportance(string) = 0; + +/***********************************************************************/ + +protected: + + // TODO: create a better way of discarding feature + // currently we just set FEATURE_DISCARD_SD_THRESHOLD to 0 to solved this + // it can be tuned for better selection + // also, there might be other factors like Mean or other stuffs + // same would apply for createLocalDiscardedFeatureList in the TreeNode class + + // TODO: Another idea is getting an aggregated discarded feature indices after the run, from combining + // the local discarded feature indices + // this would penalize a feature, even if in global space the feature looks quite good + // the penalization would be averaged, so this woould unlikely to create a local optmina + + vector 
getGlobalDiscardedFeatureIndices(); + + int numDecisionTrees; + int numSamples; + int numFeatures; + vector< vector > dataSet; + vector globalDiscardedFeatureIndices; + vector globalVariableImportanceList; + string treeSplitCriterion; + // This is a map of each feature to outcome count of each classes + // e.g. 1 => [2 7] means feature 1 has 2 outcome of 0 and 7 outcome of 1 + map > globalOutOfBagEstimates; + + // TODO: fix this, do we use pointers? + vector decisionTrees; + + MothurOut* m; + +private: + +}; +#endif diff --git a/aligncommand.cpp b/aligncommand.cpp index a68fbfc..efc8ce4 100644 --- a/aligncommand.cpp +++ b/aligncommand.cpp @@ -572,7 +572,6 @@ int AlignCommand::driver(linePair* filePos, string alignFName, string reportFNam if (candidateSeq->getUnaligned().length() > alignment->getnRows()) { alignment->resize(candidateSeq->getUnaligned().length()+1); } - Sequence temp = templateDB->findClosestSequence(candidateSeq); Sequence* templateSeq = &temp; diff --git a/alignnode.cpp b/alignnode.cpp new file mode 100755 index 0000000..ccd8fb0 --- /dev/null +++ b/alignnode.cpp @@ -0,0 +1,257 @@ +/* + * alignNode.cpp + * bayesian + * + * Created by Pat Schloss on 10/11/11. + * Copyright 2011 Patrick D. Schloss. All rights reserved. 
+ * + */ + +#include "alignNode.h" +#include "taxonomynode.h" + +#include "bayesian.h" + +/**************************************************************************************************/ + +AlignNode::AlignNode(string n, int l): TaxonomyNode(n, l){ + + alignLength = 0; +} + +/**************************************************************************************************/ + +void AlignNode::printTheta(){ + try { + m->mothurOut("A:\t"); for(int i=0;imothurOut(toString(theta[i].A)+ '\t'); } m->mothurOutEndLine(); + m->mothurOut("T:\t"); for(int i=0;imothurOut(toString(theta[i].T)+ '\t'); } m->mothurOutEndLine(); + m->mothurOut("G:\t"); for(int i=0;imothurOut(toString(theta[i].G)+ '\t'); } m->mothurOutEndLine(); + m->mothurOut("C:\t"); for(int i=0;imothurOut(toString(theta[i].C)+ '\t'); } m->mothurOutEndLine(); + m->mothurOut("I:\t"); for(int i=0;imothurOut(toString(theta[i].gap)+ '\t'); } m->mothurOutEndLine(); + } + catch(exception& e) { + m->errorOut(e, "AlignNode", "printTheta"); + exit(1); + } +} + +/**************************************************************************************************/ + +int AlignNode::loadSequence(string& sequence){ + try { + alignLength = (int)sequence.length(); // this function runs through the alignment and increments the frequency + // of each base for a particular taxon. 
we are building the thetas + + if(theta.size() == 0){ + theta.resize(alignLength); + columnCounts.resize(alignLength, 0); + } + + for(int i=0;icontrol_pressed) { return 0; } + + char base = sequence[i]; + + if(base == 'A') { theta[i].A++; columnCounts[i]++; } // our thetas will be alignLength x 5 + else if(base == 'T'){ theta[i].T++; columnCounts[i]++; } // and we ignore any position that has + else if(base == 'G'){ theta[i].G++; columnCounts[i]++; } // an ambiguous base call + else if(base == 'C'){ theta[i].C++; columnCounts[i]++; } + else if(base == '-'){ theta[i].gap++; columnCounts[i]++; } + else if(base == 'U'){ theta[i].T++; columnCounts[i]++; } + } + + numSeqs++; + + return 0; + } + catch(exception& e) { + m->errorOut(e, "AlignNode", "loadSequence"); + exit(1); + } +} + +/**************************************************************************************************/ + +int AlignNode::checkTheta(){ + try { + for(int i=0;icontrol_pressed) { return 0; } + + if(theta[i].gap == columnCounts[i]){ + columnCounts[i] = 0; + } + // else{ + // int maxCount = theta[i].A; + // + // if(theta[i].T > maxCount) { maxCount = theta[i].T; } + // if(theta[i].G > maxCount) { maxCount = theta[i].T; } + // if(theta[i].C > maxCount) { maxCount = theta[i].T; } + // if(theta[i].gap > maxCount) { maxCount = theta[i].T; } + // + // if(maxCount < columnCounts[i] * 0.25){// || maxCount == columnCounts[i]){ //remove any column where the maximum frequency is <50% + // columnCounts[i] = 0; + // } + // } + + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "AlignNode", "checkTheta"); + exit(1); + } +} + +/**************************************************************************************************/ + +int AlignNode::addThetas(vector newTheta, int newNumSeqs){ + try { + if(alignLength == 0){ + alignLength = (int)newTheta.size(); + theta.resize(alignLength); + columnCounts.resize(alignLength); + } + + for(int i=0;icontrol_pressed) { return 0; } + + theta[i].A += 
newTheta[i].A; columnCounts[i] += newTheta[i].A; + theta[i].T += newTheta[i].T; columnCounts[i] += newTheta[i].T; + theta[i].G += newTheta[i].G; columnCounts[i] += newTheta[i].G; + theta[i].C += newTheta[i].C; columnCounts[i] += newTheta[i].C; + theta[i].gap += newTheta[i].gap; columnCounts[i] += newTheta[i].gap; + } + + numSeqs += newNumSeqs; + + return 0; + } + catch(exception& e) { + m->errorOut(e, "AlignNode", "addThetas"); + exit(1); + } +} + +/**************************************************************************************************/ + +double AlignNode::getSimToConsensus(string& query){ + try { + double similarity = 0; + + int length = 0; + + for(int i=0;icontrol_pressed) { return similarity; } + + char base = query[i]; + + if(base != '.' && base != 'N' && columnCounts[i] != 0){ + + double fraction = 0; + + if(base == 'A'){ + fraction = (int) theta[i].A / (double) columnCounts[i]; + similarity += fraction; + length++; + } + else if(base == 'T'){ + fraction = (int) theta[i].T / (double) columnCounts[i]; + similarity += fraction; + length++; + } + else if(base == 'G'){ + fraction = (int) theta[i].G / (double) columnCounts[i]; + similarity += fraction; + length++; + } + else if(base == 'C'){ + fraction = (int) theta[i].C / (double) columnCounts[i]; + similarity += fraction; + length++; + } + else if(base == '-'){ + fraction = (int) theta[i].gap / (double) columnCounts[i]; + similarity += fraction; + length++; + } + } + } + + if(length != 0){ + similarity /= double(length); + } + else { + similarity = 0; + } + + return similarity; + + } + catch(exception& e) { + m->errorOut(e, "AlignNode", "getSimToConsensus"); + exit(1); + } +} + +/**************************************************************************************************/ + +double AlignNode::getPxGivenkj_D_j(string& query){ //P(x | k_j, D, j) + try { + double PxGivenkj_D_j = 0; + + int count = 0; + double alpha = 1 / (double)totalSeqs; //flat prior + + + for(int s=0;scontrol_pressed) { return 
PxGivenkj_D_j; } + + char base = query[s]; + thetaAlign thetaS = theta[s]; + + if(base != '.' && base != 'N' && columnCounts[s] != 0){ + double Nkj_s = (double)columnCounts[s]; + double nkj_si = 0; + + + if(base == 'A') { nkj_si = (double)thetaS.A; } + else if(base == 'T'){ nkj_si = (double)thetaS.T; } + else if(base == 'G'){ nkj_si = (double)thetaS.G; } + else if(base == 'C'){ nkj_si = (double)thetaS.C; } + else if(base == '-'){ nkj_si = (double)thetaS.gap; } + else if(base == 'U'){ nkj_si = (double)thetaS.T; } + + // double alpha = pow(0.2, double(Nkj_s)) + 0.0001; //need to make 1e-4 a variable in future; this is the non-flat prior + + // if(columnCounts[s] != nkj_si){ //deal only with segregating sites... + double numerator = nkj_si + alpha; + double denomenator = Nkj_s + 5.0 * alpha; + + PxGivenkj_D_j += log(numerator) - log(denomenator); + count++; + // } + } + if(base != '.' && columnCounts[s] == 0 && thetaS.gap == 0){ + count = 0; + break; + } + + } + + if(count == 0){ PxGivenkj_D_j = -1e10; } + + return PxGivenkj_D_j; + } + catch(exception& e) { + m->errorOut(e, "AlignNode", "getPxGivenkj_D_j"); + exit(1); + } +} + +/**************************************************************************************************/ diff --git a/alignnode.h b/alignnode.h new file mode 100755 index 0000000..4aecca7 --- /dev/null +++ b/alignnode.h @@ -0,0 +1,49 @@ +#ifndef ALIGNNODE +#define ALIGNNODE + +/* + * alignNode.h + * bayesian + * + * Created by Pat Schloss on 10/11/11. + * Copyright 2011 Patrick D. Schloss. All rights reserved. 
+ * + */ + +#include "taxonomynode.h" + +/**************************************************************************************************/ + +struct thetaAlign { + thetaAlign() : A(0), T(0), G(0), C(0), gap(0){} + unsigned int A; + unsigned int T; + unsigned int G; + unsigned int C; + unsigned int gap; +}; + +/**************************************************************************************************/ + +class AlignNode : public TaxonomyNode { + +public: + AlignNode(string, int); + int loadSequence(string&); + int checkTheta(); + void printTheta(); + double getPxGivenkj_D_j(string& query); //P(x | k_j, D, j) + double getSimToConsensus(string& query); + vector getTheta() { return theta; } + int addThetas(vector, int); + +private: + vector theta; + vector columnCounts; + int alignLength; +}; + +/**************************************************************************************************/ + +#endif + diff --git a/aligntree.cpp b/aligntree.cpp new file mode 100755 index 0000000..41667ca --- /dev/null +++ b/aligntree.cpp @@ -0,0 +1,371 @@ +// +// alignTree.cpp +// pdsBayesian +// +// Created by Patrick Schloss on 4/3/12. +// Copyright (c) 2012 University of Michigan. All rights reserved. 
+// + +#include "alignnode.h" +#include "aligntree.h" + +/**************************************************************************************************/ + +AlignTree::AlignTree(string referenceFileName, string taxonomyFileName, int cutoff) : Classify(), confidenceThreshold(cutoff){ + try { + AlignNode* newNode = new AlignNode("Root", 0); + tree.push_back(newNode); // the tree is stored as a vector of elements of type TaxonomyNode + + string refTaxonomy; + + readTaxonomy(taxonomyFileName); + + ifstream referenceFile; + m->openInputFile(referenceFileName, referenceFile); + bool error = false; + map lengths; + while(!referenceFile.eof()){ + + if (m->control_pressed) { break; } + + Sequence seq(referenceFile); m->gobble(referenceFile); + + if (seq.getName() != "") { + map::iterator it = taxonomy.find(seq.getName()); + + if (it != taxonomy.end()) { + refTaxonomy = it->second; // lookup the taxonomy string for the current reference sequence + string aligned = seq.getAligned(); + lengths[aligned.length()] = 1; + if (lengths.size() > 1) { error = true; m->mothurOut("[ERROR]: reference sequences must be aligned to use the align method, quitting.\n"); break; } + addTaxonomyToTree(seq.getName(), refTaxonomy, aligned); + }else { + m->mothurOut(seq.getName() + " is in your reference file, but not in your taxonomy file, please correct.\n"); error = true; + } + } + } + referenceFile.close(); + + length = (lengths.begin())->first; + + if (error) { m->control_pressed = true; } + + numTaxa = (int)tree.size(); + + numLevels = 0; + for(int i=0;igetLevel(); + if(level > numLevels){ numLevels = level; } + } + numLevels++; + + aggregateThetas(); + + int dbSize = tree[0]->getNumSeqs(); + + for(int i=0;icheckTheta(); + tree[i]->setTotalSeqs(dbSize); + } + + } + catch(exception& e) { + m->errorOut(e, "AlignTree", "AlignTree"); + exit(1); + } +} + +/**************************************************************************************************/ + +AlignTree::~AlignTree(){ + try { + 
for(int i=0;ierrorOut(e, "AlignTree", "~AlignTree"); + exit(1); + } +} + +/**************************************************************************************************/ + +int AlignTree::addTaxonomyToTree(string seqName, string& taxonomy, string& sequence){ + try { + AlignNode* newNode; + string taxonName = ""; + int treePosition = 0; // the root is element 0 + + int level = 1; + + for(int i=0;icontrol_pressed) { break; } + + if(taxonomy[i] == ';'){ // looking for semicolons... + + if (taxonName == "") { m->mothurOut(seqName + " has an error in the taxonomy. This may be due to a ;;"); m->mothurOutEndLine(); m->control_pressed = true; } + + int newIndex = tree[treePosition]->getChildIndex(taxonName); // look to see if your current node already + // has a child with the new taxonName + if(newIndex != -1) { treePosition = newIndex; } // if you've seen it before, jump to that + else { // position in the tree + int newChildIndex = (int)tree.size(); // otherwise, we'll have to create one... + tree[treePosition]->makeChild(taxonName, newChildIndex); + + newNode = new AlignNode(taxonName, level); + + newNode->setParent(treePosition); + + tree.push_back(newNode); + treePosition = newChildIndex; + } + + // sequence data to that node to update that node's theta - seems slow... 
+ taxonName = ""; // clear out the taxon name that we will build as we look + level++; + } // for a semicolon + else{ + taxonName += taxonomy[i]; // keep adding letters until we reach a semicolon + } + } + tree[treePosition]->loadSequence(sequence); // now that we've gotten to the correct node, add the + + return 0; + } + catch(exception& e) { + m->errorOut(e, "AlignTree", "addTaxonomyToTree"); + exit(1); + } +} + +/**************************************************************************************************/ + +int AlignTree::aggregateThetas(){ + try { + vector > levelMatrix(numLevels+1); + + for(int i=0;icontrol_pressed) { return 0; } + levelMatrix[tree[i]->getLevel()].push_back(i); + } + + for(int i=numLevels-1;i>0;i--){ + if (m->control_pressed) { return 0; } + for(int j=0;jgetParent()]->addThetas(holder->getTheta(), holder->getNumSeqs()); + } + } + return 0; + } + catch(exception& e) { + m->errorOut(e, "AlignTree", "aggregateThetas"); + exit(1); + } +} + +/**************************************************************************************************/ + +double AlignTree::getOutlierLogProbability(string& sequence){ + try { + double count = 0; + + for(int i=0;ierrorOut(e, "AlignTree", "getOutlierLogProbability"); + exit(1); + } +} + +/**************************************************************************************************/ + +int AlignTree::getMinRiskIndexAlign(string& sequence, vector& taxaIndices, vector& probabilities){ + try { + int numProbs = (int)probabilities.size(); + + vector G(numProbs, 0.2); //a random sequence will, on average, be 20% similar to any other sequence + vector risk(numProbs, 0); + + for(int i=1;icontrol_pressed) { return 0; } + G[i] = tree[taxaIndices[i]]->getSimToConsensus(sequence); + } + + double minRisk = 1e6; + int minRiskIndex = 0; + + for(int i=0;icontrol_pressed) { return 0; } + for(int j=0;jerrorOut(e, "AlignTree", "getMinRiskIndexAlign"); + exit(1); + } + +} + 
+/**************************************************************************************************/ + +int AlignTree::sanityCheck(vector >& indices, vector& maxIndices){ + try { + int finalLevel = (int)indices.size()-1; + + for(int position=1;positioncontrol_pressed) { return 0; } + int predictedParent = tree[indices[position][maxIndices[position]]]->getParent(); + int actualParent = indices[position-1][maxIndices[position-1]]; + + if(predictedParent != actualParent){ + finalLevel = position - 1; + return finalLevel; + } + } + return finalLevel; + } + catch(exception& e) { + m->errorOut(e, "AlignTree", "sanityCheck"); + exit(1); + } +} + +/**************************************************************************************************/ + +string AlignTree::getTaxonomy(Sequence* seq){ + try { + string seqName = seq->getName(); string querySequence = seq->getAligned(); string taxonProbabilityString = ""; + if (querySequence.length() != length) { + m->mothurOut("[ERROR]: " + seq->getName() + " has length " + toString(querySequence.length()) + ", reference sequences length is " + toString(length) + ". Are your sequences aligned? 
Sequences must be aligned to use the align search method.\n"); m->control_pressed = true; return ""; + } + double logPOutlier = getOutlierLogProbability(querySequence); + + vector > pXgivenKj_D_j(numLevels); + vector > indices(numLevels); + for(int i=0;icontrol_pressed) { return taxonProbabilityString; } + pXgivenKj_D_j[i].push_back(logPOutlier); + indices[i].push_back(-1); + } + + + for(int i=0;igetName() << '\t' << tree[i]->getLevel() << '\t' << tree[i]->getPxGivenkj_D_j(querySequence) << endl; + if (m->control_pressed) { return taxonProbabilityString; } + pXgivenKj_D_j[tree[i]->getLevel()].push_back(tree[i]->getPxGivenkj_D_j(querySequence)); + indices[tree[i]->getLevel()].push_back(i); + } + + vector sumLikelihood(numLevels, 0); + vector bestPosterior(numLevels, 0); + vector maxIndex(numLevels, 0); + int maxPosteriorIndex; + + + //cout << "before best level" << endl; + + //let's find the best level and taxa within that level + for(int i=0;icontrol_pressed) { return taxonProbabilityString; } + int numTaxaInLevel = (int)indices[i].size(); + + //cout << "numTaxaInLevel:\t" << numTaxaInLevel << endl; + + vector posteriors(numTaxaInLevel, 0); + sumLikelihood[i] = getLogExpSum(pXgivenKj_D_j[i], maxPosteriorIndex); + + maxPosteriorIndex = 0; + for(int j=0;j posteriors[maxPosteriorIndex]){ + maxPosteriorIndex = j; + } + + } + + maxIndex[i] = getMinRiskIndexAlign(querySequence, indices[i], posteriors); + + maxIndex[i] = maxPosteriorIndex; + bestPosterior[i] = posteriors[maxIndex[i]]; + } + + // vector pX_level(numLevels, 0); + // + // for(int i=0;igetNumSeqs(); + // } + // + // int max_pLevel_X_index = -1; + // double pX_level_sum = getLogExpSum(pX_level, max_pLevel_X_index); + // double max_pLevel_X = exp(pX_level[max_pLevel_X_index] - pX_level_sum); + // + // vector pLevel_X(numLevels, 0); + // for(int i=0;icontrol_pressed) { return taxonProbabilityString; } + int confidenceScore = (int) (bestPosterior[i] * 100); + if (confidenceScore >= confidenceThreshold) { + 
if(indices[i][maxIndex[i]] != -1){ + taxonProbabilityString += tree[indices[i][maxIndex[i]]]->getName() + '(' + toString(confidenceScore) + ");"; + simpleTax += tree[indices[i][maxIndex[i]]]->getName() + ";"; + // levelProbabilityOutput << tree[indices[i][maxIndex[i]]]->getName() << '(' << setprecision(6) << pLevel_X[i] << ");"; + } + else{ + taxonProbabilityString + "unclassified" + '(' + toString(confidenceScore) + ");"; + // levelProbabilityOutput << "unclassified" << '(' << setprecision(6) << pLevel_X[i] << ");"; + simpleTax += "unclassified;"; + } + }else { break; } + savedspot = i; + } + + for(int i=savedspot+1;icontrol_pressed) { return taxonProbabilityString; } + taxonProbabilityString + "unclassified(0);"; + simpleTax += "unclassified;"; + } + + return taxonProbabilityString; + } + catch(exception& e) { + m->errorOut(e, "AlignTree", "getTaxonomy"); + exit(1); + } +} + + +/**************************************************************************************************/ diff --git a/aligntree.h b/aligntree.h new file mode 100755 index 0000000..51008ff --- /dev/null +++ b/aligntree.h @@ -0,0 +1,34 @@ +// +// alignTree.h +// pdsBayesian +// +// Created by Patrick Schloss on 4/3/12. +// Copyright (c) 2012 University of Michigan. All rights reserved. 
+// + +#ifndef pdsBayesian_alignTree_h +#define pdsBayesian_alignTree_h + +#include "classify.h" + +class AlignNode; + +class AlignTree : public Classify { + +public: + AlignTree(string, string, int); + ~AlignTree(); + string getTaxonomy(Sequence*); + +private: + int addTaxonomyToTree(string, string&, string&); + double getOutlierLogProbability(string&); + int getMinRiskIndexAlign(string&, vector&, vector&); + int aggregateThetas(); + int sanityCheck(vector >&, vector&); + + int numSeqs, confidenceThreshold, length; + vector tree; +}; + +#endif diff --git a/bayesian.cpp b/bayesian.cpp index 1dc3833..49be4af 100644 --- a/bayesian.cpp +++ b/bayesian.cpp @@ -12,13 +12,14 @@ #include "phylosummary.h" #include "referencedb.h" /**************************************************************************************************/ -Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, int i, int tid, bool f) : +Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, int i, int tid, bool f, bool sh) : Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { try { ReferenceDB* rdb = ReferenceDB::getInstance(); threadID = tid; flip = f; + shortcuts = sh; string baseName = tempFile; if (baseName == "saved") { baseName = rdb->getSavedReference(); } @@ -63,7 +64,7 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { } saveIn.close(); } - + if(probFileTest && probFileTest2 && phyloTreeTest && probFileTest3 && FilesGood){ if (tempFile == "saved") { m->mothurOutEndLine(); m->mothurOut("Using sequences from " + rdb->getSavedReference() + " that are saved in memory."); m->mothurOutEndLine(); } @@ -113,7 +114,7 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { WordPairDiffArr.resize(numKmers); for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size()); } - ofstream out; + ofstream out; ofstream out2; #ifdef USE_MPI @@ -124,23 +125,24 @@ 
Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { #endif - m->openOutputFile(probFileName, out); + if (shortcuts) { + m->openOutputFile(probFileName, out); - //output mothur version - out << "#" << m->getVersion() << endl; + //output mothur version + out << "#" << m->getVersion() << endl; - out << numKmers << endl; + out << numKmers << endl; - m->openOutputFile(probFileName2, out2); + m->openOutputFile(probFileName2, out2); - //output mothur version - out2 << "#" << m->getVersion() << endl; + //output mothur version + out2 << "#" << m->getVersion() << endl; + } #ifdef USE_MPI } #endif - //for each word for (int i = 0; i < numKmers; i++) { if (m->control_pressed) { break; } @@ -151,7 +153,7 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { if (pid == 0) { #endif - out << i << '\t'; + if (shortcuts) { out << i << '\t'; } #ifdef USE_MPI } @@ -159,12 +161,10 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { vector seqsWithWordi = database->getSequencesWithKmer(i); - map count; - for (int k = 0; k < genusNodes.size(); k++) { count[genusNodes[k]] = 0; } - //for each sequence with that word + vector count; count.resize(genusNodes.size(), 0); for (int j = 0; j < seqsWithWordi.size(); j++) { - int temp = phyloTree->getIndex(names[seqsWithWordi[j]]); + int temp = phyloTree->getGenusIndex(names[seqsWithWordi[j]]); count[temp]++; //increment count of seq in this genus who have this word } @@ -178,9 +178,9 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { //probabilityInThisTaxonomy = (# of seqs with that word in this taxonomy + probabilityInTemplate) / (total number of seqs in this taxonomy + 1); - wordGenusProb[i][k] = log((count[genusNodes[k]] + probabilityInTemplate) / (float) (genusTotals[k] + 1)); + wordGenusProb[i][k] = log((count[k] + probabilityInTemplate) / (float) (genusTotals[k] + 1)); - if (count[genusNodes[k]] != 0) { + if (count[k] != 0) { #ifdef USE_MPI int pid; 
MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are @@ -188,7 +188,7 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { if (pid == 0) { #endif - out << k << '\t' << wordGenusProb[i][k] << '\t' ; + if (shortcuts) { out << k << '\t' << wordGenusProb[i][k] << '\t' ; } #ifdef USE_MPI } @@ -204,8 +204,10 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { if (pid == 0) { #endif - out << endl; - out2 << probabilityInTemplate << '\t' << numNotZero << '\t' << log(probabilityInTemplate) << endl; + if (shortcuts) { + out << endl; + out2 << probabilityInTemplate << '\t' << numNotZero << '\t' << log(probabilityInTemplate) << endl; + } #ifdef USE_MPI } @@ -218,9 +220,10 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { if (pid == 0) { #endif - out.close(); - out2.close(); - + if (shortcuts) { + out.close(); + out2.close(); + } #ifdef USE_MPI } #endif diff --git a/bayesian.h b/bayesian.h index 7c88433..405fee3 100644 --- a/bayesian.h +++ b/bayesian.h @@ -18,7 +18,7 @@ class Bayesian : public Classify { public: - Bayesian(string, string, string, int, int, int, int, bool); + Bayesian(string, string, string, int, int, int, int, bool, bool); ~Bayesian(); string getTaxonomy(Sequence*); diff --git a/binsequencecommand.cpp b/binsequencecommand.cpp index 7569a4b..ad71d10 100644 --- a/binsequencecommand.cpp +++ b/binsequencecommand.cpp @@ -15,8 +15,9 @@ vector BinSeqCommand::setParameters(){ try { CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist); CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", 
"", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -34,7 +35,7 @@ vector BinSeqCommand::setParameters(){ string BinSeqCommand::getHelpString(){ try { string helpString = ""; - helpString += "The bin.seqs command parameters are list, fasta, name, label and group. The fasta and list are required, unless you have a valid current list and fasta file.\n"; + helpString += "The bin.seqs command parameters are list, fasta, name, count, label and group. The fasta and list are required, unless you have a valid current list and fasta file.\n"; helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and are separated by dashes.\n"; helpString += "The bin.seqs command should be in the following format: bin.seqs(fasta=yourFastaFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n"; helpString += "Example bin.seqs(fasta=amazon.fasta, group=amazon.groups, name=amazon.names).\n"; @@ -147,6 +148,14 @@ BinSeqCommand::BinSeqCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["group"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. 
else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -195,11 +204,26 @@ BinSeqCommand::BinSeqCommand(string option) { if (groupfile == "not open") { abort = true; } else if (groupfile == "not found") { groupfile = ""; } else { m->setGroupFile(groupfile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namesfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } - if (namesfile == ""){ - vector files; files.push_back(fastafile); - parser.getNameFile(files); - } + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + + if (countfile == "") { + if (namesfile == ""){ + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } } @@ -229,9 +253,8 @@ int BinSeqCommand::execute(){ fasta->readFastaFile(fastafile); //if user gave a namesfile then use it - if (namesfile != "") { - readNamesFile(); - } + if (namesfile != "") { readNamesFile(); } + if (countfile != "") { ct.readTable(countfile); } input = new InputData(listfile, "list"); list = input->getListVector(); @@ -362,79 +385,71 @@ void BinSeqCommand::readNamesFile() { //return 1 if error, 0 otherwise int BinSeqCommand::process(ListVector* list) { try { - string binnames, name, sequence; - - string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + list->getLabel() + getOutputFileNameTag("fasta"); + string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + list->getLabel() + "." 
+ getOutputFileNameTag("fasta"); m->openOutputFile(outputFileName, out); - - //save to output list of output file names - outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName); - - m->mothurOut(list->getLabel()); m->mothurOutEndLine(); - - //for each bin in the list vector - for (int i = 0; i < list->size(); i++) { - - if (m->control_pressed) { return 1; } - - binnames = list->get(i); - while (binnames.find_first_of(',') != -1) { - name = binnames.substr(0,binnames.find_first_of(',')); - binnames = binnames.substr(binnames.find_first_of(',')+1, binnames.length()); - - //do work for that name - sequence = fasta->getSequence(name); - if (sequence != "not found") { - //if you don't have groups - if (groupfile == "") { - name = name + "\t" + toString(i+1); - out << ">" << name << endl; - out << sequence << endl; - }else {//if you do have groups - string group = groupMap->getGroup(name); - if (group == "not found") { - m->mothurOut(name + " is missing from your group file. Please correct. "); m->mothurOutEndLine(); - return 1; - }else{ - name = name + "\t" + group + "\t" + toString(i+1); - out << ">" << name << endl; - out << sequence << endl; - } - } - }else { - m->mothurOut(name + " is missing from your fasta or name file. Please correct. "); m->mothurOutEndLine(); - return 1; - } - - } - - //get last name - sequence = fasta->getSequence(binnames); - if (sequence != "not found") { - //if you don't have groups - if (groupfile == "") { - binnames = binnames + "\t" + toString(i+1); - out << ">" << binnames << endl; - out << sequence << endl; - }else {//if you do have groups - string group = groupMap->getGroup(binnames); - if (group == "not found") { - m->mothurOut(binnames + " is missing from your group file. Please correct. 
"); m->mothurOutEndLine(); - return 1; - }else{ - binnames = binnames + "\t" + group + "\t" + toString(i+1); - out << ">" << binnames << endl; - out << sequence << endl; - } - } - }else { - m->mothurOut(binnames + " is missing from your fasta or name file. Please correct. "); m->mothurOutEndLine(); - return 1; - } - } - - out.close(); - return 0; + outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName); + + m->mothurOut(list->getLabel()); m->mothurOutEndLine(); + + //for each bin in the list vector + for (int i = 0; i < list->size(); i++) { + + if (m->control_pressed) { return 1; } + + string binnames = list->get(i); + vector names; + m->splitAtComma(binnames, names); + for (int j = 0; j < names.size(); j++) { + string name = names[j]; + + //do work for that name + string sequence = fasta->getSequence(name); + + if (countfile != "") { + if (sequence != "not found") { + if (ct.hasGroupInfo()) { + vector groups = ct.getGroups(name); + string groupInfo = ""; + for (int k = 0; k < groups.size()-1; k++) { + groupInfo += groups[k] + "-"; + } + if (groups.size() != 0) { groupInfo += groups[groups.size()-1]; } + else { groupInfo = "not found"; } + name = name + "\t" + groupInfo + "\t" + toString(i+1)+ "\tNumRep=" + toString(ct.getNumSeqs(name)); + out << ">" << name << endl; + out << sequence << endl; + }else { + name = name + "\t" + toString(i+1) + "\tNumRep=" + toString(ct.getNumSeqs(name)); + out << ">" << name << endl; + out << sequence << endl; + } + + }else { m->mothurOut(name + " is missing from your fasta. 
Does your list file contain all sequence names or just the uniques?"); m->mothurOutEndLine(); return 1; } + }else { + if (sequence != "not found") { + //if you don't have groups + if (groupfile == "") { + name = name + "\t" + toString(i+1); + out << ">" << name << endl; + out << sequence << endl; + }else {//if you do have groups + string group = groupMap->getGroup(name); + if (group == "not found") { + m->mothurOut(name + " is missing from your group file. Please correct. "); m->mothurOutEndLine(); + return 1; + }else{ + name = name + "\t" + group + "\t" + toString(i+1); + out << ">" << name << endl; + out << sequence << endl; + } + } + }else { m->mothurOut(name + " is missing from your fasta or name file. Please correct. "); m->mothurOutEndLine(); return 1; } + } + } + } + + out.close(); + return 0; } catch(exception& e) { diff --git a/binsequencecommand.h b/binsequencecommand.h index 1fb5664..5bdd401 100644 --- a/binsequencecommand.h +++ b/binsequencecommand.h @@ -16,6 +16,7 @@ #include "listvector.hpp" #include "fastamap.h" #include "groupmap.h" +#include "counttable.h" class BinSeqCommand : public Command { @@ -36,14 +37,14 @@ public: void help() { m->mothurOut(getHelpString()); } private: - + CountTable ct; ListVector* list; InputData* input; FastaMap* fasta; GroupMap* groupMap; bool abort, allLines; set labels; //holds labels to be used - string filename, fastafile, listfile, namesfile, groupfile, label, outputDir; + string filename, fastafile, listfile, namesfile, groupfile, countfile, label, outputDir; ofstream out; ifstream in, inNames; vector outputNames; diff --git a/chimeraperseuscommand.cpp b/chimeraperseuscommand.cpp index e3691e8..7ae5d69 100644 --- a/chimeraperseuscommand.cpp +++ b/chimeraperseuscommand.cpp @@ -10,12 +10,15 @@ #include "chimeraperseuscommand.h" #include "deconvolutecommand.h" #include "sequence.hpp" +#include "counttable.h" +#include "sequencecountparser.h" 
//********************************************************************************************************************** vector ChimeraPerseusCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "NameCount", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "NameCount", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -36,10 +39,11 @@ vector ChimeraPerseusCommand::setParameters(){ string ChimeraPerseusCommand::getHelpString(){ try { string helpString = ""; - helpString += "The chimera.perseus command reads a fastafile and namefile and outputs potentially chimeric sequences.\n"; + helpString += "The chimera.perseus command reads a fastafile and namefile or countfile and outputs potentially chimeric sequences.\n"; helpString += "The chimera.perseus command parameters are fasta, name, group, cutoff, processors, alpha and beta.\n"; helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta 
file. \n"; - helpString += "The name parameter allows you to provide a name file associated with your fasta file. It is required. \n"; + helpString += "The name parameter allows you to provide a name file associated with your fasta file.\n"; + helpString += "The count parameter allows you to provide a count file associated with your fasta file. A count or name file is required. \n"; helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n"; helpString += "The group parameter allows you to provide a group file. When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n"; helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"; @@ -96,6 +100,8 @@ ChimeraPerseusCommand::ChimeraPerseusCommand(){ ChimeraPerseusCommand::ChimeraPerseusCommand(string option) { try { abort = false; calledHelp = false; + hasCount = false; + hasName = false; //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } @@ -107,7 +113,7 @@ ChimeraPerseusCommand::ChimeraPerseusCommand(string option) { OptionParser parser(option); map parameters = parser.getParameters(); - ValidParameters validParameter("chimera.uchime"); + ValidParameters validParameter("chimera.perseus"); map::iterator it; //check to make sure all parameters are valid for command @@ -203,15 +209,9 @@ ChimeraPerseusCommand::ChimeraPerseusCommand(string option) { //check for required parameters - bool hasName = true; namefile = validParameter.validFile(parameters, "name", false); - if (namefile == "not found") { - //if there is a current fasta file, use it - string filename = m->getNameFile(); - if (filename != "") { nameFileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the name parameter."); m->mothurOutEndLine(); } - else { m->mothurOut("You have no current 
namefile and the name parameter is required."); m->mothurOutEndLine(); abort = true; } - hasName = false; - }else { + if (namefile == "not found") { namefile = ""; } + else { m->splitAtDash(namefile, nameFileNames); //go through files and make sure they are good, if not, then disregard them @@ -277,12 +277,101 @@ ChimeraPerseusCommand::ChimeraPerseusCommand(string option) { } } } + } + + if (nameFileNames.size() != 0) { hasName = true; } + + //check for required parameters + vector countfileNames; + countfile = validParameter.validFile(parameters, "count", false); + if (countfile == "not found") { + countfile = ""; + }else { + m->splitAtDash(countfile, countfileNames); - //make sure there is at least one valid file left - if (nameFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid name files."); m->mothurOutEndLine(); abort = true; } + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < countfileNames.size(); i++) { + + bool ignore = false; + if (countfileNames[i] == "current") { + countfileNames[i] = m->getCountTableFile(); + if (countfileNames[i] != "") { m->mothurOut("Using " + countfileNames[i] + " as input file for the count parameter where you had given current."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current count file, ignoring current."); m->mothurOutEndLine(); ignore=true; + //erase from file list + countfileNames.erase(countfileNames.begin()+i); + i--; + } + } + + if (!ignore) { + + if (inputDir != "") { + string path = m->hasPath(countfileNames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { countfileNames[i] = inputDir + countfileNames[i]; } + } + + int ableToOpen; + ifstream in; + + ableToOpen = m->openInputFile(countfileNames[i], in, "noerror"); + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(countfileNames[i]); + m->mothurOut("Unable to open " + countfileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + countfileNames[i] = tryPath; + } + } + + if (ableToOpen == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(countfileNames[i]); + m->mothurOut("Unable to open " + countfileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + countfileNames[i] = tryPath; + } + } + + in.close(); + + if (ableToOpen == 1) { + m->mothurOut("Unable to open " + countfileNames[i] + ". 
It will be disregarded."); m->mothurOutEndLine(); + //erase from file list + countfileNames.erase(countfileNames.begin()+i); + i--; + }else { + m->setCountTableFile(countfileNames[i]); + } + } + } } - - if (hasName && (nameFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of namefiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; } + + if (countfileNames.size() != 0) { hasCount = true; } + + //make sure there is at least one valid file left + if (hasName && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } + + if (!hasName && !hasCount) { + //if there is a current name file, use it, else look for current count file + string filename = m->getNameFile(); + if (filename != "") { hasName = true; nameFileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the name parameter."); m->mothurOutEndLine(); } + else { + filename = m->getCountTableFile(); + if (filename != "") { hasCount = true; countfileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the count parameter."); m->mothurOutEndLine(); } + else { m->mothurOut("[ERROR]: You must provide a count or name file."); m->mothurOutEndLine(); abort = true; } + } + } + if (!hasName && hasCount) { nameFileNames = countfileNames; } + + if (nameFileNames.size() != fastaFileNames.size()) { m->mothurOut("[ERROR]: The number of name or count files does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; } bool hasGroup = true; groupfile = validParameter.validFile(parameters, "group", false); @@ -360,6 +449,7 @@ ChimeraPerseusCommand::ChimeraPerseusCommand(string option) { if (hasGroup && (groupFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of groupfiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); 
abort=true; } + if (hasGroup && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or group."); m->mothurOutEndLine(); abort = true; } //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -415,41 +505,82 @@ int ChimeraPerseusCommand::execute(){ int numSeqs = 0; int numChimeras = 0; - - if (groupFile != "") { - //Parse sequences by group - SequenceParser parser(groupFile, fastaFileNames[s], nameFile); - vector groups = parser.getNamesOfGroups(); - - if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } - - //clears files - ofstream out, out1, out2; - m->openOutputFile(outputFileName, out); out.close(); - m->openOutputFile(accnosFileName, out1); out1.close(); - - if(processors == 1) { numSeqs = driverGroups(parser, outputFileName, accnosFileName, 0, groups.size(), groups); } - else { numSeqs = createProcessesGroups(parser, outputFileName, accnosFileName, groups, groupFile, fastaFileNames[s], nameFile); } - - if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } - - numChimeras = deconvoluteResults(parser, outputFileName, accnosFileName); - - m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine(); - - if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } - - }else{ - if (processors != 1) { m->mothurOut("Without a groupfile, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; } - - //read sequences and store sorted by frequency - vector sequences = readFiles(fastaFileNames[s], nameFile); - - if (m->control_pressed) { 
for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } - - numSeqs = driver(outputFileName, sequences, accnosFileName, numChimeras); + + if (hasCount) { + CountTable* ct = new CountTable(); + ct->readTable(nameFile); + + if (ct->hasGroupInfo()) { + cparser = new SequenceCountParser(fastaFileNames[s], *ct); + + vector groups = cparser->getNamesOfGroups(); + + if (m->control_pressed) { delete ct; delete cparser; for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + + //clears files + ofstream out, out1, out2; + m->openOutputFile(outputFileName, out); out.close(); + m->openOutputFile(accnosFileName, out1); out1.close(); + + if(processors == 1) { numSeqs = driverGroups(outputFileName, accnosFileName, 0, groups.size(), groups); } + else { numSeqs = createProcessesGroups(outputFileName, accnosFileName, groups, groupFile, fastaFileNames[s], nameFile); } + + if (m->control_pressed) { delete ct; delete cparser; for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + map uniqueNames = cparser->getAllSeqsMap(); + numChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName); + delete cparser; + + m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine(); + + if (m->control_pressed) { delete ct; for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + + }else { + if (processors != 1) { m->mothurOut("Your count file does not contain group information, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; } + + //read sequences and store sorted by frequency + vector sequences = readFiles(fastaFileNames[s], ct); + + if (m->control_pressed) { delete ct; for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + + numSeqs = 
driver(outputFileName, sequences, accnosFileName, numChimeras); + } + delete ct; + }else { + if (groupFile != "") { + //Parse sequences by group + parser = new SequenceParser(groupFile, fastaFileNames[s], nameFile); + vector groups = parser->getNamesOfGroups(); + + if (m->control_pressed) { delete parser; for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + + //clears files + ofstream out, out1, out2; + m->openOutputFile(outputFileName, out); out.close(); + m->openOutputFile(accnosFileName, out1); out1.close(); + + if(processors == 1) { numSeqs = driverGroups(outputFileName, accnosFileName, 0, groups.size(), groups); } + else { numSeqs = createProcessesGroups(outputFileName, accnosFileName, groups, groupFile, fastaFileNames[s], nameFile); } + + if (m->control_pressed) { delete parser; for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + map uniqueNames = parser->getAllSeqsMap(); + numChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName); + delete parser; + + m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine(); + + if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + }else{ + if (processors != 1) { m->mothurOut("Without a groupfile, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; } + + //read sequences and store sorted by frequency + vector sequences = readFiles(fastaFileNames[s], nameFile); + + if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + + numSeqs = driver(outputFileName, sequences, accnosFileName, numChimeras); + } } - + if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } 
m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences. " + toString(numChimeras) + " chimeras were found."); m->mothurOutEndLine(); @@ -510,7 +641,7 @@ string ChimeraPerseusCommand::getNamesFile(string& inputFile){ } } //********************************************************************************************************************** -int ChimeraPerseusCommand::driverGroups(SequenceParser& parser, string outputFName, string accnos, int start, int end, vector groups){ +int ChimeraPerseusCommand::driverGroups(string outputFName, string accnos, int start, int end, vector groups){ try { int totalSeqs = 0; @@ -522,7 +653,7 @@ int ChimeraPerseusCommand::driverGroups(SequenceParser& parser, string outputFNa int start = time(NULL); if (m->control_pressed) { return 0; } - vector sequences = loadSequences(parser, groups[i]); + vector sequences = loadSequences(groups[i]); if (m->control_pressed) { return 0; } @@ -547,32 +678,48 @@ int ChimeraPerseusCommand::driverGroups(SequenceParser& parser, string outputFNa } } //********************************************************************************************************************** -vector ChimeraPerseusCommand::loadSequences(SequenceParser& parser, string group){ +vector ChimeraPerseusCommand::loadSequences(string group){ try { - - vector thisGroupsSeqs = parser.getSeqs(group); - map nameMap = parser.getNameMap(group); - map::iterator it; - - vector sequences; - bool error = false; - alignLength = 0; - - for (int i = 0; i < thisGroupsSeqs.size(); i++) { - - if (m->control_pressed) { return sequences; } - - it = nameMap.find(thisGroupsSeqs[i].getName()); - if (it == nameMap.end()) { error = true; m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct."); m->mothurOutEndLine(); } - else { - int num = m->getNumNames(it->second); - 
sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num)); - if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); } - } + bool error = false; + alignLength = 0; + vector sequences; + if (hasCount) { + vector thisGroupsSeqs = cparser->getSeqs(group); + map counts = cparser->getCountTable(group); + map::iterator it; + + for (int i = 0; i < thisGroupsSeqs.size(); i++) { + + if (m->control_pressed) { return sequences; } + + it = counts.find(thisGroupsSeqs[i].getName()); + if (it == counts.end()) { error = true; m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your count file, please correct."); m->mothurOutEndLine(); } + else { + sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), it->second)); + if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); } + } + } + }else{ + vector thisGroupsSeqs = parser->getSeqs(group); + map nameMap = parser->getNameMap(group); + map::iterator it; + + for (int i = 0; i < thisGroupsSeqs.size(); i++) { + + if (m->control_pressed) { return sequences; } + + it = nameMap.find(thisGroupsSeqs[i].getName()); + if (it == nameMap.end()) { error = true; m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct."); m->mothurOutEndLine(); } + else { + int num = m->getNumNames(it->second); + sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num)); + if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); } + } + } + } - if (error) { m->control_pressed = true; } - + if (error) { m->control_pressed = true; } //sort by frequency sort(sequences.rbegin(), sequences.rend()); @@ -619,6 +766,37 @@ vector ChimeraPerseusCommand::readFiles(string 
inputFile, string name){ return sequences; } + catch(exception& e) { + m->errorOut(e, "ChimeraPerseusCommand", "readFiles"); + exit(1); + } +} +//********************************************************************************************************************** +vector ChimeraPerseusCommand::readFiles(string inputFile, CountTable* ct){ + try { + //read fasta file and create sequenceData structure - checking for file mismatches + vector sequences; + ifstream in; + m->openInputFile(inputFile, in); + alignLength = 0; + + while (!in.eof()) { + Sequence temp(in); m->gobble(in); + + int count = ct->getNumSeqs(temp.getName()); + if (m->control_pressed) { break; } + else { + sequences.push_back(seqData(temp.getName(), temp.getUnaligned(), count)); + if (temp.getUnaligned().length() > alignLength) { alignLength = temp.getUnaligned().length(); } + } + } + in.close(); + + //sort by frequency + sort(sequences.rbegin(), sequences.rend()); + + return sequences; + } catch(exception& e) { m->errorOut(e, "ChimeraPerseusCommand", "getNamesFile"); exit(1); @@ -771,7 +949,7 @@ int ChimeraPerseusCommand::driver(string chimeraFileName, vector& seque } } /**************************************************************************************************/ -int ChimeraPerseusCommand::createProcessesGroups(SequenceParser& parser, string outputFName, string accnos, vector groups, string group, string fasta, string name) { +int ChimeraPerseusCommand::createProcessesGroups(string outputFName, string accnos, vector groups, string group, string fasta, string name) { try { vector processIDS; @@ -801,7 +979,7 @@ int ChimeraPerseusCommand::createProcessesGroups(SequenceParser& parser, string processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - num = driverGroups(parser, outputFName + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", lines[process].start, lines[process].end, groups); + 
num = driverGroups(outputFName + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", lines[process].start, lines[process].end, groups); //pass numSeqs to parent ofstream out; @@ -819,7 +997,7 @@ int ChimeraPerseusCommand::createProcessesGroups(SequenceParser& parser, string } //do my part - num = driverGroups(parser, outputFName, accnos, lines[0].start, lines[0].end, groups); + num = driverGroups(outputFName, accnos, lines[0].start, lines[0].end, groups); //force parent to wait until all the processes are done for (int i=0;i& uniqueNames, string outputFileName, string accnosFileName){ try { - map uniqueNames = parser.getAllSeqsMap(); map::iterator itUnique; int total = 0; diff --git a/chimeraperseuscommand.h b/chimeraperseuscommand.h index b6957e3..e2855d0 100644 --- a/chimeraperseuscommand.h +++ b/chimeraperseuscommand.h @@ -16,7 +16,9 @@ #include "mothur.h" #include "command.hpp" #include "sequenceparser.h" +#include "sequencecountparser.h" #include "myPerseus.h" +#include "counttable.h" /***********************************************************/ class ChimeraPerseusCommand : public Command { @@ -43,10 +45,12 @@ private: linePair(int i, int j) : start(i), end(j) {} }; - bool abort; - string fastafile, groupfile, outputDir, namefile; + bool abort, hasName, hasCount; + string fastafile, groupfile, countfile, outputDir, namefile; int processors, alignLength; double cutoff, alpha, beta; + SequenceParser* parser; + SequenceCountParser* cparser; vector outputNames; vector fastaFileNames; @@ -56,10 +60,11 @@ private: string getNamesFile(string&); int driver(string, vector&, string, int&); vector readFiles(string, string); - vector loadSequences(SequenceParser&, string); - int deconvoluteResults(SequenceParser&, string, string); - int driverGroups(SequenceParser&, string, string, int, int, vector); - int createProcessesGroups(SequenceParser&, string, string, vector, string, string, string); + vector readFiles(string inputFile, CountTable* ct); + vector 
loadSequences(string); + int deconvoluteResults(map&, string, string); + int driverGroups(string, string, int, int, vector); + int createProcessesGroups(string, string, vector, string, string, string); }; /**************************************************************************************************/ @@ -75,12 +80,13 @@ struct perseusData { MothurOut* m; int start; int end; + bool hasName, hasCount; int threadID, count, numChimeras; double alpha, beta, cutoff; vector groups; perseusData(){} - perseusData(double a, double b, double c, string o, string f, string n, string g, string ac, vector gr, MothurOut* mout, int st, int en, int tid) { + perseusData(bool hn, bool hc, double a, double b, double c, string o, string f, string n, string g, string ac, vector gr, MothurOut* mout, int st, int en, int tid) { alpha = a; beta = b; cutoff = c; @@ -94,6 +100,8 @@ struct perseusData { end = en; threadID = tid; groups = gr; + hasName = hn; + hasCount = hc; count = 0; numChimeras = 0; } @@ -114,38 +122,67 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ //parse fasta and name file by group SequenceParser* parser; - if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile); } - else { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile); } - + SequenceCountParser* cparser; + if (pDataArray->hasCount) { + CountTable* ct = new CountTable(); + ct->readTable(pDataArray->namefile); + cparser = new SequenceCountParser(pDataArray->fastafile, *ct); + delete ct; + }else { + if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile); } + else { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile); } + } + int totalSeqs = 0; int numChimeras = 0; for (int i = pDataArray->start; i < pDataArray->end; i++) { - int start = time(NULL); if (pDataArray->m->control_pressed) { delete parser; 
pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } + int start = time(NULL); if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Checking sequences from group " + pDataArray->groups[i] + "..."); pDataArray->m->mothurOutEndLine(); //vector sequences = loadSequences(parser, groups[i]); - same function below //////////////////////////////////////////////////////////////////////////////////////// - vector thisGroupsSeqs = parser->getSeqs(pDataArray->groups[i]); - map nameMap = parser->getNameMap(pDataArray->groups[i]); - map::iterator it; - - vector sequences; bool error = false; - - for (int j = 0; j < thisGroupsSeqs.size(); j++) { - - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } - - it = nameMap.find(thisGroupsSeqs[j].getName()); - if (it == nameMap.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[j].getName() + " is in your fasta file and not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } - else { - int num = pDataArray->m->getNumNames(it->second); - sequences.push_back(seqData(thisGroupsSeqs[j].getName(), thisGroupsSeqs[j].getUnaligned(), num)); - } - } + int alignLength = 0; + vector sequences; + if (pDataArray->hasCount) { + vector thisGroupsSeqs = cparser->getSeqs(pDataArray->groups[i]); + map counts = cparser->getCountTable(pDataArray->groups[i]); + map::iterator it; + + for (int i = 0; i < thisGroupsSeqs.size(); i++) { + + if (pDataArray->m->control_pressed) { break; } + + it = counts.find(thisGroupsSeqs[i].getName()); + if (it == counts.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + 
thisGroupsSeqs[i].getName() + " is in your fasta file and not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); } + else { + sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), it->second)); + if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); } + } + } + }else{ + vector thisGroupsSeqs = parser->getSeqs(pDataArray->groups[i]); + map nameMap = parser->getNameMap(pDataArray->groups[i]); + map::iterator it; + + for (int i = 0; i < thisGroupsSeqs.size(); i++) { + + if (pDataArray->m->control_pressed) { break; } + + it = nameMap.find(thisGroupsSeqs[i].getName()); + if (it == nameMap.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } + else { + int num = pDataArray->m->getNumNames(it->second); + sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num)); + if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); } + } + } + + } + if (error) { pDataArray->m->control_pressed = true; } @@ -153,7 +190,7 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ sort(sequences.rbegin(), sequences.rend()); //////////////////////////////////////////////////////////////////////////////////////// - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } //int numSeqs = driver((outputFName + groups[i]), sequences, (accnos+groups[i]), numChimeras); - same function below 
//////////////////////////////////////////////////////////////////////////////////////// @@ -184,7 +221,7 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ } int numSeqs = sequences.size(); - int alignLength = sequences[0].sequence.size(); + //int alignLength = sequences[0].sequence.size(); ofstream chimeraFile; ofstream accnosFile; @@ -200,7 +237,7 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ for(int j=0;jm->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } vector restricted = chimeras; @@ -217,7 +254,7 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ int comparisons = myPerseus.getAlignments(j, sequences, alignments, leftDiffs, leftMaps, rightDiffs, rightMaps, bestSingleIndex, bestSingleDiff, restricted); - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; 
} int minMismatchToChimera, leftParentBi, rightParentBi, breakPointBi; @@ -226,7 +263,7 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ if(comparisons >= 2){ minMismatchToChimera = myPerseus.getChimera(sequences, leftDiffs, rightDiffs, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight, restricted); - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } int minMismatchToTrimera = numeric_limits::max(); int leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB; @@ -234,12 +271,12 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ if(minMismatchToChimera >= 3 && comparisons >= 3){ minMismatchToTrimera = myPerseus.getTrimera(sequences, leftDiffs, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight, restricted); - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); 
chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } } double singleDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, sequences[bestSingleIndex].sequence, dummyA, dummyB, correctModel); - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } string type; string chimeraRefSeq; @@ -253,16 +290,16 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ chimeraRefSeq = myPerseus.stitchBimera(alignments, leftParentBi, rightParentBi, breakPointBi, leftMaps, rightMaps); } - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; }; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } double chimeraDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq, dummyA, dummyB, correctModel); - if (pDataArray->m->control_pressed) 
{ delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } double cIndex = chimeraDist;//modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq); double loonIndex = myPerseus.calcLoonIndex(sequences[j].sequence, sequences[leftParentBi].sequence, sequences[rightParentBi].sequence, breakPointBi, binMatrix); - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; } chimeraFile << j << '\t' << sequences[j].seqName << '\t' << bestSingleDiff << '\t' << bestSingleIndex << '\t' << sequences[bestSingleIndex].seqName << '\t'; chimeraFile << minMismatchToChimera << '\t' << leftParentBi << '\t' << rightParentBi << '\t' << sequences[leftParentBi].seqName << '\t' << sequences[rightParentBi].seqName << '\t'; @@ -304,11 +341,11 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){ pDataArray->m->appendFiles(accnosFileName, 
pDataArray->accnos); pDataArray->m->mothurRemove(accnosFileName); pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + pDataArray->groups[i] + "."); pDataArray->m->mothurOutEndLine(); - if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; } } pDataArray->count = totalSeqs; - delete parser; + if (pDataArray->hasCount) { delete cparser; } { delete parser; } return totalSeqs; } diff --git a/chimeraslayercommand.cpp b/chimeraslayercommand.cpp index 59dd0a5..bd9bdbf 100644 --- a/chimeraslayercommand.cpp +++ b/chimeraslayercommand.cpp @@ -11,14 +11,16 @@ #include "deconvolutecommand.h" #include "referencedb.h" #include "sequenceparser.h" +#include "counttable.h" //********************************************************************************************************************** vector ChimeraSlayerCommand::setParameters(){ try { CommandParameter ptemplate("reference", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptemplate); CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", 
"none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter pwindow("window", "Number", "", "50", "", "", "",false,false); parameters.push_back(pwindow); CommandParameter pksize("ksize", "Number", "", "7", "", "", "",false,false); parameters.push_back(pksize); CommandParameter pmatch("match", "Number", "", "5.0", "", "", "",false,false); parameters.push_back(pmatch); @@ -57,10 +59,11 @@ string ChimeraSlayerCommand::getHelpString(){ string helpString = ""; helpString += "The chimera.slayer command reads a fastafile and referencefile and outputs potentially chimeric sequences.\n"; helpString += "This command was modeled after the chimeraSlayer written by the Broad Institute.\n"; - helpString += "The chimera.slayer command parameters are fasta, name, template, processors, trim, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment, numwanted, blastlocation and realign.\n"; + helpString += "The chimera.slayer command parameters are fasta, name, group, template, processors, trim, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment, numwanted, blastlocation and realign.\n"; helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n"; helpString += "The name parameter allows you to provide a name file, if you are using reference=self. \n"; helpString += "The group parameter allows you to provide a group file. The group file can be used with a namesfile and reference=self. When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n"; + helpString += "The count parameter allows you to provide a count file. The count file reference=self. 
If your count file contains group information, when checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n"; helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n"; helpString += "The reference parameter allows you to enter a reference file containing known non-chimeric sequences, and is required. You may also set template=self, in this case the abundant sequences will be used as potential parents. \n"; helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"; @@ -139,6 +142,8 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { try { abort = false; calledHelp = false; ReferenceDB* rdb = ReferenceDB::getInstance(); + hasCount = false; + hasName = false; //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } @@ -247,9 +252,8 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { //check for required parameters - bool hasName = true; namefile = validParameter.validFile(parameters, "name", false); - if (namefile == "not found") { namefile = ""; hasName = false; } + if (namefile == "not found") { namefile = ""; } else { m->splitAtDash(namefile, nameFileNames); @@ -316,12 +320,91 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { } } } + } + + if (nameFileNames.size() != 0) { hasName = true; } + + //check for required parameters + vector countfileNames; + countfile = validParameter.validFile(parameters, "count", false); + if (countfile == "not found") { + countfile = ""; + }else { + m->splitAtDash(countfile, countfileNames); - //make sure there is at least one valid file left - if (nameFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid name files."); m->mothurOutEndLine(); abort = true; } + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < 
countfileNames.size(); i++) { + + bool ignore = false; + if (countfileNames[i] == "current") { + countfileNames[i] = m->getCountTableFile(); + if (nameFileNames[i] != "") { m->mothurOut("Using " + countfileNames[i] + " as input file for the count parameter where you had given current."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current count file, ignoring current."); m->mothurOutEndLine(); ignore=true; + //erase from file list + countfileNames.erase(countfileNames.begin()+i); + i--; + } + } + + if (!ignore) { + + if (inputDir != "") { + string path = m->hasPath(countfileNames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { countfileNames[i] = inputDir + countfileNames[i]; } + } + + int ableToOpen; + ifstream in; + + ableToOpen = m->openInputFile(countfileNames[i], in, "noerror"); + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(countfileNames[i]); + m->mothurOut("Unable to open " + countfileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + countfileNames[i] = tryPath; + } + } + + if (ableToOpen == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(countfileNames[i]); + m->mothurOut("Unable to open " + countfileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + countfileNames[i] = tryPath; + } + } + + in.close(); + + if (ableToOpen == 1) { + m->mothurOut("Unable to open " + countfileNames[i] + ". 
It will be disregarded."); m->mothurOutEndLine(); + //erase from file list + countfileNames.erase(countfileNames.begin()+i); + i--; + }else { + m->setCountTableFile(countfileNames[i]); + } + } + } } - - if (hasName && (nameFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of namefiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; } + + if (countfileNames.size() != 0) { hasCount = true; } + + //make sure there is at least one valid file left + if (hasName && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } + + if (!hasName && hasCount) { nameFileNames = countfileNames; } + + if ((hasCount || hasName) && (nameFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of name or count files does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; } bool hasGroup = true; groupfile = validParameter.validFile(parameters, "group", false); @@ -399,7 +482,7 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { if (hasGroup && (groupFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of groupfiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; } - + if (hasGroup && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or group."); m->mothurOutEndLine(); abort = true; } //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -449,6 +532,12 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { m->mothurOutEndLine(); save = false; } + }else if (hasCount) { templatefile = "self"; + if (save) { + m->mothurOut("[WARNING]: You can't save reference=self, ignoring save."); 
+ m->mothurOutEndLine(); + save = false; + } } else { if (rdb->referenceSeqs.size() != 0) { @@ -551,7 +640,7 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { if ((search != "blast") && (search != "kmer")) { m->mothurOut(search + " is not a valid search."); m->mothurOutEndLine(); abort = true; } - if (hasName && (templatefile != "self")) { m->mothurOut("You have provided a namefile and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; } + if ((hasName || hasCount) && (templatefile != "self")) { m->mothurOut("You have provided a namefile or countfile and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; } if (hasGroup && (templatefile != "self")) { m->mothurOut("You have provided a group file and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; } //until we resolve the issue 10-18-11 @@ -599,13 +688,23 @@ int ChimeraSlayerCommand::execute(){ map fileGroup; fileToPriority[fastaFileNames[s]] = priority; //default fileGroup[fastaFileNames[s]] = "noGroup"; - SequenceParser* parser = NULL; + map uniqueNames; int totalChimeras = 0; lines.clear(); - if (templatefile == "self") { setUpForSelfReference(parser, fileGroup, fileToPriority, s); } + if (templatefile == "self") { + if (hasCount) { + SequenceCountParser* parser = NULL; + setUpForSelfReference(parser, fileGroup, fileToPriority, s); + if (parser != NULL) { uniqueNames = parser->getAllSeqsMap(); delete parser; } + }else { + SequenceParser* parser = NULL; + setUpForSelfReference(parser, fileGroup, fileToPriority, s); + if (parser != NULL) { uniqueNames = parser->getAllSeqsMap(); delete parser; } + } + } - if (m->control_pressed) { if (parser != NULL) { delete parser; } for (int j = 0; j < outputNames.size(); j++) { 
m->mothurRemove(outputNames[j]); } return 0; } + if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } if (fileToPriority.size() == 1) { //you running without a groupfile itFile = fileToPriority.begin(); @@ -637,7 +736,7 @@ int ChimeraSlayerCommand::execute(){ if(processors == 1){ numSeqs = driver(lines[0], outputFileName, thisFastaName, accnosFileName, trimFastaFileName, thisPriority); } else{ numSeqs = createProcesses(outputFileName, thisFastaName, accnosFileName, trimFastaFileName, thisPriority); } - if (m->control_pressed) { if (parser != NULL) { delete parser; } outputTypes.clear(); if (trim) { m->mothurRemove(trimFastaFileName); } m->mothurRemove(outputFileName); m->mothurRemove(accnosFileName); for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + if (m->control_pressed) { outputTypes.clear(); if (trim) { m->mothurRemove(trimFastaFileName); } m->mothurRemove(outputFileName); m->mothurRemove(accnosFileName); for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } #endif }else { //you have provided a groupfile #ifdef USE_MPI @@ -653,16 +752,13 @@ int ChimeraSlayerCommand::execute(){ if (pid == 0) { #endif - - totalChimeras = deconvoluteResults(parser, outputFileName, accnosFileName, trimFastaFileName); + totalChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName, trimFastaFileName); m->mothurOutEndLine(); m->mothurOut(toString(totalChimeras) + " chimera found."); m->mothurOutEndLine(); #ifdef USE_MPI } MPI_Barrier(MPI_COMM_WORLD); //make everyone wait #endif } - - if (parser != NULL) { delete parser; } m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); } @@ -919,9 +1015,8 @@ int ChimeraSlayerCommand::MPIExecute(string inputFile, string outputFileName, st } } 
//********************************************************************************************************************** -int ChimeraSlayerCommand::deconvoluteResults(SequenceParser* parser, string outputFileName, string accnosFileName, string trimFileName){ +int ChimeraSlayerCommand::deconvoluteResults(map& uniqueNames, string outputFileName, string accnosFileName, string trimFileName){ try { - map uniqueNames = parser->getAllSeqsMap(); map::iterator itUnique; int total = 0; @@ -1169,7 +1264,51 @@ int ChimeraSlayerCommand::setUpForSelfReference(SequenceParser*& parser, map& fileGroup, map >& fileToPriority, int s){ + try { + fileGroup.clear(); + fileToPriority.clear(); + + string nameFile = ""; + if (nameFileNames.size() != 0) { //you provided a namefile and we don't need to create one + nameFile = nameFileNames[s]; + }else { m->control_pressed = true; return 0; } + + CountTable ct; + if (!ct.testGroups(nameFile)) { + if (processors != 1) { m->mothurOut("When using template=self, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; } + + //sort fastafile by abundance, returns new sorted fastafile name + m->mothurOut("Sorting fastafile according to abundance..."); cout.flush(); + priority = sortFastaFile(fastaFileNames[s], nameFile); + m->mothurOut("Done."); m->mothurOutEndLine(); + + fileToPriority[fastaFileNames[s]] = priority; + fileGroup[fastaFileNames[s]] = "noGroup"; + }else { + //Parse sequences by group + parser = new SequenceCountParser(nameFile, fastaFileNames[s]); + vector groups = parser->getNamesOfGroups(); + + for (int i = 0; i < groups.size(); i++) { + vector thisGroupsSeqs = parser->getSeqs(groups[i]); + map thisGroupsMap = parser->getCountTable(groups[i]); + string newFastaFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + groups[i] + "-sortedTemp.fasta"; + sortFastaFile(thisGroupsSeqs, thisGroupsMap, newFastaFile); + fileToPriority[newFastaFile] = thisGroupsMap; + fileGroup[newFastaFile] = 
groups[i]; + } + } + + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayerCommand", "setUpForSelfReference"); + exit(1); + } +} //********************************************************************************************************************** string ChimeraSlayerCommand::getNamesFile(string& inputFile){ try { @@ -1820,9 +1959,22 @@ map ChimeraSlayerCommand::sortFastaFile(string fastaFile, string na in.close(); - //read namefile + //read namefile or countfile vector nameMapCount; - int error = m->readNames(nameFile, nameMapCount, seqs); + int error; + if (hasCount) { + CountTable ct; + ct.readTable(nameFile); + + for(map::iterator it = seqs.begin(); it != seqs.end(); it++) { + int num = ct.getNumSeqs(it->first); + if (num == 0) { error = 1; } + else { + seqPriorityNode temp(num, it->second, it->first); + nameMapCount.push_back(temp); + } + } + }else { error = m->readNames(nameFile, nameMapCount, seqs); } if (m->control_pressed) { return nameAbund; } @@ -1904,4 +2056,51 @@ map ChimeraSlayerCommand::sortFastaFile(vector& thisseqs, } } /**************************************************************************************************/ +int ChimeraSlayerCommand::sortFastaFile(vector& thisseqs, map& countMap, string newFile) { + try { + vector nameVector; + + //read through fastafile and store info + map seqs; + + for (int i = 0; i < thisseqs.size(); i++) { + + if (m->control_pressed) { return 0; } + + map::iterator itCountMap = countMap.find(thisseqs[i].getName()); + + if (itCountMap == countMap.end()){ + m->control_pressed = true; + m->mothurOut("[ERROR]: " + thisseqs[i].getName() + " is in your fastafile, but is not in your count file, please correct."); m->mothurOutEndLine(); + }else { + seqPriorityNode temp(itCountMap->second, thisseqs[i].getAligned(), thisseqs[i].getName()); + nameVector.push_back(temp); + } + } + + //sort by num represented + sort(nameVector.begin(), nameVector.end(), compareSeqPriorityNodes); + + if (m->control_pressed) 
{ return 0; } + + if (thisseqs.size() != nameVector.size()) { m->mothurOut( "The number of sequences in your fastafile does not match the number of sequences in your count file, aborting."); m->mothurOutEndLine(); m->control_pressed = true; return 0; } + + ofstream out; + m->openOutputFile(newFile, out); + + //print new file in order of + for (int i = 0; i < nameVector.size(); i++) { + out << ">" << nameVector[i].name << endl << nameVector[i].seq << endl; + } + out.close(); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayerCommand", "sortFastaFile"); + exit(1); + } +} +/**************************************************************************************************/ diff --git a/chimeraslayercommand.h b/chimeraslayercommand.h index 82e1228..4d53c5c 100644 --- a/chimeraslayercommand.h +++ b/chimeraslayercommand.h @@ -15,6 +15,7 @@ #include "chimera.h" #include "chimeraslayer.h" #include "sequenceparser.h" +#include "sequencecountparser.h" /***********************************************************/ @@ -51,12 +52,14 @@ private: int divideInHalf(Sequence, string&, string&); map sortFastaFile(string, string); map sortFastaFile(vector&, map&, string newFile); + int sortFastaFile(vector&, map&, string newFile); string getNamesFile(string&); //int setupChimera(string,); int MPIExecute(string, string, string, string, map&); - int deconvoluteResults(SequenceParser*, string, string, string); + int deconvoluteResults(map&, string, string, string); map priority; int setUpForSelfReference(SequenceParser*&, map&, map >&, int); + int setUpForSelfReference(SequenceCountParser*&, map&, map >&, int); int driverGroups(string, string, string, map >&, map&); int createProcessesGroups(string, string, string, map >&, map&); int MPIExecuteGroups(string, string, string, map >&, map&); @@ -66,8 +69,8 @@ private: int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector&, string, map&, bool); #endif - bool abort, realign, trim, trimera, save; - 
string fastafile, groupfile, templatefile, outputDir, search, namefile, blastlocation; + bool abort, realign, trim, trimera, save, hasName, hasCount; + string fastafile, groupfile, templatefile, outputDir, search, namefile, countfile, blastlocation; int processors, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity, minCoverage, minBS, minSNP, numSeqs, templateSeqsLength; float divR; diff --git a/chimerauchimecommand.cpp b/chimerauchimecommand.cpp index bd31c19..ae01190 100644 --- a/chimerauchimecommand.cpp +++ b/chimerauchimecommand.cpp @@ -19,8 +19,9 @@ vector ChimeraUchimeCommand::setParameters(){ try { CommandParameter ptemplate("reference", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptemplate); CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -34,6 +35,8 @@ vector ChimeraUchimeCommand::setParameters(){ CommandParameter pchunks("chunks", 
"Number", "", "4", "", "", "",false,false); parameters.push_back(pchunks); CommandParameter pminchunk("minchunk", "Number", "", "64", "", "", "",false,false); parameters.push_back(pminchunk); CommandParameter pidsmoothwindow("idsmoothwindow", "Number", "", "32", "", "", "",false,false); parameters.push_back(pidsmoothwindow); + CommandParameter pdups("dereplicate", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pdups); + //CommandParameter pminsmoothid("minsmoothid", "Number", "", "0.95", "", "", "",false,false); parameters.push_back(pminsmoothid); CommandParameter pmaxp("maxp", "Number", "", "2", "", "", "",false,false); parameters.push_back(pmaxp); CommandParameter pskipgaps("skipgaps", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pskipgaps); @@ -58,11 +61,13 @@ string ChimeraUchimeCommand::getHelpString(){ string helpString = ""; helpString += "The chimera.uchime command reads a fastafile and referencefile and outputs potentially chimeric sequences.\n"; helpString += "This command is a wrapper for uchime written by Robert C. Edgar.\n"; - helpString += "The chimera.uchime command parameters are fasta, name, reference, processors, abskew, chimealns, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, skipgaps, skipgaps2, minlen, maxlen, ucl and queryfact.\n"; + helpString += "The chimera.uchime command parameters are fasta, name, count, reference, processors, dereplicate, abskew, chimealns, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, skipgaps, skipgaps2, minlen, maxlen, ucl and queryfact.\n"; helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n"; helpString += "The name parameter allows you to provide a name file, if you are using template=self. 
\n"; + helpString += "The count parameter allows you to provide a count file, if you are using template=self. \n"; helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n"; helpString += "The group parameter allows you to provide a group file. The group file can be used with a namesfile and reference=self. When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n"; + helpString += "If the dereplicate parameter is false, then if one group finds the seqeunce to be chimeric, then all groups find it to be chimeric, default=f.\n"; helpString += "The reference parameter allows you to enter a reference file containing known non-chimeric sequences, and is required. You may also set template=self, in this case the abundant sequences will be used as potential parents. \n"; helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"; helpString += "The abskew parameter can only be used with template=self. Minimum abundance skew. Default 1.9. 
Abundance skew is: min [ abund(parent1), abund(parent2) ] / abund(query).\n"; @@ -137,7 +142,7 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(){ //*************************************************************************************************************** ChimeraUchimeCommand::ChimeraUchimeCommand(string option) { try { - abort = false; calledHelp = false; + abort = false; calledHelp = false; hasName=false; hasCount=false; ReferenceDB* rdb = ReferenceDB::getInstance(); //allow user to run help @@ -247,9 +252,8 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(string option) { //check for required parameters - bool hasName = true; namefile = validParameter.validFile(parameters, "name", false); - if (namefile == "not found") { namefile = ""; hasName = false; } + if (namefile == "not found") { namefile = ""; } else { m->splitAtDash(namefile, nameFileNames); @@ -316,12 +320,91 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(string option) { } } } + } + + if (nameFileNames.size() != 0) { hasName = true; } + + //check for required parameters + vector countfileNames; + countfile = validParameter.validFile(parameters, "count", false); + if (countfile == "not found") { + countfile = ""; + }else { + m->splitAtDash(countfile, countfileNames); - //make sure there is at least one valid file left - if (nameFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid name files."); m->mothurOutEndLine(); abort = true; } + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < countfileNames.size(); i++) { + + bool ignore = false; + if (countfileNames[i] == "current") { + countfileNames[i] = m->getCountTableFile(); + if (nameFileNames[i] != "") { m->mothurOut("Using " + countfileNames[i] + " as input file for the count parameter where you had given current."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current count file, ignoring current."); m->mothurOutEndLine(); ignore=true; + //erase from file list + 
countfileNames.erase(countfileNames.begin()+i); + i--; + } + } + + if (!ignore) { + + if (inputDir != "") { + string path = m->hasPath(countfileNames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { countfileNames[i] = inputDir + countfileNames[i]; } + } + + int ableToOpen; + ifstream in; + + ableToOpen = m->openInputFile(countfileNames[i], in, "noerror"); + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(countfileNames[i]); + m->mothurOut("Unable to open " + countfileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + countfileNames[i] = tryPath; + } + } + + if (ableToOpen == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(countfileNames[i]); + m->mothurOut("Unable to open " + countfileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + countfileNames[i] = tryPath; + } + } + + in.close(); + + if (ableToOpen == 1) { + m->mothurOut("Unable to open " + countfileNames[i] + ". 
It will be disregarded."); m->mothurOutEndLine(); + //erase from file list + countfileNames.erase(countfileNames.begin()+i); + i--; + }else { + m->setCountTableFile(countfileNames[i]); + } + } + } } - - if (hasName && (nameFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of namefiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; } + + if (countfileNames.size() != 0) { hasCount = true; } + + //make sure there is at least one valid file left + if (hasName && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } + + if (!hasName && hasCount) { nameFileNames = countfileNames; } + + if ((hasCount || hasName) && (nameFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of name or count files does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; } bool hasGroup = true; groupfile = validParameter.validFile(parameters, "group", false); @@ -399,6 +482,10 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(string option) { if (hasGroup && (groupFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of groupfiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; } + if (hasGroup && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or group."); m->mothurOutEndLine(); abort = true; } + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } + //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -427,6 +514,7 @@ 
ChimeraUchimeCommand::ChimeraUchimeCommand(string option) { } } }else if (hasName) { templatefile = "self"; } + else if (hasCount) { templatefile = "self"; } else { if (rdb->getSavedReference() != "") { templatefile = rdb->getSavedReference(); @@ -472,6 +560,15 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(string option) { temp = validParameter.validFile(parameters, "skipgaps2", false); if (temp == "not found") { temp = "t"; } skipgaps2 = m->isTrue(temp); + + string usedDups = "false"; + temp = validParameter.validFile(parameters, "dereplicate", false); + if (temp == "not found") { + if (groupfile != "") { temp = "false"; } + else { temp = "true"; usedDups = ""; } + } + dups = m->isTrue(temp); + if (hasName && (templatefile != "self")) { m->mothurOut("You have provided a namefile and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; } if (hasGroup && (templatefile != "self")) { m->mothurOut("You have provided a group file and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; } @@ -533,7 +630,8 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(string option) { int ChimeraUchimeCommand::execute(){ try{ - if (abort == true) { if (calledHelp) { return 0; } return 2; } + + if (abort == true) { if (calledHelp) { return 0; } return 2; } m->mothurOut("\nuchime by Robert C. 
Edgar\nhttp://drive5.com/uchime\nThis code is donated to the public domain.\n\n"); @@ -551,9 +649,14 @@ int ChimeraUchimeCommand::execute(){ //you provided a groupfile string groupFile = ""; - if (groupFileNames.size() != 0) { groupFile = groupFileNames[s]; } + bool hasGroup = false; + if (groupFileNames.size() != 0) { groupFile = groupFileNames[s]; hasGroup = true; } + else if (hasCount) { + CountTable ct; + if (ct.testGroups(nameFileNames[s])) { hasGroup = true; } + } - if ((templatefile == "self") && (groupFile == "")) { //you want to run uchime with a reference template + if ((templatefile == "self") && (!hasGroup)) { //you want to run uchime with a template=self and no groups if (processors != 1) { m->mothurOut("When using template=self, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; } if (nameFileNames.size() != 0) { //you provided a namefile and we don't need to create one @@ -565,7 +668,21 @@ int ChimeraUchimeCommand::execute(){ //read namefile vector nameMapCount; - int error = m->readNames(nameFile, nameMapCount, seqs); if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + int error; + if (hasCount) { + CountTable ct; + ct.readTable(nameFile); + for(map::iterator it = seqs.begin(); it != seqs.end(); it++) { + int num = ct.getNumSeqs(it->first); + if (num == 0) { error = 1; } + else { + seqPriorityNode temp(num, it->second, it->first); + nameMapCount.push_back(temp); + } + } + }else { + error = m->readNames(nameFile, nameMapCount, seqs); if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + } if (error == 1) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } if (seqs.size() != nameMapCount.size()) { m->mothurOut( "The number of sequences in your fastafile does not match the number of sequences in your namefile, aborting."); 
m->mothurOutEndLine(); for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } @@ -575,14 +692,23 @@ int ChimeraUchimeCommand::execute(){ if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } - if (groupFile != "") { + if (hasGroup) { if (nameFileNames.size() != 0) { //you provided a namefile and we don't need to create one nameFile = nameFileNames[s]; }else { nameFile = getNamesFile(fastaFileNames[s]); } //Parse sequences by group - SequenceParser parser(groupFile, fastaFileNames[s], nameFile); - vector groups = parser.getNamesOfGroups(); + vector groups; + map uniqueNames; + if (hasCount) { + cparser = new SequenceCountParser(nameFile, fastaFileNames[s]); + groups = cparser->getNamesOfGroups(); + uniqueNames = cparser->getAllSeqsMap(); + }else{ + sparser = new SequenceParser(groupFile, fastaFileNames[s], nameFile); + groups = sparser->getNamesOfGroups(); + uniqueNames = sparser->getAllSeqsMap(); + } if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } @@ -593,16 +719,20 @@ int ChimeraUchimeCommand::execute(){ if (chimealns) { m->openOutputFile(alnsFileName, out2); out2.close(); } int totalSeqs = 0; - if(processors == 1) { totalSeqs = driverGroups(parser, outputFileName, newFasta, accnosFileName, alnsFileName, 0, groups.size(), groups); } - else { totalSeqs = createProcessesGroups(parser, outputFileName, newFasta, accnosFileName, alnsFileName, groups, nameFile, groupFile, fastaFileNames[s]); } + if(processors == 1) { totalSeqs = driverGroups(outputFileName, newFasta, accnosFileName, alnsFileName, 0, groups.size(), groups); } + else { totalSeqs = createProcessesGroups(outputFileName, newFasta, accnosFileName, alnsFileName, groups, nameFile, groupFile, fastaFileNames[s]); } if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } - - 
int totalChimeras = deconvoluteResults(parser, outputFileName, accnosFileName, alnsFileName); - - m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(totalSeqs) + " sequences. " + toString(totalChimeras) + " chimeras were found."); m->mothurOutEndLine(); - m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine(); + if (hasCount) { delete cparser; } + else { delete sparser; } + + if (!dups) { + int totalChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName, alnsFileName); + m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(totalSeqs) + " sequences. " + toString(totalChimeras) + " chimeras were found."); m->mothurOutEndLine(); + m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine(); + } + if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } }else{ @@ -657,9 +787,8 @@ int ChimeraUchimeCommand::execute(){ } } //********************************************************************************************************************** -int ChimeraUchimeCommand::deconvoluteResults(SequenceParser& parser, string outputFileName, string accnosFileName, string alnsFileName){ +int ChimeraUchimeCommand::deconvoluteResults(map& uniqueNames, string outputFileName, string accnosFileName, string alnsFileName){ try { - map uniqueNames = parser.getAllSeqsMap(); map::iterator itUnique; int total = 0; @@ -685,7 +814,7 @@ int ChimeraUchimeCommand::deconvoluteResults(SequenceParser& parser, string outp //find unique name itUnique = uniqueNames.find(name); - if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing accnos results. 
Cannot find "+ name + "."); m->mothurOutEndLine(); m->control_pressed = true; } + if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing accnos results. Cannot find " + name + "."); m->mothurOutEndLine(); m->control_pressed = true; } else { itChimeras = chimerasInFile.find((itUnique->second)); @@ -999,7 +1128,7 @@ string ChimeraUchimeCommand::getNamesFile(string& inputFile){ } } //********************************************************************************************************************** -int ChimeraUchimeCommand::driverGroups(SequenceParser& parser, string outputFName, string filename, string accnos, string alns, int start, int end, vector groups){ +int ChimeraUchimeCommand::driverGroups(string outputFName, string filename, string accnos, string alns, int start, int end, vector groups){ try { int totalSeqs = 0; @@ -1007,8 +1136,10 @@ int ChimeraUchimeCommand::driverGroups(SequenceParser& parser, string outputFNam for (int i = start; i < end; i++) { int start = time(NULL); if (m->control_pressed) { return 0; } - - int error = parser.getSeqs(groups[i], filename, true); if ((error == 1) || m->control_pressed) { return 0; } + + int error; + if (hasCount) { error = cparser->getSeqs(groups[i], filename, true); if ((error == 1) || m->control_pressed) { return 0; } } + else { error = sparser->getSeqs(groups[i], filename, true); if ((error == 1) || m->control_pressed) { return 0; } } int numSeqs = driver((outputFName + groups[i]), filename, (accnos+ groups[i]), (alns+ groups[i]), numChimeras); totalSeqs += numSeqs; @@ -1026,7 +1157,6 @@ int ChimeraUchimeCommand::driverGroups(SequenceParser& parser, string outputFNam m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + groups[i] + "."); m->mothurOutEndLine(); } - return totalSeqs; } @@ -1052,29 +1182,20 @@ int ChimeraUchimeCommand::driver(string outputFName, string filename, string acc vector cPara; 
string uchimeCommand = uchimeLocation; -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) - uchimeCommand += " "; -#else - uchimeCommand = "\"" + uchimeCommand + "\""; -#endif - - char* tempUchime; + uchimeCommand = "\"" + uchimeCommand + "\" "; + + char* tempUchime; tempUchime= new char[uchimeCommand.length()+1]; *tempUchime = '\0'; strncat(tempUchime, uchimeCommand.c_str(), uchimeCommand.length()); cPara.push_back(tempUchime); - char* tempIn = new char[8]; - *tempIn = '\0'; strncat(tempIn, "--input", 7); - //strcpy(tempIn, "--input"); - cPara.push_back(tempIn); - char* temp = new char[filename.length()+1]; - *temp = '\0'; strncat(temp, filename.c_str(), filename.length()); - //strcpy(temp, filename.c_str()); - cPara.push_back(temp); - - //are you using a reference file + //are you using a reference file if (templatefile != "self") { + string outputFileName = filename.substr(1, filename.length()-2) + ".uchime_formatted"; + prepFile(filename.substr(1, filename.length()-2), outputFileName); + filename = outputFileName; + filename = "\"" + filename + "\""; //add reference file char* tempRef = new char[5]; //strcpy(tempRef, "--db"); @@ -1086,6 +1207,15 @@ int ChimeraUchimeCommand::driver(string outputFName, string filename, string acc cPara.push_back(tempR); } + char* tempIn = new char[8]; + *tempIn = '\0'; strncat(tempIn, "--input", 7); + //strcpy(tempIn, "--input"); + cPara.push_back(tempIn); + char* temp = new char[filename.length()+1]; + *temp = '\0'; strncat(temp, filename.c_str(), filename.length()); + //strcpy(temp, filename.c_str()); + cPara.push_back(temp); + char* tempO = new char[12]; *tempO = '\0'; strncat(tempO, "--uchimeout", 11); //strcpy(tempO, "--uchimeout"); @@ -1339,6 +1469,8 @@ int ChimeraUchimeCommand::driver(string outputFName, string filename, string acc in.close(); out.close(); + //if (templatefile != "self") { m->mothurRemove(filename); } + return num; } catch(exception& e) { @@ -1347,6 
+1479,34 @@ int ChimeraUchimeCommand::driver(string outputFName, string filename, string acc } } /**************************************************************************************************/ +//uchime can't handle some of the things allowed in mothurs fasta files. This functions "cleans up" the file. +int ChimeraUchimeCommand::prepFile(string filename, string output) { + try { + + ifstream in; + m->openInputFile(filename, in); + + ofstream out; + m->openOutputFile(output, out); + + while (!in.eof()) { + if (m->control_pressed) { break; } + + Sequence seq(in); m->gobble(in); + + if (seq.getName() != "") { seq.printSequence(out); } + } + in.close(); + out.close(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ChimeraUchimeCommand", "prepFile"); + exit(1); + } +} +/**************************************************************************************************/ int ChimeraUchimeCommand::createProcesses(string outputFileName, string filename, string accnos, string alns, int& numChimeras) { try { @@ -1467,7 +1627,7 @@ int ChimeraUchimeCommand::createProcesses(string outputFileName, string filename string extension = toString(i) + ".temp"; uchimeData* tempUchime = new uchimeData(outputFileName+extension, uchimeLocation, templatefile, files[i], "", "", "", accnos+extension, alns+extension, dummy, m, 0, 0, i); - tempUchime->setBooleans(useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract); + tempUchime->setBooleans(useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract, hasCount); tempUchime->setVariables(abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract); pDataArray.push_back(tempUchime); @@ -1519,7 +1679,7 @@ int 
ChimeraUchimeCommand::createProcesses(string outputFileName, string filename } /**************************************************************************************************/ -int ChimeraUchimeCommand::createProcessesGroups(SequenceParser& parser, string outputFName, string filename, string accnos, string alns, vector groups, string nameFile, string groupFile, string fastaFile) { +int ChimeraUchimeCommand::createProcessesGroups(string outputFName, string filename, string accnos, string alns, vector groups, string nameFile, string groupFile, string fastaFile) { try { processIDS.clear(); @@ -1549,7 +1709,7 @@ int ChimeraUchimeCommand::createProcessesGroups(SequenceParser& parser, string o processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - num = driverGroups(parser, outputFName + toString(getpid()) + ".temp", filename + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", alns + toString(getpid()) + ".temp", lines[process].start, lines[process].end, groups); + num = driverGroups(outputFName + toString(getpid()) + ".temp", filename + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", alns + toString(getpid()) + ".temp", lines[process].start, lines[process].end, groups); //pass numSeqs to parent ofstream out; @@ -1567,7 +1727,7 @@ int ChimeraUchimeCommand::createProcessesGroups(SequenceParser& parser, string o } //do my part - num = driverGroups(parser, outputFName, filename, accnos, alns, lines[0].start, lines[0].end, groups); + num = driverGroups(outputFName, filename, accnos, alns, lines[0].start, lines[0].end, groups); //force parent to wait until all the processes are done for (int i=0;isetBooleans(useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract); + tempUchime->setBooleans(useAbskew, 
chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract, hasCount); tempUchime->setVariables(abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract); pDataArray.push_back(tempUchime); @@ -1612,7 +1772,7 @@ int ChimeraUchimeCommand::createProcessesGroups(SequenceParser& parser, string o //using the main process as a worker saves time and memory - num = driverGroups(parser, outputFName, filename, accnos, alns, lines[0].start, lines[0].end, groups); + num = driverGroups(outputFName, filename, accnos, alns, lines[0].start, lines[0].end, groups); //Wait until all threads have terminated. WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE); diff --git a/chimerauchimecommand.h b/chimerauchimecommand.h index 3ca7939..f0c30d0 100644 --- a/chimerauchimecommand.h +++ b/chimerauchimecommand.h @@ -14,6 +14,8 @@ #include "mothur.h" #include "command.hpp" #include "sequenceparser.h" +#include "counttable.h" +#include "sequencecountparser.h" /***********************************************************/ @@ -45,11 +47,12 @@ private: int driver(string, string, string, string, int&); int createProcesses(string, string, string, string, int&); - bool abort, useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract; - string fastafile, groupfile, templatefile, outputDir, namefile, abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract, uchimeLocation; + bool abort, useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract, hasCount, hasName, dups; + string fastafile, 
groupfile, templatefile, outputDir, namefile, countfile, abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract, uchimeLocation; int processors; - + SequenceParser* sparser; + SequenceCountParser* cparser; vector outputNames; vector fastaFileNames; vector nameFileNames; @@ -58,9 +61,10 @@ private: string getNamesFile(string&); int readFasta(string, map&); int printFile(vector&, string); - int deconvoluteResults(SequenceParser&, string, string, string); - int driverGroups(SequenceParser&, string, string, string, string, int, int, vector); - int createProcessesGroups(SequenceParser&, string, string, string, string, vector, string, string, string); + int deconvoluteResults(map&, string, string, string); + int driverGroups(string, string, string, string, int, int, vector); + int createProcessesGroups(string, string, string, string, vector, string, string, string); + int prepFile(string filename, string); }; @@ -81,7 +85,7 @@ struct uchimeData { int end; int threadID, count, numChimeras; vector groups; - bool useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract; + bool useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract, hasCount; string abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract; uchimeData(){} @@ -103,7 +107,7 @@ struct uchimeData { numChimeras = 0; uchimeLocation = uloc; } - void setBooleans(bool Abskew, bool calns, bool MinH, bool Mindiv, bool Xn, bool Dn, bool Xa, bool Chunks, bool Minchunk, bool Idsmoothwindow, bool Minsmoothid, bool Maxp, bool skipgap, bool skipgap2, bool Minlen, bool Maxlen, bool uc, bool Queryfract) { + void setBooleans(bool Abskew, bool 
calns, bool MinH, bool Mindiv, bool Xn, bool Dn, bool Xa, bool Chunks, bool Minchunk, bool Idsmoothwindow, bool Minsmoothid, bool Maxp, bool skipgap, bool skipgap2, bool Minlen, bool Maxlen, bool uc, bool Queryfract, bool hc) { useAbskew = Abskew; chimealns = calns; useMinH = MinH; @@ -122,6 +126,7 @@ struct uchimeData { useMaxlen = Maxlen; ucl = uc; useQueryfract = Queryfract; + hasCount = hc; } void setVariables(string abske, string min, string mindi, string x, string d, string xa2, string chunk, string minchun, string idsmoothwindo, string minsmoothi, string max, string minle, string maxle, string queryfrac) { @@ -163,16 +168,30 @@ static DWORD WINAPI MyUchimeThreadFunction(LPVOID lpParam){ //parse fasta and name file by group SequenceParser* parser; - if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile); } - else { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile); } + SequenceCountParser* cparser; + if (pDataArray->hasCount) { + CountTable* ct = new CountTable(); + ct->readTable(pDataArray->namefile); + cparser = new SequenceCountParser(pDataArray->fastafile, *ct); + delete ct; + }else { + if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile); } + else { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile); } + } int totalSeqs = 0; int numChimeras = 0; for (int i = pDataArray->start; i < pDataArray->end; i++) { - int start = time(NULL); if (pDataArray->m->control_pressed) { delete parser; return 0; } + int start = time(NULL); if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } return 0; } - int error = parser->getSeqs(pDataArray->groups[i], pDataArray->filename, true); if ((error == 1) || pDataArray->m->control_pressed) { delete parser; return 0; } + + int error; + if (pDataArray->hasCount) { + error = 
cparser->getSeqs(pDataArray->groups[i], pDataArray->filename, true); if ((error == 1) || pDataArray->m->control_pressed) { delete cparser; return 0; } + }else { + error = parser->getSeqs(pDataArray->groups[i], pDataArray->filename, true); if ((error == 1) || pDataArray->m->control_pressed) { delete parser; return 0; } + } //int numSeqs = driver((outputFName + groups[i]), filename, (accnos+ groups[i]), (alns+ groups[i]), numChimeras); //////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -410,7 +429,7 @@ static DWORD WINAPI MyUchimeThreadFunction(LPVOID lpParam){ filename = filename.substr(1, filename.length()-2); alns = alns.substr(1, alns.length()-2); - if (pDataArray->m->control_pressed) { delete parser; return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } return 0; } //create accnos file from uchime results ifstream in; @@ -447,7 +466,7 @@ static DWORD WINAPI MyUchimeThreadFunction(LPVOID lpParam){ totalSeqs += num; pDataArray->numChimeras += numChimeras; - if (pDataArray->m->control_pressed) { delete parser; return 0; } + if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } return 0; } //remove file made for uchime pDataArray->m->mothurRemove(filename); @@ -462,7 +481,7 @@ static DWORD WINAPI MyUchimeThreadFunction(LPVOID lpParam){ } pDataArray->count = totalSeqs; - delete parser; + if (pDataArray->hasCount) { delete cparser; } { delete parser; } return totalSeqs; } @@ -506,16 +525,31 @@ static DWORD WINAPI MyUchimeSeqsThreadFunction(LPVOID lpParam){ strncat(tempUchime, uchimeCommand.c_str(), uchimeCommand.length()); cPara.push_back(tempUchime); - char* tempIn = new char[8]; - *tempIn = '\0'; strncat(tempIn, "--input", 7); - //strcpy(tempIn, "--input"); - cPara.push_back(tempIn); - char* temp = new char[filename.length()+1]; - *temp = '\0'; strncat(temp, filename.c_str(), 
filename.length()); - //strcpy(temp, filename.c_str()); - cPara.push_back(temp); - - //add reference file + string outputFileName = filename.substr(1, filename.length()-2) + ".uchime_formatted"; + //prepFile(filename.substr(1, filename.length()-2), outputFileName); + //prepFile(filename, outputFileName); + /******************************************/ + ifstream in23; + pDataArray->m->openInputFile((filename.substr(1, filename.length()-2)), in23); + + ofstream out23; + pDataArray->m->openOutputFile(outputFileName, out23); + + while (!in23.eof()) { + if (pDataArray->m->control_pressed) { break; } + + Sequence seq(in23); pDataArray->m->gobble(in23); + + if (seq.getName() != "") { seq.printSequence(out23); } + } + in23.close(); + out23.close(); + /******************************************/ + + filename = outputFileName; + filename = "\"" + filename + "\""; + + //add reference file char* tempRef = new char[5]; //strcpy(tempRef, "--db"); *tempRef = '\0'; strncat(tempRef, "--db", 4); @@ -524,6 +558,15 @@ static DWORD WINAPI MyUchimeSeqsThreadFunction(LPVOID lpParam){ //strcpy(tempR, templatefile.c_str()); *tempR = '\0'; strncat(tempR, templatefile.c_str(), templatefile.length()); cPara.push_back(tempR); + + char* tempIn = new char[8]; + *tempIn = '\0'; strncat(tempIn, "--input", 7); + //strcpy(tempIn, "--input"); + cPara.push_back(tempIn); + char* temp = new char[filename.length()+1]; + *temp = '\0'; strncat(temp, filename.c_str(), filename.length()); + //strcpy(temp, filename.c_str()); + cPara.push_back(temp); char* tempO = new char[12]; *tempO = '\0'; strncat(tempO, "--uchimeout", 11); @@ -715,6 +758,8 @@ static DWORD WINAPI MyUchimeSeqsThreadFunction(LPVOID lpParam){ for (int j = 0; j < cPara.size(); j++) { uchimeParameters[j] = cPara[j]; commandString += toString(cPara[j]) + " "; } //int numArgs = cPara.size(); + commandString = "\"" + commandString + "\""; + //uchime_main(numArgs, uchimeParameters); //cout << "commandString = " << commandString << endl; if 
(pDataArray->m->debug) { pDataArray->m->mothurOut("[DEBUG]: uchime command = " + commandString + ".\n"); } diff --git a/chopseqscommand.cpp b/chopseqscommand.cpp index 05037f6..4bcd707 100644 --- a/chopseqscommand.cpp +++ b/chopseqscommand.cpp @@ -14,7 +14,8 @@ vector ChopSeqsCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pnumbases("numbases", "Number", "", "0", "", "", "",false,true); parameters.push_back(pnumbases); + CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); + CommandParameter pnumbases("numbases", "Number", "", "0", "", "", "",false,true); parameters.push_back(pnumbases); CommandParameter pcountgaps("countgaps", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pcountgaps); CommandParameter pshort("short", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pshort); CommandParameter pkeep("keep", "Multiple", "front-back", "front", "", "", "",false,false); parameters.push_back(pkeep); @@ -41,7 +42,8 @@ string ChopSeqsCommand::getHelpString(){ helpString += "The keep parameter allows you to specify whether you want to keep the front or the back of your sequence, default=front.\n"; helpString += "The countgaps parameter allows you to specify whether you want to count gaps as bases, default=false.\n"; helpString += "The short parameter allows you to specify you want to keep sequences that are too short to chop, default=false.\n"; - helpString += "For example, if you ran chop.seqs with numbases=200 and short=t, if a sequence had 100 bases mothur would keep the sequence rather than eliminate it.\n"; + helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. 
\n"; + helpString += "For example, if you ran chop.seqs with numbases=200 and short=t, if a sequence had 100 bases mothur would keep the sequence rather than eliminate it.\n"; helpString += "Example chop.seqs(fasta=amazon.fasta, numbases=200, keep=front).\n"; helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n"; return helpString; @@ -143,6 +145,10 @@ ChopSeqsCommand::ChopSeqsCommand(string option) { string temp = validParameter.validFile(parameters, "numbases", false); if (temp == "not found") { temp = "0"; } m->mothurConvert(temp, numbases); + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } + m->setProcessors(temp); + m->mothurConvert(temp, processors); + temp = validParameter.validFile(parameters, "countgaps", false); if (temp == "not found") { temp = "f"; } countGaps = m->isTrue(temp); @@ -169,39 +175,32 @@ int ChopSeqsCommand::execute(){ string outputFileName = outputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("fasta"); string outputFileNameAccnos = outputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("accnos"); + + + vector positions; + vector lines; +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + positions = m->divideFile(fastafile, processors); + for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); } +#else + int numSeqs = 0; + positions = m->setFilePosFasta(fastafile, numSeqs); + if (positions.size() < processors) { processors = positions.size(); } - ofstream out; - m->openOutputFile(outputFileName, out); - - ofstream outAcc; - m->openOutputFile(outputFileNameAccnos, outAcc); - - ifstream in; - m->openInputFile(fastafile, in); - - bool wroteAccnos = false; - - while (!in.eof()) { - - Sequence seq(in); - - if (m->control_pressed) { outputTypes.clear(); in.close(); 
out.close(); outAcc.close(); m->mothurRemove(outputFileName); m->mothurRemove(outputFileNameAccnos); return 0; } - - if (seq.getName() != "") { - string newSeqString = getChopped(seq); - - //output trimmed sequence - if (newSeqString != "") { - out << ">" << seq.getName() << endl << newSeqString << endl; - }else{ - outAcc << seq.getName() << endl; - wroteAccnos = true; - } - } - } - in.close(); - out.close(); - outAcc.close(); + //figure out how many sequences you have to process + int numSeqsPerProcessor = numSeqs / processors; + for (int i = 0; i < processors; i++) { + int startIndex = i * numSeqsPerProcessor; + if(i == (processors - 1)){ numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor; } + lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor)); + } +#endif + + bool wroteAccnos = false; + if(processors == 1) { wroteAccnos = driver(lines[0], fastafile, outputFileName, outputFileNameAccnos); } + else { wroteAccnos = createProcesses(lines, fastafile, outputFileName, outputFileNameAccnos); } + + if (m->control_pressed) { return 0; } m->mothurOutEndLine(); m->mothurOut("Output File Name: "); m->mothurOutEndLine(); @@ -235,6 +234,202 @@ int ChopSeqsCommand::execute(){ exit(1); } } +/**************************************************************************************************/ +bool ChopSeqsCommand::createProcesses(vector lines, string filename, string outFasta, string outAccnos) { + try { + int process = 1; + bool wroteAccnos = false; + vector processIDS; + vector nonBlankAccnosFiles; + +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later + process++; + }else if (pid == 0){ + wroteAccnos = driver(lines[process], filename, outFasta + toString(getpid()) + 
".temp", outAccnos + toString(getpid()) + ".temp"); + + //pass numSeqs to parent + ofstream out; + string tempFile = fastafile + toString(getpid()) + ".bool.temp"; + m->openOutputFile(tempFile, out); + out << wroteAccnos << endl; + out.close(); + + exit(0); + }else { + m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); + for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); } + exit(0); + } + } + + //do your part + wroteAccnos = driver(lines[0], filename, outFasta, outAccnos); + + //force parent to wait until all the processes are done + for (int i=0;imothurRemove(outAccnos); } //remove so other files can be renamed to it + + //parent reads in and combine Filter info + for (int i = 0; i < processIDS.size(); i++) { + string tempFilename = fastafile + toString(processIDS[i]) + ".bool.temp"; + ifstream in; + m->openInputFile(tempFilename, in); + + bool temp; + in >> temp; m->gobble(in); + if (temp) { wroteAccnos = temp; nonBlankAccnosFiles.push_back(outAccnos + toString(processIDS[i]) + ".temp"); } + else { m->mothurRemove((outAccnos + toString(processIDS[i]) + ".temp")); } + + in.close(); + m->mothurRemove(tempFilename); + } +#else + ////////////////////////////////////////////////////////////////////////////////////////////////////// + //Windows version shared memory, so be careful when passing variables through the seqSumData struct. + //Above fork() will clone, so memory is separate, but that's not the case with windows, + //Taking advantage of shared memory to allow both threads to add info to vectors. + ////////////////////////////////////////////////////////////////////////////////////////////////////// + + vector pDataArray; + DWORD dwThreadIdArray[processors-1]; + HANDLE hThreadArray[processors-1]; + + //Create processor worker threads. + for( int i=0; imothurRemove(outAccnos); } //remove so other files can be renamed to it + + //Close all thread handles and free memory allocations. 
+ for(int i=0; i < pDataArray.size(); i++){ + if (pDataArray[i]->wroteAccnos) { wroteAccnos = pDataArray[i]->wroteAccnos; nonBlankAccnosFiles.push_back(outAccnos + toString(processIDS[i]) + ".temp"); } + else { m->mothurRemove((outAccnos + toString(processIDS[i]) + ".temp")); } + CloseHandle(hThreadArray[i]); + delete pDataArray[i]; + } +#endif + + for (int i = 0; i < processIDS.size(); i++) { + m->appendFiles((outFasta + toString(processIDS[i]) + ".temp"), outFasta); + m->mothurRemove((outFasta + toString(processIDS[i]) + ".temp")); + } + + if (nonBlankAccnosFiles.size() != 0) { + m->renameFile(nonBlankAccnosFiles[0], outAccnos); + + for (int h=1; h < nonBlankAccnosFiles.size(); h++) { + m->appendFiles(nonBlankAccnosFiles[h], outAccnos); + m->mothurRemove(nonBlankAccnosFiles[h]); + } + }else { //recreate the accnosfile if needed + ofstream out; + m->openOutputFile(outAccnos, out); + out.close(); + } + + return wroteAccnos; + } + catch(exception& e) { + m->errorOut(e, "ChopSeqsCommand", "createProcesses"); + exit(1); + } +} +/**************************************************************************************/ +bool ChopSeqsCommand::driver(linePair filePos, string filename, string outFasta, string outAccnos) { + try { + + ofstream out; + m->openOutputFile(outFasta, out); + + ofstream outAcc; + m->openOutputFile(outAccnos, outAcc); + + ifstream in; + m->openInputFile(filename, in); + + in.seekg(filePos.start); + + bool done = false; + bool wroteAccnos = false; + int count = 0; + + while (!done) { + + if (m->control_pressed) { in.close(); out.close(); return 1; } + + Sequence seq(in); m->gobble(in); + + if (m->control_pressed) { in.close(); out.close(); outAcc.close(); m->mothurRemove(outFasta); m->mothurRemove(outAccnos); return 0; } + + if (seq.getName() != "") { + string newSeqString = getChopped(seq); + + //output trimmed sequence + if (newSeqString != "") { + out << ">" << seq.getName() << endl << newSeqString << endl; + }else{ + outAcc << seq.getName() << 
endl; + wroteAccnos = true; + } + count++; + } + +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + unsigned long long pos = in.tellg(); + if ((pos == -1) || (pos >= filePos.end)) { break; } +#else + if (in.eof()) { break; } +#endif + //report progress + if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } + + } + //report progress + if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } + + + in.close(); + out.close(); + outAcc.close(); + + return wroteAccnos; + } + catch(exception& e) { + m->errorOut(e, "ChopSeqsCommand", "driver"); + exit(1); + } +} //********************************************************************************************************************** string ChopSeqsCommand::getChopped(Sequence seq) { try { diff --git a/chopseqscommand.h b/chopseqscommand.h index cc22c75..fa3f559 100644 --- a/chopseqscommand.h +++ b/chopseqscommand.h @@ -34,14 +34,235 @@ class ChopSeqsCommand : public Command { void help() { m->mothurOut(getHelpString()); } private: + struct linePair { + unsigned long long start; + unsigned long long end; + linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {} + }; + string fastafile, outputDir, keep; bool abort, countGaps, Short; - int numbases; + int numbases, processors; vector outputNames; string getChopped(Sequence); + bool driver (linePair, string, string, string); + bool createProcesses(vector, string, string, string); }; +/**************************************************************************************************/ +//custom data structure for threads to use. +// This is passed by void pointer so it can be any data type +// that can be passed using a single void pointer (LPVOID). 
+struct chopData { + string filename; + string outFasta, outAccnos, keep; + unsigned long long start; + unsigned long long end; + int numbases; + bool countGaps, Short, wroteAccnos; + MothurOut* m; + string namefile; + map nameMap; + + + chopData(){} + chopData(string f, string ff, string a, MothurOut* mout, unsigned long long st, unsigned long long en, string k, bool cGaps, int nbases, bool S) { + filename = f; + outFasta = ff; + outAccnos = a; + m = mout; + start = st; + end = en; + keep = k; + countGaps = cGaps; + numbases = nbases; + Short = S; + wroteAccnos = false; + } +}; + +/**************************************************************************************************/ +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) +#else +static DWORD WINAPI MyChopThreadFunction(LPVOID lpParam){ + chopData* pDataArray; + pDataArray = (chopData*)lpParam; + + try { + ofstream out; + pDataArray->m->openOutputFile(pDataArray->outFasta, out); + + ofstream outAcc; + pDataArray->m->openOutputFile(pDataArray->outAccnos, outAcc); + + ifstream in; + pDataArray->m->openInputFile(pDataArray->filename, in); + + if ((pDataArray->start == 0) || (pDataArray->start == 1)) { + in.seekg(0); + }else { //this accounts for the difference in line endings. 
+ in.seekg(pDataArray->start-1); pDataArray->m->gobble(in); + } + + bool done = false; + bool wroteAccnos = false; + int count = 0; + + for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process + + if (pDataArray->m->control_pressed) { in.close(); out.close(); outAcc.close(); pDataArray->m->mothurRemove(pDataArray->outFasta); pDataArray->m->mothurRemove(pDataArray->outAccnos); return 0; } + + Sequence seq(in); pDataArray->m->gobble(in); + + if (seq.getName() != "") { + //string newSeqString = getChopped(seq); + /////////////////////////////////////////////////////////////////////// + string temp = seq.getAligned(); + string tempUnaligned = seq.getUnaligned(); + + if (pDataArray->countGaps) { + //if needed trim sequence + if (pDataArray->keep == "front") {//you want to keep the beginning + int tempLength = temp.length(); + + if (tempLength > pDataArray->numbases) { //you have enough bases to remove some + + int stopSpot = 0; + int numBasesCounted = 0; + + for (int i = 0; i < temp.length(); i++) { + //eliminate N's + if (toupper(temp[i]) == 'N') { temp[i] = '.'; } + + numBasesCounted++; + + if (numBasesCounted >= pDataArray->numbases) { stopSpot = i; break; } + } + + if (stopSpot == 0) { temp = ""; } + else { temp = temp.substr(0, stopSpot+1); } + + }else { + if (!pDataArray->Short) { temp = ""; } //sequence too short + } + }else { //you are keeping the back + int tempLength = temp.length(); + if (tempLength > pDataArray->numbases) { //you have enough bases to remove some + + int stopSpot = 0; + int numBasesCounted = 0; + + for (int i = (temp.length()-1); i >= 0; i--) { + //eliminate N's + if (toupper(temp[i]) == 'N') { temp[i] = '.'; } + + numBasesCounted++; + + if (numBasesCounted >= pDataArray->numbases) { stopSpot = i; break; } + } + + if (stopSpot == 0) { temp = ""; } + else { temp = temp.substr(stopSpot+1); } + }else { + if (!pDataArray->Short) { temp = ""; } //sequence too short + } + } + + }else{ + + //if needed trim sequence + if 
(pDataArray->keep == "front") {//you want to keep the beginning + int tempLength = tempUnaligned.length(); + + if (tempLength > pDataArray->numbases) { //you have enough bases to remove some + + int stopSpot = 0; + int numBasesCounted = 0; + + for (int i = 0; i < temp.length(); i++) { + //eliminate N's + if (toupper(temp[i]) == 'N') { + temp[i] = '.'; + tempLength--; + if (tempLength < pDataArray->numbases) { stopSpot = 0; break; } + } + + if(isalpha(temp[i])) { numBasesCounted++; } + + if (numBasesCounted >= pDataArray->numbases) { stopSpot = i; break; } + } + + if (stopSpot == 0) { temp = ""; } + else { temp = temp.substr(0, stopSpot+1); } + + }else { + if (!pDataArray->Short) { temp = ""; } //sequence too short + } + }else { //you are keeping the back + int tempLength = tempUnaligned.length(); + if (tempLength > pDataArray->numbases) { //you have enough bases to remove some + + int stopSpot = 0; + int numBasesCounted = 0; + + for (int i = (temp.length()-1); i >= 0; i--) { + //eliminate N's + if (toupper(temp[i]) == 'N') { + temp[i] = '.'; + tempLength--; + if (tempLength < pDataArray->numbases) { stopSpot = 0; break; } + } + + if(isalpha(temp[i])) { numBasesCounted++; } + + if (numBasesCounted >= pDataArray->numbases) { stopSpot = i; break; } + } + + if (stopSpot == 0) { temp = ""; } + else { temp = temp.substr(stopSpot); } + }else { + if (!pDataArray->Short) { temp = ""; } //sequence too short + } + } + } + + string newSeqString = temp; + /////////////////////////////////////////////////////////////////////// + + //output trimmed sequence + if (newSeqString != "") { + out << ">" << seq.getName() << endl << newSeqString << endl; + }else{ + outAcc << seq.getName() << endl; + pDataArray->wroteAccnos = true; + } + count++; + } + //report progress + if((count) % 1000 == 0){ pDataArray->m->mothurOut(toString(count)); pDataArray->m->mothurOutEndLine(); } + + } + //report progress + if((count) % 1000 != 0){ pDataArray->m->mothurOut(toString(count)); 
pDataArray->m->mothurOutEndLine(); } + + + in.close(); + out.close(); + outAcc.close(); + + return 0; + + } + catch(exception& e) { + pDataArray->m->errorOut(e, "ChopsSeqsCommand", "MyChopThreadFunction"); + exit(1); + } +} +#endif + + + #endif diff --git a/classify.cpp b/classify.cpp index 212e563..8aa3cdb 100644 --- a/classify.cpp +++ b/classify.cpp @@ -61,7 +61,8 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me names.push_back(temp.getName()); database->addSequence(temp); } - database->generateDB(); + if ((method == "kmer") && (!shortcuts)) {;} //don't print + else {database->generateDB(); } }else if ((method == "kmer") && (!needToGenerate)) { ifstream kmerFileTest(kmerDBName.c_str()); database->readKmerDB(kmerFileTest); @@ -200,7 +201,8 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me } fastaFile.close(); - database->generateDB(); + if ((method == "kmer") && (!shortcuts)) {;} //don't print + else {database->generateDB(); } }else if ((method == "kmer") && (!needToGenerate)) { ifstream kmerFileTest(kmerDBName.c_str()); @@ -260,9 +262,6 @@ int Classify::readTaxonomy(string file) { MPI_File inMPI; MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are MPI_Comm_size(MPI_COMM_WORLD, &processors); - - //char* inFileName = new char[file.length()]; - //memcpy(inFileName, file.c_str(), file.length()); char inFileName[1024]; strcpy(inFileName, file.c_str()); @@ -355,3 +354,37 @@ vector Classify::parseTax(string tax) { } /**************************************************************************************************/ +double Classify::getLogExpSum(vector probabilities, int& maxIndex){ + try { + // http://jblevins.org/notes/log-sum-exp + + double maxProb = probabilities[0]; + maxIndex = 0; + + int numProbs = (int)probabilities.size(); + + for(int i=1;i= maxProb){ + maxProb = probabilities[i]; + maxIndex = i; + } + } + + double probSum = 0.0000; + + for(int i=0;ierrorOut(e, "Classify", 
"getLogExpSum"); + exit(1); + } +} + +/**************************************************************************************************/ + diff --git a/classify.h b/classify.h index 4e03547..7b4c102 100644 --- a/classify.h +++ b/classify.h @@ -17,10 +17,8 @@ #include "database.hpp" #include "phylotree.h" - class Sequence; - /**************************************************************************************************/ class Classify { @@ -37,7 +35,6 @@ public: protected: map taxonomy; //name maps to taxonomy - //map genusCount; //maps genus to count - in essence a list of how many seqs are in each taxonomy map::iterator itTax; map::iterator it; Database* database; @@ -45,11 +42,12 @@ protected: string taxFile, templateFile, simpleTax; vector names; - int threadID; - bool flip, flipped; + int threadID, numLevels, numTaxa; + bool flip, flipped, shortcuts; int readTaxonomy(string); vector parseTax(string); + double getLogExpSum(vector, int&); MothurOut* m; }; diff --git a/classifyotucommand.cpp b/classifyotucommand.cpp index 00ae690..660d53c 100644 --- a/classifyotucommand.cpp +++ b/classifyotucommand.cpp @@ -17,8 +17,9 @@ vector ClassifyOtuCommand::setParameters(){ CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist); CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptaxonomy); CommandParameter preftaxonomy("reftaxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(preftaxonomy); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", 
"", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); CommandParameter pbasis("basis", "Multiple", "otu-sequence", "otu", "", "", "",false,false); parameters.push_back(pbasis); CommandParameter pcutoff("cutoff", "Number", "", "51", "", "", "",false,true); parameters.push_back(pcutoff); @@ -39,11 +40,12 @@ vector ClassifyOtuCommand::setParameters(){ string ClassifyOtuCommand::getHelpString(){ try { string helpString = ""; - helpString += "The classify.otu command parameters are list, taxonomy, reftaxonomy, name, group, cutoff, label, basis and probs. The taxonomy and list parameters are required unless you have a valid current file.\n"; + helpString += "The classify.otu command parameters are list, taxonomy, reftaxonomy, name, group, count, cutoff, label, basis and probs. The taxonomy and list parameters are required unless you have a valid current file.\n"; helpString += "The reftaxonomy parameter allows you give the name of the reference taxonomy file used when you classified your sequences. Providing it will keep the rankIDs in the summary file static.\n"; helpString += "The name parameter allows you add a names file with your taxonomy file.\n"; helpString += "The group parameter allows you provide a group file to use in creating the summary file breakdown.\n"; - helpString += "The basis parameter allows you indicate what you want the summary file to represent, options are otu and sequence. Default is otu.\n"; + helpString += "The count parameter allows you add a count file associated with your list file. 
When using the count parameter mothur assumes your list file contains only uniques.\n"; + helpString += "The basis parameter allows you indicate what you want the summary file to represent, options are otu and sequence. Default is otu.\n"; helpString += "For example consider the following basis=sequence could give Clostridiales 3 105 16 43 46, where 105 is the total number of sequences whose otu classified to Clostridiales.\n"; helpString += "16 is the number of sequences in the otus from groupA, 43 is the number of sequences in the otus from groupB, and 46 is the number of sequences in the otus from groupC.\n"; helpString += "Now for basis=otu could give Clostridiales 3 7 6 1 2, where 7 is the number of otus that classified to Clostridiales.\n"; @@ -172,6 +174,14 @@ ClassifyOtuCommand::ClassifyOtuCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["group"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -211,6 +221,20 @@ ClassifyOtuCommand::ClassifyOtuCommand(string option) { if (groupfile == "not open") { abort = true; } else if (groupfile == "not found") { groupfile = ""; } else { m->setGroupFile(groupfile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + //check for optional parameter and set defaults // ...at some point should added some additional type checking... @@ -235,10 +259,12 @@ ClassifyOtuCommand::ClassifyOtuCommand(string option) { if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true; } - if (namefile == ""){ - vector files; files.push_back(taxfile); - parser.getNameFile(files); - } + if (countfile == "") { + if (namefile == ""){ + vector files; files.push_back(taxfile); + parser.getNameFile(files); + } + } } } @@ -255,7 +281,11 @@ int ClassifyOtuCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } //if user gave a namesfile then use it - if (namefile != "") { m->readNames(namefile, nameMap, true); } + if (namefile != "") { m->readNames(namefile, nameMap, true); } + if (groupfile != "") { groupMap = new GroupMap(groupfile); groupMap->readMap(); } + else { groupMap = NULL; } + if (countfile != "") { ct = new CountTable(); ct->readTable(countfile); } + else { ct = NULL; } //read taxonomy file and save in map for easy access in building 
bin trees m->readTax(taxfile, taxMap); @@ -270,7 +300,7 @@ int ClassifyOtuCommand::execute(){ set processedLabels; set userLabels = labels; - if (m->control_pressed) { outputTypes.clear(); delete input; delete list; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete input; delete list; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { @@ -278,7 +308,7 @@ int ClassifyOtuCommand::execute(){ m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine(); process(list); - if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete input; delete list; return 0; } + if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete input; delete list; return 0; } processedLabels.insert(list->getLabel()); userLabels.erase(list->getLabel()); @@ -293,7 +323,7 @@ int ClassifyOtuCommand::execute(){ process(list); - if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete input; delete list; return 0; } + if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete input; delete list; return 0; } processedLabels.insert(list->getLabel()); userLabels.erase(list->getLabel()); @@ -329,10 +359,12 @@ int ClassifyOtuCommand::execute(){ process(list); delete list; - if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < 
outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete input; delete list; return 0; } + if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete input; delete list; return 0; } } delete input; + if (groupMap != NULL) { delete groupMap; } + if (ct != NULL) { delete ct; } if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } @@ -400,10 +432,16 @@ vector ClassifyOtuCommand::findConsensusTaxonomy(int bin, ListVector* th if (it == taxMap.end()) { //this name is not in taxonomy file, skip it m->mothurOut(names[i] + " is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine(); }else{ + if (countfile != "") { + int numDups = ct->getNumSeqs(names[i]); + for (int j = 0; j < numDups; j++) { phylo->addSeqToTree(names[i], it->second); } + size += numDups; + }else{ //add seq to tree - phylo->addSeqToTree(names[i], it->second); - size++; - allNames.push_back(names[i]); + phylo->addSeqToTree(names[i], it->second); + size++; + } + allNames.push_back(names[i]); } } @@ -486,24 +524,25 @@ int ClassifyOtuCommand::process(ListVector* processList) { if (outputDir == "") { outputDir += m->hasPath(listfile); } ofstream out; - string outputFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + getOutputFileNameTag("constaxonomy"); + string outputFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." 
+getOutputFileNameTag("constaxonomy"); m->openOutputFile(outputFile, out); outputNames.push_back(outputFile); outputTypes["constaxonomy"].push_back(outputFile); ofstream outSum; - string outputSumFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + getOutputFileNameTag("taxsummary"); + string outputSumFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." +getOutputFileNameTag("taxsummary"); m->openOutputFile(outputSumFile, outSum); outputNames.push_back(outputSumFile); outputTypes["taxsummary"].push_back(outputSumFile); out << "OTU\tSize\tTaxonomy" << endl; PhyloSummary* taxaSum; - if (refTaxonomy != "") { - taxaSum = new PhyloSummary(refTaxonomy, groupfile); + if (countfile != "") { + if (refTaxonomy != "") { taxaSum = new PhyloSummary(refTaxonomy, ct); } + else { taxaSum = new PhyloSummary(ct); } }else { - taxaSum = new PhyloSummary(groupfile); - } - + if (refTaxonomy != "") { taxaSum = new PhyloSummary(refTaxonomy, groupMap); } + else { taxaSum = new PhyloSummary(groupMap); } + } //for each bin in the list vector string snumBins = toString(processList->getNumBins()); @@ -534,7 +573,34 @@ int ClassifyOtuCommand::process(ListVector* processList) { if (basis == "sequence") { for(int j = 0; j < names.size(); j++) { taxaSum->addSeqToTree(names[j], noConfidenceConTax); } }else { //otu - taxaSum->addSeqToTree(noConfidenceConTax, names); + map containsGroup; + if (countfile != "") { + if (ct->hasGroupInfo()) { + vector mGroups = ct->getNamesOfGroups(); + for (int k = 0; k < names.size(); k++) { + vector counts = ct->getGroupCounts(names[k]); + for (int h = 0; h < counts.size(); h++) { + if (counts[h] != 0) { containsGroup[mGroups[h]] = true; } + } + } + } + }else { + if (groupfile != "") { + vector mGroups = groupMap->getNamesOfGroups(); + for (int j = 0; j < mGroups.size(); j++) { containsGroup[mGroups[j]] = false; } + + for (int k = 0; k < names.size(); k++) { + //find out the sequences group 
+ string group = groupMap->getGroup(names[k]); + + if (group == "not found") { m->mothurOut("[WARNING]: " + names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); } + else { + containsGroup[group] = true; + } + } + } + } + taxaSum->addSeqToTree(noConfidenceConTax, containsGroup); } } diff --git a/classifyotucommand.h b/classifyotucommand.h index 36a0328..2e76057 100644 --- a/classifyotucommand.h +++ b/classifyotucommand.h @@ -13,6 +13,7 @@ #include "command.hpp" #include "listvector.hpp" #include "inputdata.h" +#include "counttable.h" class ClassifyOtuCommand : public Command { @@ -34,10 +35,11 @@ public: void help() { m->mothurOut(getHelpString()); } private: - + GroupMap* groupMap; + CountTable* ct; ListVector* list; InputData* input; - string listfile, namefile, taxfile, label, outputDir, refTaxonomy, groupfile, basis; + string listfile, namefile, taxfile, label, outputDir, refTaxonomy, groupfile, basis, countfile; bool abort, allLines, probs; int cutoff; set labels; //holds labels to be used diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index c76b047..36bccc2 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -17,11 +17,13 @@ vector ClassifySeqsCommand::setParameters(){ CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptaxonomy); CommandParameter ptemplate("reference", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptemplate); CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); - CommandParameter psearch("search", "Multiple", 
"kmer-blast-suffix-distance", "kmer", "", "", "",false,false); parameters.push_back(psearch); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); + + CommandParameter psearch("search", "Multiple", "kmer-blast-suffix-distance-align", "kmer", "", "", "",false,false); parameters.push_back(psearch); CommandParameter pksize("ksize", "Number", "", "8", "", "", "",false,false); parameters.push_back(pksize); - CommandParameter pmethod("method", "Multiple", "bayesian-knn", "bayesian", "", "", "",false,false); parameters.push_back(pmethod); + CommandParameter pmethod("method", "Multiple", "wang-knn-zap", "wang", "", "", "",false,false); parameters.push_back(pmethod); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); CommandParameter pmatch("match", "Number", "", "1.0", "", "", "",false,false); parameters.push_back(pmatch); CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pmismatch); @@ -32,6 +34,7 @@ vector ClassifySeqsCommand::setParameters(){ CommandParameter pprobs("probs", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pprobs); CommandParameter piters("iters", "Number", "", "100", "", "", "",false,true); parameters.push_back(piters); CommandParameter psave("save", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(psave); + CommandParameter pshortcuts("shortcuts", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pshortcuts); CommandParameter pnumwanted("numwanted", "Number", "", "10", "", "", "",false,true); parameters.push_back(pnumwanted); 
CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -50,12 +53,13 @@ string ClassifySeqsCommand::getHelpString(){ try { string helpString = ""; helpString += "The classify.seqs command reads a fasta file containing sequences and creates a .taxonomy file and a .tax.summary file.\n"; - helpString += "The classify.seqs command parameters are reference, fasta, name, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n"; + helpString += "The classify.seqs command parameters are reference, fasta, name, group, count, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n"; helpString += "The reference, fasta and taxonomy parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n"; - helpString += "The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer, blast and distance. The default is kmer.\n"; + helpString += "The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer, blast, align and distance. The default is kmer.\n"; helpString += "The name parameter allows you add a names file with your fasta file, if you enter multiple fasta files, you must enter matching names files for them.\n"; helpString += "The group parameter allows you add a group file so you can have the summary totals broken up by group.\n"; - helpString += "The method parameter allows you to specify classification method to use. Your options are: bayesian and knn. 
The default is bayesian.\n"; + helpString += "The count parameter allows you add a count file so you can have the summary totals broken up by group.\n"; + helpString += "The method parameter allows you to specify classification method to use. Your options are: wang, knn and zap. The default is wang.\n"; helpString += "The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n"; helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n"; #ifdef USE_MPI @@ -68,8 +72,8 @@ string ClassifySeqsCommand::getHelpString(){ helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -1.0.\n"; helpString += "The numwanted parameter allows you to specify the number of sequence matches you want with the knn method. The default is 10.\n"; helpString += "The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy. The default is 0.\n"; - helpString += "The probs parameter shuts off the bootstrapping results for the bayesian method. The default is true, meaning you want the bootstrapping to be shown.\n"; - helpString += "The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the bayesian method. The default is 100.\n"; + helpString += "The probs parameter shuts off the bootstrapping results for the wang and zap method. The default is true, meaning you want the bootstrapping to be shown.\n"; + helpString += "The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the wang method. 
The default is 100.\n"; //helpString += "The flip parameter allows you shut off mothur's The default is T.\n"; helpString += "The classify.seqs command should be in the following format: \n"; helpString += "classify.seqs(reference=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n"; @@ -127,7 +131,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(){ ClassifySeqsCommand::ClassifySeqsCommand(string option) { try { abort = false; calledHelp = false; - rdb = ReferenceDB::getInstance(); + rdb = ReferenceDB::getInstance(); hasName = false; hasCount=false; //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } @@ -185,6 +189,14 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["group"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { parameters["count"] = inputDir + it->second; } + } } fastaFileName = validParameter.validFile(parameters, "fasta", false); @@ -333,11 +345,90 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { } } } - + + if (namefileNames.size() != 0) { hasName = true; } + if (namefile != "") { if (namefileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a name file, you must have one for each fasta file."); m->mothurOutEndLine(); } } + //check for required parameters + countfile = validParameter.validFile(parameters, "count", false); + if (countfile == "not found") { + countfile = ""; + }else { + m->splitAtDash(countfile, countfileNames); + + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < countfileNames.size(); i++) { + + bool ignore = false; + if (countfileNames[i] == "current") { + countfileNames[i] = m->getCountTableFile(); + if (countfileNames[i] != "") { m->mothurOut("Using " + countfileNames[i] + " as input file for the count parameter where you had given current."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current count file, ignoring current."); m->mothurOutEndLine(); ignore=true; + //erase from file list + countfileNames.erase(countfileNames.begin()+i); + i--; + } + } + + if (!ignore) { + + if (inputDir != "") { + string path = m->hasPath(countfileNames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { countfileNames[i] = inputDir + countfileNames[i]; } + } + + int ableToOpen; + ifstream in; + + ableToOpen = m->openInputFile(countfileNames[i], in, "noerror"); + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(countfileNames[i]); + m->mothurOut("Unable to open " + countfileNames[i] + ". 
Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + countfileNames[i] = tryPath; + } + } + + if (ableToOpen == 1) { + if (m->getOutputDir() != "") { //output directory is set + string tryPath = m->getOutputDir() + m->getSimpleName(countfileNames[i]); + m->mothurOut("Unable to open " + countfileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + countfileNames[i] = tryPath; + } + } + + in.close(); + + if (ableToOpen == 1) { + m->mothurOut("Unable to open " + countfileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); + //erase from file list + countfileNames.erase(countfileNames.begin()+i); + i--; + }else { + m->setCountTableFile(countfileNames[i]); + } + } + } + } + + if (countfileNames.size() != 0) { hasCount = true; if (countfileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a count file, you must have one for each fasta file."); m->mothurOutEndLine(); } } + + //make sure there is at least one valid file left + if (hasName && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } + groupfile = validParameter.validFile(parameters, "group", false); if (groupfile == "not found") { groupfile = ""; } else { @@ -393,6 +484,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { if (groupfile != "") { if (groupfileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a group file, you must have one for each fasta file."); m->mothurOutEndLine(); } + if (hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or group."); m->mothurOutEndLine(); abort = true; } }else { for (int i = 0; i < fastaFileNames.size(); i++) { groupfileNames.push_back(""); } } @@ -400,9 +492,6 @@
ClassifySeqsCommand::ClassifySeqsCommand(string option) { //check for optional parameter and set defaults // ...at some point should added some additional type checking... string temp; - temp = validParameter.validFile(parameters, "ksize", false); if (temp == "not found"){ temp = "8"; } - m->mothurConvert(temp, kmerSize); - temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); m->mothurConvert(temp, processors); @@ -444,7 +533,13 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { search = validParameter.validFile(parameters, "search", false); if (search == "not found"){ search = "kmer"; } - method = validParameter.validFile(parameters, "method", false); if (method == "not found"){ method = "bayesian"; } + method = validParameter.validFile(parameters, "method", false); if (method == "not found"){ method = "wang"; } + + temp = validParameter.validFile(parameters, "ksize", false); if (temp == "not found"){ + temp = "8"; + if (method == "zap") { temp = "7"; } + } + m->mothurConvert(temp, kmerSize); temp = validParameter.validFile(parameters, "match", false); if (temp == "not found"){ temp = "1.0"; } m->mothurConvert(temp, match); @@ -466,6 +561,9 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { temp = validParameter.validFile(parameters, "probs", false); if (temp == "not found"){ temp = "true"; } probs = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "shortcuts", false); if (temp == "not found"){ temp = "true"; } + writeShortcuts = m->isTrue(temp); //temp = validParameter.validFile(parameters, "flip", false); if (temp == "not found"){ temp = "T"; } //flip = m->isTrue(temp); @@ -475,16 +573,23 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { m->mothurConvert(temp, iters); - if ((method == "bayesian") && (search != "kmer")) { - m->mothurOut("The bayesian method requires the kmer search." + search + "will be disregarded." 
); m->mothurOutEndLine(); + if ((method == "wang") && (search != "kmer")) { + m->mothurOut("The wang method requires the kmer search. " + search + " will be disregarded, and kmer will be used." ); m->mothurOutEndLine(); + search = "kmer"; + } + + if ((method == "zap") && ((search != "kmer") && (search != "align"))) { + m->mothurOut("The zap method requires the kmer or align search. " + search + " will be disregarded, and kmer will be used." ); m->mothurOutEndLine(); search = "kmer"; } if (!abort) { - if (namefileNames.size() == 0){ - if (fastaFileNames.size() != 0) { - vector files; files.push_back(fastaFileNames[fastaFileNames.size()-1]); - parser.getNameFile(files); + if (!hasCount) { + if (namefileNames.size() == 0){ + if (fastaFileNames.size() != 0) { + vector files; files.push_back(fastaFileNames[fastaFileNames.size()-1]); + parser.getNameFile(files); + } } } } @@ -508,12 +613,18 @@ int ClassifySeqsCommand::execute(){ try { if (abort == true) { if (calledHelp) { return 0; } return 2; } - if(method == "bayesian"){ classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip); } + string outputMethodTag = method + "."; + if(method == "wang"){ classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip, writeShortcuts); } else if(method == "knn"){ classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted, rand()); } + else if(method == "zap"){ + outputMethodTag = search + "_" + outputMethodTag; + if (search == "kmer") { classify = new KmerTree(templateFileName, taxonomyFileName, kmerSize, cutoff); } + else { classify = new AlignTree(templateFileName, taxonomyFileName, cutoff); } + } else { - m->mothurOut(search + " is not a valid method option. I will run the command using bayesian."); + m->mothurOut(search + " is not a valid method option. 
I will run the command using wang."); m->mothurOutEndLine(); - classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip); + classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip, writeShortcuts); } if (m->control_pressed) { delete classify; return 0; } @@ -522,7 +633,7 @@ int ClassifySeqsCommand::execute(){ m->mothurOut("Classifying sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine(); - string baseTName = taxonomyFileName; + string baseTName = m->getSimpleName(taxonomyFileName); if (taxonomyFileName == "saved") {baseTName = rdb->getSavedTaxonomy(); } //set rippedTaxName to @@ -536,10 +647,10 @@ int ClassifySeqsCommand::execute(){ if (RippedTaxName != "") { RippedTaxName += "."; } if (outputDir == "") { outputDir += m->hasPath(fastaFileNames[s]); } - string newTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + getOutputFileNameTag("taxonomy"); - string newaccnosFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + getOutputFileNameTag("accnos"); + string newTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + outputMethodTag + getOutputFileNameTag("taxonomy"); + string newaccnosFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + outputMethodTag +getOutputFileNameTag("accnos"); string tempTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "taxonomy.temp"; - string taxSummary = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + getOutputFileNameTag("taxsummary"); + string taxSummary = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + outputMethodTag + getOutputFileNameTag("taxsummary"); if ((method == "knn") && (search == "distance")) { string DistName = outputDir + 
m->getRootName(m->getSimpleName(fastaFileNames[s])) + getOutputFileNameTag("matchdist"); @@ -694,46 +805,58 @@ int ClassifySeqsCommand::execute(){ } #endif - string group = ""; - if (groupfile != "") { group = groupfileNames[s]; } - - PhyloSummary taxaSum(baseTName, group); - - if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } - - if (namefile == "") { taxaSum.summarize(tempTaxonomyFile); } - else { - ifstream in; - m->openInputFile(tempTaxonomyFile, in); - - //read in users taxonomy file and add sequences to tree - string name, taxon; - - while(!in.eof()){ - in >> name >> taxon; m->gobble(in); - - itNames = nameMap.find(name); - - if (itNames == nameMap.end()) { - m->mothurOut(name + " is not in your name file please correct."); m->mothurOutEndLine(); exit(1); - }else{ - for (int i = 0; i < itNames->second.size(); i++) { - taxaSum.addSeqToTree(itNames->second[i], taxon); //add it as many times as there are identical seqs - } - itNames->second.clear(); - nameMap.erase(itNames->first); - } - } - in.close(); - } + string group = ""; + GroupMap* groupMap = NULL; + CountTable* ct = NULL; + PhyloSummary* taxaSum; + if (hasCount) { + ct = new CountTable(); + ct->readTable(countfileNames[s]); + taxaSum = new PhyloSummary(taxonomyFileName, ct); + taxaSum->summarize(tempTaxonomyFile); + }else { + if (groupfile != "") { group = groupfileNames[s]; groupMap = new GroupMap(group); groupMap->readMap(); } + + taxaSum = new PhyloSummary(taxonomyFileName, groupMap); + + if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete taxaSum; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } + + if (namefile == "") { taxaSum->summarize(tempTaxonomyFile); } + else { + ifstream in; + m->openInputFile(tempTaxonomyFile, in); + + //read in users taxonomy file 
and add sequences to tree + string name, taxon; + + while(!in.eof()){ + if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete taxaSum; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } + + in >> name >> taxon; m->gobble(in); + + itNames = nameMap.find(name); + + if (itNames == nameMap.end()) { + m->mothurOut(name + " is not in your name file please correct."); m->mothurOutEndLine(); exit(1); + }else{ + for (int i = 0; i < itNames->second.size(); i++) { + taxaSum->addSeqToTree(itNames->second[i], taxon); //add it as many times as there are identical seqs + } + itNames->second.clear(); + nameMap.erase(itNames->first); + } + } + in.close(); + } + } m->mothurRemove(tempTaxonomyFile); - if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } + if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } //print summary file ofstream outTaxTree; m->openOutputFile(taxSummary, outTaxTree); - taxaSum.print(outTaxTree); + taxaSum->print(outTaxTree); outTaxTree.close(); //output taxonomy with the unclassified bins added @@ -745,12 +868,12 @@ int ClassifySeqsCommand::execute(){ m->openOutputFile(unclass, outTax); //get maxLevel from phylotree so you know how many 'unclassified's to add - int maxLevel = taxaSum.getMaxLevel(); + int maxLevel = taxaSum->getMaxLevel(); //read taxfile - this reading and rewriting is done to preserve the confidence scores. 
string name, taxon; while (!inTax.eof()) { - if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } m->mothurRemove(unclass); delete classify; return 0; } + if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete taxaSum; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } m->mothurRemove(unclass); delete classify; return 0; } inTax >> name >> taxon; m->gobble(inTax); @@ -761,6 +884,8 @@ int ClassifySeqsCommand::execute(){ inTax.close(); outTax.close(); + if (ct != NULL) { delete ct; } + if (groupMap != NULL) { delete groupMap; } delete taxaSum; m->mothurRemove(newTaxonomyFile); rename(unclass.c_str(), newTaxonomyFile.c_str()); @@ -897,7 +1022,7 @@ int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, string extension = ""; if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); } - classifyData* tempclass = new classifyData((accnos + extension), probs, method, templateFileName, taxonomyFileName, (taxFileName + extension), (tempTaxFile + extension), filename, search, kmerSize, iters, numWanted, m, lines[i]->start, lines[i]->end, match, misMatch, gapOpen, gapExtend, cutoff, i, flip); + classifyData* tempclass = new classifyData((accnos + extension), probs, method, templateFileName, taxonomyFileName, (taxFileName + extension), (tempTaxFile + extension), filename, search, kmerSize, iters, numWanted, m, lines[i]->start, lines[i]->end, match, misMatch, gapOpen, gapExtend, cutoff, i, flip, writeShortcuts); pDataArray.push_back(tempclass); //MySeqSumThreadFunction is in header. It must be global or static to work with the threads. 
diff --git a/classifyseqscommand.h b/classifyseqscommand.h index 4965642..0cffec6 100644 --- a/classifyseqscommand.h +++ b/classifyseqscommand.h @@ -19,9 +19,11 @@ #include "phylotree.h" #include "phylosummary.h" #include "knn.h" +#include "kmertree.h" +#include "aligntree.h" -//KNN and Bayesian methods modeled from algorithms in +//KNN and Wang methods modeled from algorithms in //Naı¨ve Bayesian Classifier for Rapid Assignment of rRNA Sequences //into the New Bacterial Taxonomy􏰎† //Qiong Wang,1 George M. Garrity,1,2 James M. Tiedje,1,2 and James R. Cole1* @@ -62,6 +64,7 @@ private: vector lines; vector fastaFileNames; vector namefileNames; + vector countfileNames; vector groupfileNames; vector outputNames; map > nameMap; @@ -70,10 +73,10 @@ private: Classify* classify; ReferenceDB* rdb; - string fastaFileName, templateFileName, distanceFileName, namefile, search, method, taxonomyFileName, outputDir, groupfile; + string fastaFileName, templateFileName, countfile, distanceFileName, namefile, search, method, taxonomyFileName, outputDir, groupfile; int processors, kmerSize, numWanted, cutoff, iters; float match, misMatch, gapOpen, gapExtend; - bool abort, probs, save, flip; + bool abort, probs, save, flip, hasName, hasCount, writeShortcuts; int driver(linePair*, string, string, string, string); int createProcesses(string, string, string, string); @@ -99,10 +102,10 @@ struct classifyData { MothurOut* m; float match, misMatch, gapOpen, gapExtend; int count, kmerSize, threadID, cutoff, iters, numWanted; - bool probs, flip; + bool probs, flip, writeShortcuts; classifyData(){} - classifyData(string acc, bool p, string me, string te, string tx, string a, string r, string f, string se, int ks, int i, int numW, MothurOut* mout, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int cut, int tid, bool fli) { + classifyData(string acc, bool p, string me, string te, string tx, string a, string r, string f, string se, int ks, int i, int 
numW, MothurOut* mout, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int cut, int tid, bool fli, bool wsh) { accnos = acc; taxonomyFileName = tx; templateFileName = te; @@ -126,6 +129,7 @@ struct classifyData { probs = p; count = 0; flip = fli; + writeShortcuts = wsh; } }; @@ -162,12 +166,17 @@ static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){ //make classify Classify* myclassify; - if(pDataArray->method == "bayesian"){ myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip); } + if(pDataArray->method == "bayesian"){ myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip, pDataArray->writeShortcuts); } else if(pDataArray->method == "knn"){ myclassify = new Knn(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID); } + else if(pDataArray->method == "zap"){ + outputMethodTag = search + "_" + outputMethodTag; + if (pDataArray->search == "kmer") { myclassify = new KmerTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->kmerSize, pDataArray->cutoff); } + else { myclassify = new AlignTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->cutoff); } + } else { pDataArray->m->mothurOut(pDataArray->search + " is not a valid method option. 
I will run the command using bayesian."); pDataArray->m->mothurOutEndLine(); - myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip); + myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip, pDataArray->writeShortcuts); } if (pDataArray->m->control_pressed) { delete myclassify; return 0; } diff --git a/classifysharedcommand.cpp b/classifysharedcommand.cpp new file mode 100755 index 0000000..f964937 --- /dev/null +++ b/classifysharedcommand.cpp @@ -0,0 +1,364 @@ +// +// classifysharedcommand.cpp +// Mothur +// +// Created by Abu Zaher Md. Faridee on 8/13/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. +// + +#include "classifysharedcommand.h" +#include "randomforest.hpp" +#include "decisiontree.hpp" +#include "rftreenode.hpp" + +//********************************************************************************************************************** +vector ClassifySharedCommand::setParameters(){ + try { + //CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); + CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pshared); + CommandParameter pdesign("design", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pdesign); + CommandParameter potupersplit("otupersplit", "Multiple", "log2-squareroot", "log2", "", "", "",false,false); parameters.push_back(potupersplit); + CommandParameter psplitcriteria("splitcriteria", "Multiple", "gainratio-infogain", "gainratio", "", "", "",false,false); parameters.push_back(psplitcriteria); + CommandParameter pnumtrees("numtrees", "Number", "", "100", "", "", 
"",false,false); parameters.push_back(pnumtrees); + + CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); + CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); + + vector myArray; + for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } + return myArray; + } + catch(exception& e) { + m->errorOut(e, "ClassifySharedCommand", "setParameters"); + exit(1); + } +} +//********************************************************************************************************************** +string ClassifySharedCommand::getHelpString(){ + try { + string helpString = ""; + helpString += "The classify.shared command allows you to ....\n"; + helpString += "The classify.shared command parameters are: shared, design, label, groups, otupersplit.\n"; + helpString += "The label parameter is used to analyze specific labels in your input.\n"; + helpString += "The groups parameter allows you to specify which of the groups in your designfile you would like analyzed.\n"; + helpString += "The classify.shared should be in the following format: \n"; + helpString += "classify.shared(shared=yourSharedFile, design=yourDesignFile)\n"; + return helpString; + } + catch(exception& e) { + m->errorOut(e, "ClassifySharedCommand", "getHelpString"); + exit(1); + } +} +//********************************************************************************************************************** +string ClassifySharedCommand::getOutputFileNameTag(string type, string inputName=""){ + try { + string tag = ""; + map >::iterator it; + + //is this a type this command creates + it = outputTypes.find(type); + if (it == outputTypes.end()) { 
m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); } + else { + if (type == "summary") { tag = "summary"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } + } + return tag; + } + catch(exception& e) { + m->errorOut(e, "ClassifySharedCommand", "getOutputFileName"); + exit(1); + } +} +//********************************************************************************************************************** + +ClassifySharedCommand::ClassifySharedCommand() { + try { + abort = true; calledHelp = true; + setParameters(); + vector tempOutNames; + outputTypes["summary"] = tempOutNames; + } + catch(exception& e) { + m->errorOut(e, "ClassifySharedCommand", "ClassifySharedCommand"); + exit(1); + } +} +//********************************************************************************************************************** +ClassifySharedCommand::ClassifySharedCommand(string option) { + try { + abort = false; calledHelp = false; + allLines = 1; + + //allow user to run help + if(option == "help") { help(); abort = true; calledHelp = true; } + else if(option == "citation") { citation(); abort = true; calledHelp = true;} + + else { + //valid paramters for this command + vector myArray = setParameters(); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + map::iterator it; + //check to make sure all parameters are valid for command + for (it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + vector tempOutNames; + outputTypes["summary"] = tempOutNames; + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it 
= parameters.find("shared"); + //user has given a shared file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["shared"] = inputDir + it->second; } + } + + it = parameters.find("design"); + //user has given a design file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["design"] = inputDir + it->second; } + } + + } + + //check for parameters + //get shared file, it is required + sharedfile = validParameter.validFile(parameters, "shared", true); + if (sharedfile == "not open") { sharedfile = ""; abort = true; } + else if (sharedfile == "not found") { + //if there is a current shared file, use it + sharedfile = m->getSharedFile(); + if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); } + else { m->mothurOut("You have no current sharedfile and the shared parameter is required."); m->mothurOutEndLine(); abort = true; } + }else { m->setSharedFile(sharedfile); } + + //get design file, it is required + designfile = validParameter.validFile(parameters, "design", true); + if (designfile == "not open") { designfile = ""; abort = true; } + else if (designfile == "not found") { + //if there is a current design file, use it + designfile = m->getDesignFile(); + if (designfile != "") { m->mothurOut("Using " + designfile + " as input file for the design parameter."); m->mothurOutEndLine(); } + else { m->mothurOut("You have no current designfile and the design parameter is required."); m->mothurOutEndLine(); abort = true; } + }else { m->setDesignFile(designfile); } + + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if
(outputDir == "not found"){ + outputDir = m->hasPath(sharedfile); //if user entered a file with a path then preserve it + } + + + // NEW CODE for OTU per split selection criteria + otupersplit = validParameter.validFile(parameters, "otupersplit", false); + if (otupersplit == "not found") { otupersplit = "log2"; } + if ((otupersplit == "squareroot") || (otupersplit == "log2")) { + optimumFeatureSubsetSelectionCriteria = otupersplit; + }else { m->mothurOut("Not a valid OTU per split selection method. Valid OTU per split selection methods are 'log2' and 'squareroot'."); m->mothurOutEndLine(); abort = true; } + + // splitcriteria + splitcriteria = validParameter.validFile(parameters, "splitcriteria", false); + if (splitcriteria == "not found") { splitcriteria = "gainratio"; } + if ((splitcriteria == "gainratio") || (splitcriteria == "infogain")) { + treeSplitCriterion = splitcriteria; + }else { m->mothurOut("Not a valid tree splitting criterio. Valid tree splitting criteria are 'gainratio' and 'infogain'."); m->mothurOutEndLine(); abort = true; } + + + string temp = validParameter.validFile(parameters, "numtrees", false); if (temp == "not found"){ temp = "100"; } + m->mothurConvert(temp, numDecisionTrees); + + //Groups must be checked later to make sure they are valid. SharedUtilities has functions of check the validity, just make to so m->setGroups() after the checks. If you are using these with a shared file no need to check the SharedRAbundVector class will call SharedUtilites for you, kinda nice, huh? + string groups = validParameter.validFile(parameters, "groups", false); + if (groups == "not found") { groups = ""; } + else { m->splitAtDash(groups, Groups); } + m->setGroups(Groups); + + //Commonly used to process list, rabund, sabund, shared and relabund files. Look at "smart distancing" examples below in the execute function. 
+ string label = validParameter.validFile(parameters, "label", false); + if (label == "not found") { label = ""; } + else { + if(label != "all") { m->splitAtDash(label, labels); allLines = 0; } + else { allLines = 1; } + } + } + + } + catch(exception& e) { + m->errorOut(e, "ClassifySharedCommand", "ClassifySharedCommand"); + exit(1); + } +} +//********************************************************************************************************************** +int ClassifySharedCommand::execute() { + try { + + if (abort == true) { if (calledHelp) { return 0; } return 2; } + + InputData input(sharedfile, "sharedfile"); + vector lookup = input.getSharedRAbundVectors(); + + //read design file + designMap.readDesignMap(designfile); + + string lastLabel = lookup[0]->getLabel(); + set processedLabels; + set userLabels = labels; + + //as long as you are not at the end of the file or done wih the lines you want + while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { + + if (m->control_pressed) { for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } return 0; } + + if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){ + + m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine(); + + processSharedAndDesignData(lookup); + + processedLabels.insert(lookup[0]->getLabel()); + userLabels.erase(lookup[0]->getLabel()); + } + + if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) { + string saveLabel = lookup[0]->getLabel(); + + for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } + lookup = input.getSharedRAbundVectors(lastLabel); + m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine(); + + processSharedAndDesignData(lookup); + + processedLabels.insert(lookup[0]->getLabel()); + userLabels.erase(lookup[0]->getLabel()); + + //restore real lastlabel to save below + lookup[0]->setLabel(saveLabel); + } + + lastLabel = lookup[0]->getLabel(); + //prevent memory leak + 
for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; lookup[i] = NULL; } + + if (m->control_pressed) { return 0; } + + //get next line to process + lookup = input.getSharedRAbundVectors(); + } + + if (m->control_pressed) { return 0; } + + //output error messages about any remaining user labels + set::iterator it; + bool needToRun = false; + for (it = userLabels.begin(); it != userLabels.end(); it++) { + m->mothurOut("Your file does not include the label " + *it); + if (processedLabels.count(lastLabel) != 1) { + m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine(); + needToRun = true; + }else { + m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine(); + } + } + + //run last label if you need to + if (needToRun == true) { + for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } } + lookup = input.getSharedRAbundVectors(lastLabel); + + m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine(); + + processSharedAndDesignData(lookup); + + for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } + + } + + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ClassifySharedCommand", "execute"); + exit(1); + } +} +//********************************************************************************************************************** + +void ClassifySharedCommand::processSharedAndDesignData(vector lookup){ + try { +// for (int i = 0; i < designMap->getNamesOfGroups().size(); i++) { +// string groupName = designMap->getNamesOfGroups()[i]; +// cout << groupName << endl; +// } + +// for (int i = 0; i < designMap->getNumSeqs(); i++) { +// string sharedGroupName = designMap->getNamesSeqs()[i]; +// string treatmentName = designMap->getGroup(sharedGroupName); +// cout << 
sharedGroupName << " : " << treatmentName << endl; +// } + + map treatmentToIntMap; + map intToTreatmentMap; + for (int i = 0; i < designMap.getNumGroups(); i++) { + string treatmentName = designMap.getNamesOfGroups()[i]; + treatmentToIntMap[treatmentName] = i; + intToTreatmentMap[i] = treatmentName; + } + + int numSamples = lookup.size(); + int numFeatures = lookup[0]->getNumBins(); + + int numRows = numSamples; + int numColumns = numFeatures + 1; // extra one space needed for the treatment/outcome + + vector< vector > dataSet(numRows, vector(numColumns, 0)); + + for (int i = 0; i < lookup.size(); i++) { + string sharedGroupName = lookup[i]->getGroup(); + string treatmentName = designMap.getGroup(sharedGroupName); + + int j = 0; + for (; j < lookup[i]->getNumBins(); j++) { + int otuCount = lookup[i]->getAbundance(j); + dataSet[i][j] = otuCount; + } + dataSet[i][j] = treatmentToIntMap[treatmentName]; + } + + RandomForest randomForest(dataSet, numDecisionTrees, treeSplitCriterion); + randomForest.populateDecisionTrees(); + randomForest.calcForrestErrorRate(); + + string filename = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + lookup[0]->getLabel() + "." + getOutputFileNameTag("summary"); + outputNames.push_back(filename); outputTypes["summary"].push_back(filename); + + randomForest.calcForrestVariableImportance(filename); + + m->mothurOutEndLine(); + } + catch(exception& e) { + m->errorOut(e, "ClassifySharedCommand", "processSharedAndDesignData"); + exit(1); + } +} +//********************************************************************************************************************** + diff --git a/classifysharedcommand.h b/classifysharedcommand.h new file mode 100755 index 0000000..93c6286 --- /dev/null +++ b/classifysharedcommand.h @@ -0,0 +1,54 @@ +// +// classifysharedcommand.h +// Mothur +// +// Created by Abu Zaher Md. Faridee on 8/13/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. 
+// + +#ifndef __Mothur__classifysharedcommand__ +#define __Mothur__classifysharedcommand__ + +#include "command.hpp" +#include "inputdata.h" + +class ClassifySharedCommand : public Command { +public: + ClassifySharedCommand(); + ClassifySharedCommand(string); + ~ClassifySharedCommand() {}; + + vector setParameters(); + string getCommandName() { return "classify.shared"; } + string getCommandCategory() { return "OTU-Based Approaches"; } + string getOutputFileNameTag(string, string); + string getHelpString(); + string getCitation() { return "http://www.mothur.org/wiki/Classify.shared\n"; } + string getDescription() { return "description"; } + int execute(); + + void help() { m->mothurOut(getHelpString()); } + +private: + bool abort; + string outputDir; + vector outputNames, Groups; + + string sharedfile, designfile, otupersplit, splitcriteria; + set labels; + bool allLines; + + int processors; + bool useTiming; + + GroupMap designMap; + + int numDecisionTrees; + string treeSplitCriterion, optimumFeatureSubsetSelectionCriteria; + bool doPruning, discardHighErrorTrees; + double pruneAggressiveness, highErrorTreeDiscardThreshold, featureStandardDeviationThreshold; + + void processSharedAndDesignData(vector lookup); +}; + +#endif /* defined(__Mothur__classifysharedcommand__) */ diff --git a/classifytreecommand.cpp b/classifytreecommand.cpp index 7861a01..69da8e0 100644 --- a/classifytreecommand.cpp +++ b/classifytreecommand.cpp @@ -15,8 +15,9 @@ vector ClassifyTreeCommand::setParameters(){ try { CommandParameter ptree("tree", "InputTypes", "", "", "", "", "none",false,true); parameters.push_back(ptree); CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "", "", "none",false,true); parameters.push_back(ptaxonomy); - CommandParameter pname("name", "InputTypes", "", "", "", "", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "", "", "none",false,false); parameters.push_back(pgroup); + CommandParameter 
pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter pcutoff("cutoff", "Number", "", "51", "", "", "",false,true); parameters.push_back(pcutoff); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -37,8 +38,9 @@ string ClassifyTreeCommand::getHelpString(){ helpString += "The classify.tree command reads a tree and taxonomy file and output the consensus taxonomy for each node on the tree. \n"; helpString += "If you provide a group file, the concensus for each group will also be provided. \n"; helpString += "The new tree contains labels at each internal node. The label is the node number so you can relate the tree to the summary file.\n"; + helpString += "The count parameter allows you add a count file so you can have the summary totals broken up by group.\n"; helpString += "The summary file lists the concensus taxonomy for the descendants of each node.\n"; - helpString += "The classify.tree command parameters are tree, group, name and taxonomy. The tree and taxonomy files are required.\n"; + helpString += "The classify.tree command parameters are tree, group, name, count and taxonomy. The tree and taxonomy files are required.\n"; helpString += "The cutoff parameter allows you to specify a consensus confidence threshold for your taxonomy. The default is 51, meaning 51%. 
Cutoff cannot be below 51.\n"; helpString += "The classify.tree command should be used in the following format: classify.tree(tree=test.tre, group=test.group, taxonomy=test.taxonomy)\n"; helpString += "Note: No spaces between parameter labels (i.e. tree), '=' and parameters (i.e.yourTreefile).\n"; @@ -147,6 +149,14 @@ ClassifyTreeCommand::ClassifyTreeCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["taxonomy"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -178,16 +188,30 @@ ClassifyTreeCommand::ClassifyTreeCommand(string option) { else if (groupfile == "not found") { groupfile = ""; } else { m->setGroupFile(groupfile); } + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + string temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "51"; } m->mothurConvert(temp, cutoff); if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true; } - if (namefile == "") { - 
vector files; files.push_back(treefile); - parser.getNameFile(files); + if (countfile == "") { + if (namefile == "") { + vector files; files.push_back(treefile); + parser.getNameFile(files); + } } - } } catch(exception& e) { @@ -213,7 +237,7 @@ int ClassifyTreeCommand::execute(){ TreeReader* reader = new TreeReader(treefile, groupfile, namefile); vector T = reader->getTrees(); - TreeMap* tmap = T[0]->getTreeMap(); + CountTable* tmap = T[0]->getCountTable(); Tree* outputTree = T[0]; delete reader; @@ -367,10 +391,15 @@ string ClassifyTreeCommand::getTaxonomy(set names, int& size) { if (itTax == taxMap.end()) { //this name is not in taxonomy file, skip it m->mothurOut((*it) + " is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine(); }else{ - //add seq to tree - phylo->addSeqToTree((*it), itTax->second); - size++; - } + if (countfile != "") { + int numDups = ct->getNumSeqs((*it)); + for (int j = 0; j < numDups; j++) { phylo->addSeqToTree((*it), itTax->second); } + size += numDups; + }else{ + //add seq to tree + phylo->addSeqToTree((*it), itTax->second); + size++; + } } } if (m->control_pressed) { delete phylo; return conTax; } @@ -444,12 +473,12 @@ map > ClassifyTreeCommand::getDescendantList(Tree*& T, int i int lc = T->tree[i].getLChild(); int rc = T->tree[i].getRChild(); - TreeMap* tmap = T->getTreeMap(); + // TreeMap* tmap = T->getTreeMap(); if (lc == -1) { //you are a leaf your only descendant is yourself - string group = tmap->getGroup(T->tree[i].getName()); + vector groups = T->tree[i].getGroup(); set mynames; mynames.insert(T->tree[i].getName()); - names[group] = mynames; //mygroup -> me + for (int j = 0; j < groups.size(); j++) { names[groups[j]] = mynames; } //mygroup -> me names["AllGroups"] = mynames; }else{ //your descedants are the combination of your childrens descendants names = descendants[lc]; diff --git a/classifytreecommand.h b/classifytreecommand.h index 758a438..dd972b6 100644 --- a/classifytreecommand.h +++ 
b/classifytreecommand.h @@ -12,6 +12,7 @@ #include "command.hpp" #include "readtree.h" #include "treemap.h" +#include "counttable.h" class ClassifyTreeCommand : public Command { public: @@ -31,13 +32,14 @@ public: void help() { m->mothurOut(getHelpString()); } private: - string treefile, taxonomyfile, groupfile, namefile, outputDir; + string treefile, taxonomyfile, groupfile, namefile, countfile, outputDir; bool abort; vector outputNames; int numUniquesInName, cutoff; map nameMap; map nameCount; map taxMap; + CountTable* ct; int getClassifications(Tree*&); map > getDescendantList(Tree*&, int, map > >); diff --git a/clusterclassic.cpp b/clusterclassic.cpp index 2d1b9a6..32a9341 100644 --- a/clusterclassic.cpp +++ b/clusterclassic.cpp @@ -231,6 +231,205 @@ int ClusterClassic::readPhylipFile(string filename, NameAssignment* nameMap) { exit(1); } +} +/***********************************************************************/ +int ClusterClassic::readPhylipFile(string filename, CountTable* countTable) { + try { + double distance; + int square; + string name; + vector matrixNames; + + ifstream fileHandle; + m->openInputFile(filename, fileHandle); + + string numTest; + fileHandle >> numTest >> name; + + if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting."); m->mothurOutEndLine(); exit(1); } + else { convert(numTest, nseqs); } + + + matrixNames.push_back(name); + + if(countTable == NULL){ + list = new ListVector(nseqs); + list->set(0, name); + } + else{ list = new ListVector(countTable->getListVector()); } + + + //initialize distance matrix to cutoff + dMatrix.resize(nseqs); + //rowSmallDists.resize(nseqs, temp); + for (int i = 1; i < nseqs; i++) { + dMatrix[i].resize(i, aboveCutoff); + } + + + char d; + while((d=fileHandle.get()) != EOF){ + + if(isalnum(d)){ + square = 1; + fileHandle.putback(d); + for(int i=0;i> distance; + } + break; + } + if(d == '\n'){ + square = 0; + break; + } + } + + Progress* 
reading; + + if(square == 0){ + + reading = new Progress("Reading matrix: ", nseqs * (nseqs - 1) / 2); + + int index = 0; + + for(int i=1;icontrol_pressed) { fileHandle.close(); delete reading; return 0; } + + fileHandle >> name; + matrixNames.push_back(name); + + + //there's A LOT of repeated code throughout this method... + if(countTable == NULL){ + list->set(i, name); + + for(int j=0;jcontrol_pressed) { delete reading; fileHandle.close(); return 0; } + + fileHandle >> distance; + + if (distance == -1) { distance = 1000000; } + else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert. + + //if(distance < cutoff){ + dMatrix[i][j] = distance; + if (distance < smallDist) { smallDist = distance; } + //if (rowSmallDists[i].dist > distance) { rowSmallDists[i].dist = distance; rowSmallDists[i].col = j; rowSmallDists[i].row = i; } + //if (rowSmallDists[j].dist > distance) { rowSmallDists[j].dist = distance; rowSmallDists[j].col = i; rowSmallDists[j].row = j; } + //} + index++; + reading->update(index); + } + + } + else{ + for(int j=0;j> distance; + + if (m->control_pressed) { delete reading; fileHandle.close(); return 0; } + + if (distance == -1) { distance = 1000000; } + else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert. 
+ + if (distance < smallDist) { smallDist = distance; } + + int row = countTable->get(matrixNames[i]); + int col = countTable->get(matrixNames[j]); + + if (row < col) { dMatrix[col][row] = distance; } + else { dMatrix[row][col] = distance; } + + index++; + reading->update(index); + } + } + } + } + else{ + + reading = new Progress("Reading matrix: ", nseqs * nseqs); + + int index = nseqs; + + for(int i=1;i> name; + matrixNames.push_back(name); + + if(countTable == NULL){ + list->set(i, name); + for(int j=0;j> distance; + + if (m->control_pressed) { fileHandle.close(); delete reading; return 0; } + + if (distance == -1) { distance = 1000000; } + else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert. + + if(j < i){ + if (distance < smallDist) { smallDist = distance; } + + dMatrix[i][j] = distance; + } + index++; + reading->update(index); + } + + } + else{ + + for(int j=0;j> distance; + + if (m->control_pressed) { fileHandle.close(); delete reading; return 0; } + + if (distance == -1) { distance = 1000000; } + else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert. 
+ + if(j < i){ + if (distance < smallDist) { smallDist = distance; } + + int row = countTable->get(matrixNames[i]); + int col = countTable->get(matrixNames[j]); + + if (row < col) { dMatrix[col][row] = distance; } + else { dMatrix[row][col] = distance; } + } + index++; + reading->update(index); + } + } + } + } + + if (m->control_pressed) { fileHandle.close(); delete reading; return 0; } + + reading->finish(); + delete reading; + + list->setLabel("0"); + rabund = new RAbundVector(); + rabund->setLabel(list->getLabel()); + + for(int i = 0; i < list->getNumBins(); i++) { + if (m->control_pressed) { break; } + vector binNames; + string bin = list->get(i); + m->splitAtComma(bin, binNames); + int total = 0; + for (int j = 0; j < binNames.size(); j++) { total += countTable->getNumSeqs(binNames[j]); } + rabund->push_back(total); + } + + fileHandle.close(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ClusterClassic", "readPhylipFile"); + exit(1); + } + } /***********************************************************************/ //sets smallCol and smallRow, returns distance @@ -398,16 +597,12 @@ void ClusterClassic::setMapWanted(bool f) { //parse bin string names = list->get(i); - while (names.find_first_of(',') != -1) { - //get name from bin - string name = names.substr(0,names.find_first_of(',')); + vector binnames; + m->splitAtComma(names, binnames); + for (int j = 0; j < binnames.size(); j++) { //save name and bin number - seq2Bin[name] = i; - names = names.substr(names.find_first_of(',')+1, names.length()); + seq2Bin[binnames[j]] = i; } - - //get last name - seq2Bin[names] = i; } } @@ -420,17 +615,13 @@ void ClusterClassic::setMapWanted(bool f) { void ClusterClassic::updateMap() { try { //update location of seqs in smallRow since they move to smallCol now - string names = list->get(smallRow); - while (names.find_first_of(',') != -1) { - //get name from bin - string name = names.substr(0,names.find_first_of(',')); - //save name and bin number - 
seq2Bin[name] = smallCol; - names = names.substr(names.find_first_of(',')+1, names.length()); - } - - //get last name - seq2Bin[names] = smallCol; + string names = list->get(smallRow); + vector binnames; + m->splitAtComma(names, binnames); + for (int j = 0; j < binnames.size(); j++) { + //save name and bin number + seq2Bin[binnames[j]] = smallCol; + } } catch(exception& e) { diff --git a/clusterclassic.h b/clusterclassic.h index a650bbf..eaccb27 100644 --- a/clusterclassic.h +++ b/clusterclassic.h @@ -6,6 +6,7 @@ #include "listvector.hpp" #include "rabundvector.hpp" #include "nameassignment.hpp" +#include "counttable.h" /* * clusterclassic.h @@ -22,6 +23,7 @@ class ClusterClassic { public: ClusterClassic(float, string, bool); int readPhylipFile(string, NameAssignment*); + int readPhylipFile(string, CountTable*); void update(double&); double getSmallDist() { return smallDist; } int getNSeqs() { return nseqs; } diff --git a/clustercommand.cpp b/clustercommand.cpp index 19eaf85..06e627a 100644 --- a/clustercommand.cpp +++ b/clustercommand.cpp @@ -154,6 +154,14 @@ ClusterCommand::ClusterCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { parameters["count"] = inputDir + it->second; } + } } //check for required parameters @@ -481,12 +489,12 @@ void ClusterCommand::printData(string label){ loops = 0; start = time(NULL); + oldRAbund.setLabel(label); if (countfile == "") { oldRAbund.print(rabundFile); oldRAbund.getSAbundVector().print(sabundFile); } - - oldRAbund.setLabel(label); + if (m->isTrue(showabund)) { oldRAbund.getSAbundVector().print(cout); } diff --git a/clusterdoturcommand.cpp b/clusterdoturcommand.cpp index 9bfb52b..2515b5c 100644 --- a/clusterdoturcommand.cpp +++ b/clusterdoturcommand.cpp @@ -14,7 +14,8 @@ vector ClusterDoturCommand::setParameters(){ try { CommandParameter pphylip("phylip", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pphylip); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount); CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff); CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision); CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod); @@ -37,7 +38,7 @@ string ClusterDoturCommand::getHelpString(){ try { string helpString = ""; helpString += "The cluster.classic command clusters using the algorithm from dotur. \n"; - helpString += "The cluster.classic command parameter options are phylip, name, method, cuttoff, hard, sim, precision. 
Phylip is required, unless you have a valid current file.\n"; + helpString += "The cluster.classic command parameter options are phylip, name, count, method, cuttoff, hard, sim, precision. Phylip is required, unless you have a valid current file.\n"; helpString += "The cluster.classic command should be in the following format: \n"; helpString += "cluster.classic(phylip=yourDistanceMatrix, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n"; helpString += "The acceptable cluster methods are furthest, nearest, weighted and average. If no method is provided then average is assumed.\n"; @@ -132,7 +133,14 @@ ClusterDoturCommand::ClusterDoturCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } - + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { parameters["count"] = inputDir + it->second; } + } } //initialize outputTypes @@ -159,10 +167,17 @@ ClusterDoturCommand::ClusterDoturCommand(string option) { //check for optional parameter and set defaults namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { abort = true; namefile = ""; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; countfile = ""; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.classic command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } + string temp; temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; } @@ -204,36 +219,49 @@ int ClusterDoturCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } - if(namefile != ""){ + + ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim); + + NameAssignment* nameMap = NULL; + CountTable* ct = NULL; + if(namefile != "") { nameMap = new NameAssignment(namefile); nameMap->readMap(); - }else{ - nameMap = NULL; - } - - //reads phylip file storing data in 2D vector, also fills list and rabund - ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim); - cluster->readPhylipFile(phylipfile, nameMap); - - if (m->control_pressed) { delete cluster; delete list; delete rabund; return 0; } + cluster->readPhylipFile(phylipfile, nameMap); + delete nameMap; + }else if (countfile != "") { + ct = new CountTable(); + ct->readTable(countfile); + cluster->readPhylipFile(phylipfile, ct); + delete ct; + }else { + cluster->readPhylipFile(phylipfile, nameMap); + } + tag = 
cluster->getTag(); + + if (m->control_pressed) { delete cluster; return 0; } list = cluster->getListVector(); rabund = cluster->getRAbundVector(); - + if (outputDir == "") { outputDir += m->hasPath(phylipfile); } fileroot = outputDir + m->getRootName(m->getSimpleName(phylipfile)); string sabundFileName = fileroot+ tag + "." + getOutputFileNameTag("sabund"); string rabundFileName = fileroot+ tag + "." + getOutputFileNameTag("rabund"); - string listFileName = fileroot+ tag + "." + getOutputFileNameTag("list"); + string listFileName = fileroot+ tag + "."; + if (countfile != "") { listFileName += "unique_"; } + listFileName += getOutputFileNameTag("list"); - m->openOutputFile(sabundFileName, sabundFile); - m->openOutputFile(rabundFileName, rabundFile); + if (countfile == "") { + m->openOutputFile(sabundFileName, sabundFile); + m->openOutputFile(rabundFileName, rabundFile); + outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName); + outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName); + + } m->openOutputFile(listFileName, listFile); - - outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName); - outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName); - outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName); + outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName); float previousDist = 0.00000; float rndPreviousDist = 0.00000; @@ -245,7 +273,8 @@ int ClusterDoturCommand::execute(){ int estart = time(NULL); while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){ - if (m->control_pressed) { delete cluster; delete list; delete rabund; sabundFile.close();rabundFile.close();listFile.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear(); return 0; } + if (m->control_pressed) { delete cluster; delete list; delete rabund; 
if(countfile == "") {rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } + listFile.close(); m->mothurRemove((fileroot+ tag + ".list")); outputTypes.clear(); return 0; } cluster->update(cutoff); @@ -276,18 +305,14 @@ int ClusterDoturCommand::execute(){ else if(rndPreviousDistceilDist(saveCutoff, precision); } - // else { saveCutoff = m->roundDist(saveCutoff, precision); } - // m->mothurOut("changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); - //} + delete cluster; delete list; delete rabund; //set list file as new current listfile string current = ""; @@ -327,11 +352,12 @@ int ClusterDoturCommand::execute(){ void ClusterDoturCommand::printData(string label){ try { - - oldRAbund.setLabel(label); - oldRAbund.print(rabundFile); - oldRAbund.getSAbundVector().print(sabundFile); - + oldRAbund.setLabel(label); + if (countfile == "") { + oldRAbund.print(rabundFile); + oldRAbund.getSAbundVector().print(sabundFile); + } + oldRAbund.getSAbundVector().print(cout); oldList.setLabel(label); diff --git a/clusterdoturcommand.h b/clusterdoturcommand.h index 09ee822..dd61a35 100644 --- a/clusterdoturcommand.h +++ b/clusterdoturcommand.h @@ -37,7 +37,7 @@ public: private: bool abort, hard, sim; - string method, fileroot, tag, outputDir, phylipfile, namefile; + string method, fileroot, tag, outputDir, phylipfile, namefile, countfile; double cutoff; int precision, length; ofstream sabundFile, rabundFile, listFile; diff --git a/clusterfragmentscommand.cpp b/clusterfragmentscommand.cpp index 4a33841..f785c50 100644 --- a/clusterfragmentscommand.cpp +++ b/clusterfragmentscommand.cpp @@ -29,7 +29,8 @@ inline bool comparePriority(seqRNode first, seqRNode second) { vector ClusterFragmentsCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", 
"none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount); CommandParameter pdiffs("diffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pdiffs); CommandParameter ppercent("percent", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppercent); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); @@ -49,8 +50,8 @@ string ClusterFragmentsCommand::getHelpString(){ try { string helpString = ""; helpString += "The cluster.fragments command groups sequences that are part of a larger sequence.\n"; - helpString += "The cluster.fragments command outputs a new fasta and name file.\n"; - helpString += "The cluster.fragments command parameters are fasta, name, diffs and percent. The fasta parameter is required, unless you have a valid current file. \n"; + helpString += "The cluster.fragments command outputs a new fasta and name or count file.\n"; + helpString += "The cluster.fragments command parameters are fasta, name, count, diffs and percent. The fasta parameter is required, unless you have a valid current file. \n"; helpString += "The names parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n"; helpString += "The diffs parameter allows you to set the number of differences allowed, default=0. \n"; helpString += "The percent parameter allows you to set percentage of differences allowed, default=0. 
percent=2 means if the number of difference is less than or equal to two percent of the length of the fragment, then cluster.\n"; @@ -78,6 +79,7 @@ string ClusterFragmentsCommand::getOutputFileNameTag(string type, string inputNa else { if (type == "fasta") { outputFileName = "fragclust.fasta"; } else if (type == "name") { outputFileName = "fragclust.names"; } + else if (type == "count") { outputFileName = "fragclust.count_table"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } return outputFileName; @@ -96,6 +98,7 @@ ClusterFragmentsCommand::ClusterFragmentsCommand(){ vector tempOutNames; outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "ClusterFragmentsCommand", "ClusterFragmentsCommand"); @@ -129,6 +132,7 @@ ClusterFragmentsCommand::ClusterFragmentsCommand(string option) { vector tempOutNames; outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -150,6 +154,14 @@ ClusterFragmentsCommand::ClusterFragmentsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { parameters["count"] = inputDir + it->second; } + } } //check for required parameters @@ -171,6 +183,13 @@ ClusterFragmentsCommand::ClusterFragmentsCommand(string option) { if (namefile == "not found") { namefile = ""; } else if (namefile == "not open") { namefile = ""; abort = true; } else { readNameFile(); m->setNameFile(namefile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; countfile = ""; } + else if (countfile == "not found") { countfile = ""; } + else { ct.readTable(countfile); m->setCountTableFile(countfile); } + + if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.fragments command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } string temp; temp = validParameter.validFile(parameters, "diffs", false); if (temp == "not found"){ temp = "0"; } @@ -179,10 +198,12 @@ ClusterFragmentsCommand::ClusterFragmentsCommand(string option) { temp = validParameter.validFile(parameters, "percent", false); if (temp == "not found"){ temp = "0"; } m->mothurConvert(temp, percent); - if (namefile == "") { - vector files; files.push_back(fastafile); - parser.getNameFile(files); - } + if (countfile == "") { + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } @@ -229,10 +250,13 @@ int ClusterFragmentsCommand::execute(){ string jBases = alignSeqs[j].seq.getUnaligned(); if (isFragment(iBases, jBases)) { - //merge - alignSeqs[i].names += ',' + alignSeqs[j].names; - alignSeqs[i].numIdentical += alignSeqs[j].numIdentical; - + if (countfile != "") { + ct.mergeCounts(alignSeqs[i].names, alignSeqs[j].names); + }else { + //merge + alignSeqs[i].names += ',' + alignSeqs[j].names; + alignSeqs[i].numIdentical += alignSeqs[j].numIdentical; + } alignSeqs[j].active = 0; alignSeqs[j].numIdentical = 0; count++; @@ -254,6 +278,7 @@ int ClusterFragmentsCommand::execute(){ 
string newFastaFile = fileroot + getOutputFileNameTag("fasta"); string newNamesFile = fileroot + getOutputFileNameTag("name"); + if (countfile != "") { newNamesFile = fileroot + getOutputFileNameTag("count"); } if (m->control_pressed) { return 0; } @@ -285,6 +310,11 @@ int ClusterFragmentsCommand::execute(){ if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); } } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } return 0; @@ -372,7 +402,10 @@ int ClusterFragmentsCommand::readFASTA(){ else{ seqRNode tempNode(itSize->second, seq, names[seq.getName()], seq.getUnaligned().length()); alignSeqs.push_back(tempNode); - } + } + }else if(countfile != "") { + seqRNode tempNode(ct.getNumSeqs(seq.getName()), seq, seq.getName(), seq.getUnaligned().length()); + alignSeqs.push_back(tempNode); }else { //no names file, you are identical to yourself seqRNode tempNode(1, seq, seq.getName(), seq.getUnaligned().length()); alignSeqs.push_back(tempNode); @@ -396,17 +429,18 @@ void ClusterFragmentsCommand::printData(string newfasta, string newname){ ofstream outNames; m->openOutputFile(newfasta, outFasta); - m->openOutputFile(newname, outNames); + if (countfile == "") { m->openOutputFile(newname, outNames); } for (int i = 0; i < alignSeqs.size(); i++) { if (alignSeqs[i].numIdentical != 0) { alignSeqs[i].seq.printSequence(outFasta); - outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; + if (countfile == "") { outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; } } } outFasta.close(); - outNames.close(); + if (countfile == "") { outNames.close(); } + else { ct.printTable(newname); } } catch(exception& e) { m->errorOut(e, "ClusterFragmentsCommand", "printData"); @@ -438,6 +472,5 @@ void ClusterFragmentsCommand::readNameFile(){ 
exit(1); } } - /**************************************************************************************************/ diff --git a/clusterfragmentscommand.h b/clusterfragmentscommand.h index c322529..e3d861a 100644 --- a/clusterfragmentscommand.h +++ b/clusterfragmentscommand.h @@ -13,6 +13,7 @@ #include "command.hpp" #include "sequence.hpp" +#include "counttable.h" /************************************************************/ struct seqRNode { @@ -46,8 +47,9 @@ public: void help() { m->mothurOut(getHelpString()); } private: + CountTable ct; bool abort; - string fastafile, namefile, outputDir; + string fastafile, namefile, countfile, outputDir; int diffs, percent; vector alignSeqs; map names; //represents the names file first column maps to second column diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp index b097f38..b3ce0f9 100644 --- a/clustersplitcommand.cpp +++ b/clustersplitcommand.cpp @@ -16,7 +16,8 @@ vector ClusterSplitCommand::setParameters(){ CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName",false,false); parameters.push_back(ptaxonomy); CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none",false,false); parameters.push_back(pphylip); CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName",false,false); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "",false,false); parameters.push_back(pcount); CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName",false,false); parameters.push_back(pcolumn); 
CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "",false,false); parameters.push_back(ptaxlevel); CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "",false,false); parameters.push_back(psplitmethod); @@ -45,7 +46,7 @@ vector ClusterSplitCommand::setParameters(){ string ClusterSplitCommand::getHelpString(){ try { string helpString = ""; - helpString += "The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n"; + helpString += "The cluster.split command parameter options are fasta, phylip, column, name, count, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n"; helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n"; helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n"; helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify. \n"; @@ -54,7 +55,8 @@ string ClusterSplitCommand::getHelpString(){ helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n"; helpString += "The phylip and column parameter allow you to enter your distance file. \n"; helpString += "The fasta parameter allows you to enter your aligned fasta file. \n"; - helpString += "The name parameter allows you to enter your name file and is required if your distance file is in column format. 
\n"; + helpString += "The name parameter allows you to enter your name file. \n"; + helpString += "The count parameter allows you to enter your count file. \n A count or name file is required if your distance file is in column format"; helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n"; helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n"; helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n"; @@ -196,6 +198,14 @@ ClusterSplitCommand::ClusterSplitCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["fasta"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { parameters["count"] = inputDir + it->second; } + } } //check for required parameters @@ -210,9 +220,14 @@ ClusterSplitCommand::ClusterSplitCommand(string option) { else { distfile = columnfile; format = "column"; m->setColumnFile(columnfile); } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { abort = true; namefile = "";} else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; countfile = "";} + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } fastafile = validParameter.validFile(parameters, "fasta", true); if (fastafile == "not open") { abort = true; } @@ -243,14 +258,20 @@ ClusterSplitCommand::ClusterSplitCommand(string option) { } } else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; } - + + if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } + if (columnfile != "") { - if (namefile == "") { + if ((namefile == "") && (countfile == "")) { namefile = m->getNameFile(); if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); } else { - m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); - abort = true; + countfile = m->getCountTableFile(); + if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); } + else { + m->mothurOut("You need to provide a 
namefile or countfile if you are going to use the column format."); m->mothurOutEndLine(); + abort = true; + } } } } @@ -265,12 +286,16 @@ ClusterSplitCommand::ClusterSplitCommand(string option) { } } - if (namefile == "") { + if ((namefile == "") && (countfile == "")) { namefile = m->getNameFile(); if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); } else { - m->mothurOut("You need to provide a namefile if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine(); - abort = true; + countfile = m->getCountTableFile(); + if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); } + else { + m->mothurOut("You need to provide a namefile or countfile if you are going to use the fasta file to generate the split."); m->mothurOutEndLine(); + abort = true; + } } } } @@ -379,7 +404,7 @@ int ClusterSplitCommand::execute(){ //if no names file given with phylip file, create it ListVector* listToMakeNameFile = convert->getListVector(); - if (namefile == "") { //you need to make a namefile for split matrix + if ((namefile == "") && (countfile == "")) { //you need to make a namefile for split matrix ofstream out; namefile = phylipfile + ".names"; m->openOutputFile(namefile, out); @@ -401,9 +426,9 @@ int ClusterSplitCommand::execute(){ //split matrix into non-overlapping groups SplitMatrix* split; - if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large); } - else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large); } - else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir); } + if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, countfile, taxFile, cutoff, 
splitmethod, large); } + else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, countfile, taxFile, taxLevelCutoff, splitmethod, large); } + else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, countfile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir); } else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; } split->split(); @@ -666,15 +691,21 @@ map ClusterSplitCommand::completeListFile(vector listNames, //read in singletons if (singleton != "none") { - ifstream in; - m->openInputFile(singleton, in); + + ifstream in; + m->openInputFile(singleton, in); string firstCol, secondCol; listSingle = new ListVector(); + + if (countfile != "") { m->getline(in); m->gobble(in); } + while (!in.eof()) { - in >> firstCol >> secondCol; m->gobble(in); - listSingle->push_back(secondCol); + in >> firstCol >> secondCol; m->getline(in); m->gobble(in); + if (countfile == "") { listSingle->push_back(secondCol); } + else { listSingle->push_back(firstCol); } } + in.close(); m->mothurRemove(singleton); @@ -775,15 +806,21 @@ int ClusterSplitCommand::mergeLists(vector listNames, map us string sabundFileName = fileroot+ tag + "." + getOutputFileNameTag("sabund"); string rabundFileName = fileroot+ tag + "." + getOutputFileNameTag("rabund"); - string listFileName = fileroot+ tag + "." 
+ getOutputFileNameTag("list"); + string listFileName = fileroot+ tag + "."; + if (countfile != "") { listFileName += "unique_"; } + listFileName += getOutputFileNameTag("list"); - m->openOutputFile(sabundFileName, outSabund); - m->openOutputFile(rabundFileName, outRabund); + if (countfile == "") { + m->openOutputFile(sabundFileName, outSabund); + m->openOutputFile(rabundFileName, outRabund); + outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName); + outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName); + + } m->openOutputFile(listFileName, outList); + outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName); + - outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName); - outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName); - outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName); map::iterator itLabel; //for each label needed @@ -794,22 +831,25 @@ int ClusterSplitCommand::mergeLists(vector listNames, map us else { thisLabel = toString(itLabel->first, length-1); } outList << thisLabel << '\t' << itLabel->second << '\t'; - - RAbundVector* rabund = new RAbundVector(); - rabund->setLabel(thisLabel); + + RAbundVector* rabund = NULL; + if (countfile == "") { + rabund = new RAbundVector(); + rabund->setLabel(thisLabel); + } //add in singletons if (listSingle != NULL) { for (int j = 0; j < listSingle->getNumBins(); j++) { outList << listSingle->get(j) << '\t'; - rabund->push_back(m->getNumNames(listSingle->get(j))); + if (countfile == "") { rabund->push_back(m->getNumNames(listSingle->get(j))); } } } //get the list info from each file for (int k = 0; k < listNames.size(); k++) { - if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); } delete rabund; return 0; } + if (m->control_pressed) { if 
(listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); } if (rabund != NULL) { delete rabund; } return 0; } InputData* input = new InputData(listNames[k], "list"); ListVector* list = input->getListVector(thisLabel); @@ -819,26 +859,28 @@ int ClusterSplitCommand::mergeLists(vector listNames, map us else { for (int j = 0; j < list->getNumBins(); j++) { outList << list->get(j) << '\t'; - rabund->push_back(m->getNumNames(list->get(j))); + if (countfile == "") { rabund->push_back(m->getNumNames(list->get(j))); } } delete list; } delete input; } - SAbundVector sabund = rabund->getSAbundVector(); - - sabund.print(outSabund); - rabund->print(outRabund); + if (countfile == "") { + SAbundVector sabund = rabund->getSAbundVector(); + sabund.print(outSabund); + rabund->print(outRabund); + } outList << endl; - delete rabund; + if (rabund != NULL) { delete rabund; } } outList.close(); - outRabund.close(); - outSabund.close(); - + if (countfile == "") { + outRabund.close(); + outSabund.close(); + } if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); } @@ -993,7 +1035,7 @@ vector ClusterSplitCommand::createProcesses(vector< map //Above fork() will clone, so memory is separate, but that's not the case with windows, //Taking advantage of shared memory to allow both threads to add labels. 
////////////////////////////////////////////////////////////////////////////////////////////////////// - + /* vector pDataArray; DWORD dwThreadIdArray[processors-1]; HANDLE hThreadArray[processors-1]; @@ -1031,7 +1073,7 @@ vector ClusterSplitCommand::createProcesses(vector< map CloseHandle(hThreadArray[i]); delete pDataArray[i]; } - +*/ #endif return listFiles; @@ -1101,16 +1143,25 @@ string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisN m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine(); - NameAssignment* nameMap = new NameAssignment(thisNamefile); - nameMap->readMap(); - - //reads phylip file storing data in 2D vector, also fills list and rabund + //reads phylip file storing data in 2D vector, also fills list and rabund bool sim = false; ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim); - cluster->readPhylipFile(thisDistFile, nameMap); - tag = cluster->getTag(); - if (m->control_pressed) { delete cluster; return 0; } + NameAssignment* nameMap = NULL; + CountTable* ct = NULL; + if(namefile != ""){ + nameMap = new NameAssignment(thisNamefile); + nameMap->readMap(); + cluster->readPhylipFile(thisDistFile, nameMap); + }else if (countfile != "") { + ct = new CountTable(); + ct->readTable(thisNamefile); + cluster->readPhylipFile(thisDistFile, ct); + } + tag = cluster->getTag(); + + if (m->control_pressed) { if(namefile != ""){ delete nameMap; } + else { delete ct; } delete cluster; return 0; } list = cluster->getListVector(); rabund = cluster->getRAbundVector(); @@ -1136,7 +1187,8 @@ string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisN m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine(); while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){ - if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close(); return listFileName; } + if (m->control_pressed) { delete cluster; delete list; 
delete rabund; listFile.close(); if(namefile != ""){ delete nameMap; } + else { delete ct; } return listFileName; } cluster->update(cutoff); @@ -1179,8 +1231,12 @@ string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisN listFile.close(); - delete cluster; delete nameMap; delete list; delete rabund; - + delete cluster; delete list; delete rabund; + if(namefile != ""){ delete nameMap; } + else { delete ct; } + + m->mothurRemove(thisDistFile); + m->mothurRemove(thisNamefile); return listFileName; @@ -1219,18 +1275,30 @@ string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile ReadMatrix* read = new ReadColumnMatrix(thisDistFile); read->setCutoff(cutoff); - NameAssignment* nameMap = new NameAssignment(thisNamefile); - nameMap->readMap(); - read->read(nameMap); - - if (m->control_pressed) { delete read; delete nameMap; return listFileName; } - - list = read->getListVector(); + NameAssignment* nameMap = NULL; + CountTable* ct = NULL; + if(namefile != ""){ + nameMap = new NameAssignment(thisNamefile); + nameMap->readMap(); + read->read(nameMap); + }else if (countfile != "") { + ct = new CountTable(); + ct->readTable(thisNamefile); + read->read(ct); + } + + list = read->getListVector(); oldList = *list; - matrix = read->getDMatrix(); + matrix = read->getDMatrix(); + if(countfile != "") { + rabund = new RAbundVector(); + createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list + delete ct; + }else { rabund = new RAbundVector(list->getRAbundVector()); } + delete read; read = NULL; - delete nameMap; nameMap = NULL; + if (namefile != "") { delete nameMap; nameMap = NULL; } #ifdef USE_MPI @@ -1242,8 +1310,6 @@ string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine(); - rabund = new RAbundVector(list->getRAbundVector()); - //create cluster if (method == "furthest") { cluster = new 
CompleteLinkage(rabund, list, matrix, cutoff, method); } else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); } @@ -1385,3 +1451,24 @@ int ClusterSplitCommand::createMergedDistanceFile(vector< map > } } //********************************************************************************************************************** +int ClusterSplitCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){ + try { + rabund->setLabel(list->getLabel()); + for(int i = 0; i < list->getNumBins(); i++) { + if (m->control_pressed) { break; } + vector binNames; + string bin = list->get(i); + m->splitAtComma(bin, binNames); + int total = 0; + for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]); } + rabund->push_back(total); + } + return 0; + } + catch(exception& e) { + m->errorOut(e, "ClusterCommand", "createRabund"); + exit(1); + } + +} +//********************************************************************************************************************** diff --git a/clustersplitcommand.h b/clustersplitcommand.h index 59039ea..936ae6f 100644 --- a/clustersplitcommand.h +++ b/clustersplitcommand.h @@ -47,7 +47,7 @@ private: vector processIDS; //processid vector outputNames; - string method, fileroot, tag, outputDir, phylipfile, columnfile, namefile, distfile, format, showabund, timing, splitmethod, taxFile, fastafile; + string method, fileroot, tag, outputDir, phylipfile, columnfile, namefile, countfile, distfile, format, showabund, timing, splitmethod, taxFile, fastafile; double cutoff, splitcutoff; int precision, length, processors, taxLevelCutoff; bool print_start, abort, hard, large, classic; @@ -62,6 +62,7 @@ private: int mergeLists(vector, map, ListVector*); map completeListFile(vector, string, set&, ListVector*&); int createMergedDistanceFile(vector< map >); + int createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund); }; /////////////////not working for 
Windows//////////////////////////////////////////////////////////// @@ -75,7 +76,7 @@ private: // anything to do with mothur's use of copy constructors in many of our data structures. ie. listvector // is copied by nameassignment and passed to read which passes to the thread? -westcott 2-8-12 //////////////////////////////////////////////////////////////////////////////////////////////////// -/**************************************************************************************************/ +/************************************************************************************************** //custom data structure for threads to use. // This is passed by void pointer so it can be any data type // that can be passed using a single void pointer (LPVOID). @@ -105,7 +106,7 @@ struct clusterData { } }; -/**************************************************************************************************/ +/************************************************************************************************** #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) #else static DWORD WINAPI MyClusterThreadFunction(LPVOID lpParam){ @@ -257,7 +258,7 @@ static DWORD WINAPI MyClusterThreadFunction(LPVOID lpParam){ } #endif - +*/ #endif diff --git a/commandfactory.cpp b/commandfactory.cpp index 02af676..6d87a68 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -134,6 +134,8 @@ #include "removeotulabelscommand.h" #include "makecontigscommand.h" #include "loadlogfilecommand.h" +#include "sffmultiplecommand.h" +#include "classifysharedcommand.h" /*******************************************************/ @@ -290,7 +292,10 @@ CommandFactory::CommandFactory(){ commands["make.contigs"] = "make.contigs"; commands["load.logfile"] = "load.logfile"; commands["make.table"] = "make.table"; + commands["sff.multiple"] = "sff.multiple"; commands["quit"] = "MPIEnabled"; + commands["classify.shared"] = "classify.shared"; + } 
/***********************************************************/ @@ -503,6 +508,8 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "remove.otulabels") { command = new RemoveOtuLabelsCommand(optionString); } else if(commandName == "make.contigs") { command = new MakeContigsCommand(optionString); } else if(commandName == "load.logfile") { command = new LoadLogfileCommand(optionString); } + else if(commandName == "sff.multiple") { command = new SffMultipleCommand(optionString); } + else if(commandName == "classify.shared") { command = new ClassifySharedCommand(optionString); } else { command = new NoCommand(optionString); } return command; @@ -657,6 +664,8 @@ Command* CommandFactory::getCommand(string commandName, string optionString, str else if(commandName == "remove.otulabels") { pipecommand = new RemoveOtuLabelsCommand(optionString); } else if(commandName == "make.contigs") { pipecommand = new MakeContigsCommand(optionString); } else if(commandName == "load.logfile") { pipecommand = new LoadLogfileCommand(optionString); } + else if(commandName == "sff.multiple") { pipecommand = new SffMultipleCommand(optionString); } + else if(commandName == "classify.shared") { pipecommand = new ClassifySharedCommand(optionString); } else { pipecommand = new NoCommand(optionString); } return pipecommand; @@ -797,6 +806,8 @@ Command* CommandFactory::getCommand(string commandName){ else if(commandName == "remove.otulabels") { shellcommand = new RemoveOtuLabelsCommand(); } else if(commandName == "make.contigs") { shellcommand = new MakeContigsCommand(); } else if(commandName == "load.logfile") { shellcommand = new LoadLogfileCommand(); } + else if(commandName == "sff.multiple") { shellcommand = new SffMultipleCommand(); } + else if(commandName == "classify.shared") { shellcommand = new ClassifySharedCommand(); } else { shellcommand = new NoCommand(); } return shellcommand; diff --git a/consensus.cpp b/consensus.cpp index 
1be052f..d45a395 100644 --- a/consensus.cpp +++ b/consensus.cpp @@ -21,7 +21,7 @@ Tree* Consensus::getTree(vector& t){ if (m->control_pressed) { return 0; } - consensusTree = new Tree(t[0]->getTreeMap()); + consensusTree = new Tree(t[0]->getCountTable()); it2 = nodePairs.find(treeSet); @@ -37,8 +37,7 @@ Tree* Consensus::getTree(vector& t){ if (m->control_pressed) { delete consensusTree; return 0; } - map empty; - consensusTree->assembleTree(empty); + consensusTree->assembleTree(); if (m->control_pressed) { delete consensusTree; return 0; } diff --git a/consensusseqscommand.cpp b/consensusseqscommand.cpp index d6158ba..94d6682 100644 --- a/consensusseqscommand.cpp +++ b/consensusseqscommand.cpp @@ -15,7 +15,8 @@ vector ConsensusSeqsCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount); CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plist); CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); CommandParameter pcutoff("cutoff", "Number", "", "100", "", "", "",false,false); parameters.push_back(pcutoff); @@ -36,7 +37,7 @@ string ConsensusSeqsCommand::getHelpString(){ try { string helpString = ""; helpString += "The consensus.seqs command can be used in 2 ways: create a consensus sequence from a fastafile, or with a listfile create a consensus sequence for each otu. 
Sequences must be aligned.\n"; - helpString += "The consensus.seqs command parameters are fasta, list, name, cutoff and label.\n"; + helpString += "The consensus.seqs command parameters are fasta, list, name, count, cutoff and label.\n"; helpString += "The fasta parameter allows you to enter the fasta file containing your sequences, and is required, unless you have a valid current fasta file. \n"; helpString += "The list parameter allows you to enter a your list file. \n"; helpString += "The name parameter allows you to enter a names file associated with the fasta file. \n"; @@ -65,6 +66,7 @@ string ConsensusSeqsCommand::getOutputFileNameTag(string type, string inputName= else { if (type == "fasta") { outputFileName = "cons.fasta"; } else if (type == "name") { outputFileName = "cons.names"; } + else if (type == "count") { outputFileName = "cons.count_table"; } else if (type == "summary") { outputFileName = "cons.summary"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } @@ -84,6 +86,7 @@ ConsensusSeqsCommand::ConsensusSeqsCommand(){ vector tempOutNames; outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; outputTypes["summary"] = tempOutNames; } catch(exception& e) { @@ -120,6 +123,7 @@ ConsensusSeqsCommand::ConsensusSeqsCommand(string option) { vector tempOutNames; outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; outputTypes["summary"] = tempOutNames; @@ -151,6 +155,14 @@ ConsensusSeqsCommand::ConsensusSeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["list"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. 
else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -168,6 +180,13 @@ ConsensusSeqsCommand::ConsensusSeqsCommand(string option) { else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; countfile = ""; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((countfile != "") && (namefile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } + listfile = validParameter.validFile(parameters, "list", true); if (listfile == "not open") { abort = true; } else if (listfile == "not found") { listfile = ""; } @@ -186,10 +205,12 @@ ConsensusSeqsCommand::ConsensusSeqsCommand(string option) { //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(fastafile); } - if (namefile == ""){ - vector files; files.push_back(fastafile); - parser.getNameFile(files); - } + if (countfile == "") { + if (namefile == ""){ + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } } catch(exception& e) { @@ -209,6 +230,7 @@ int ConsensusSeqsCommand::execute(){ if (m->control_pressed) { return 0; } if (namefile != "") { readNames(); } + if (countfile != "") { ct.readTable(countfile); } if (m->control_pressed) { return 0; } @@ -227,25 +249,12 @@ int ConsensusSeqsCommand::execute(){ string outputFastaFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("fasta"); m->openOutputFile(outputFastaFile, outFasta); outputNames.push_back(outputFastaFile); outputTypes["fasta"].push_back(outputFastaFile); - - vector seqs; - int seqLength = 0; - for (map::iterator it 
= nameMap.begin(); it != nameMap.end(); it++) { - - if (m->control_pressed) { outSummary.close(); outFasta.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } - - string seq = fastaMap[it->second]; - seqs.push_back(seq); - - if (seqLength == 0) { seqLength = seq.length(); } - else if (seqLength != seq.length()) { m->mothurOut("[ERROR]: sequence are not the same length, please correct."); m->mothurOutEndLine(); m->control_pressed = true; } - - } - + vector< vector > percentages; percentages.resize(5); for (int j = 0; j < percentages.size(); j++) { percentages[j].resize(seqLength, 0.0); } string consSeq = ""; + int thisCount; //get counts for (int j = 0; j < seqLength; j++) { @@ -253,41 +262,55 @@ int ConsensusSeqsCommand::execute(){ vector counts; counts.resize(5, 0); //A,T,G,C,Gap int numDots = 0; - - for (int i = 0; i < seqs.size(); i++) { + thisCount = 0; + for (map::iterator it = fastaMap.begin(); it != fastaMap.end(); it++) { - if (seqs[i][j] == '.') { numDots++; } - - char base = toupper(seqs[i][j]); - if (base == 'A') { counts[0]++; } - else if (base == 'T') { counts[1]++; } - else if (base == 'G') { counts[2]++; } - else if (base == 'C') { counts[3]++; } - else { counts[4]++; } + string thisSeq = it->second; + int size = 0; + + if (countfile != "") { size = ct.getNumSeqs(it->first); } + else { + map::iterator itCount = nameFileMap.find(it->first); + if (itCount != nameFileMap.end()) { + size = itCount->second; + }else { m->mothurOut("[ERROR]: file mismatch, aborting.\n"); m->control_pressed = true; break; } + } + + for (int k = 0; k < size; k++) { + if (thisSeq[j] == '.') { numDots++; } + + char base = toupper(thisSeq[j]); + if (base == 'A') { counts[0]++; } + else if (base == 'T') { counts[1]++; } + else if (base == 'G') { counts[2]++; } + else if (base == 'C') { counts[3]++; } + else { counts[4]++; } + thisCount++; + } } char conBase = '.'; - if (numDots != seqs.size()) { conBase = getBase(counts, 
seqs.size()); } + if (numDots != thisCount) { conBase = getBase(counts, thisCount); } consSeq += conBase; - percentages[0][j] = counts[0] / (float) seqs.size(); - percentages[1][j] = counts[1] / (float) seqs.size(); - percentages[2][j] = counts[2] / (float) seqs.size(); - percentages[3][j] = counts[3] / (float) seqs.size(); - percentages[4][j] = counts[4] / (float) seqs.size(); - + percentages[0][j] = counts[0] / (float) thisCount; + percentages[1][j] = counts[1] / (float) thisCount; + percentages[2][j] = counts[2] / (float) thisCount; + percentages[3][j] = counts[3] / (float) thisCount; + percentages[4][j] = counts[4] / (float) thisCount; } for (int j = 0; j < seqLength; j++) { - outSummary << (j+1) << '\t' << percentages[0][j] << '\t'<< percentages[1][j] << '\t'<< percentages[2][j] << '\t' << percentages[3][j] << '\t' << percentages[4][j] << '\t' << seqs.size() << '\t' << consSeq[j] << endl; + outSummary << (j+1) << '\t' << percentages[0][j] << '\t'<< percentages[1][j] << '\t'<< percentages[2][j] << '\t' << percentages[3][j] << '\t' << percentages[4][j] << '\t' << thisCount << '\t' << consSeq[j] << endl; } outFasta << ">conseq" << endl << consSeq << endl; outSummary.close(); outFasta.close(); - + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } }else { @@ -414,12 +437,10 @@ int ConsensusSeqsCommand::processList(ListVector*& list){ if (m->control_pressed) { outSummary.close(); outName.close(); outFasta.close(); return 0; } string bin = list->get(i); - - string newName = ""; - string consSeq = getConsSeq(bin, outSummary, newName, i); + string consSeq = getConsSeq(bin, outSummary, i); outFasta << ">seq" << (i+1) << endl << consSeq << endl; - outName << "seq" << (i+1) << '\t' << "seq" << (i+1) << "," << newName << endl; + outName << "seq" << (i+1) << '\t' << "seq" << (i+1) << "," << bin << endl; } outSummary.close(); outName.close(); outFasta.close(); @@ -434,96 +455,127 @@ int 
ConsensusSeqsCommand::processList(ListVector*& list){ } //*************************************************************************************************************** -//made this smart enough to owrk with unique or non unique list file -string ConsensusSeqsCommand::getConsSeq(string bin, ofstream& outSummary, string& name, int binNumber){ +string ConsensusSeqsCommand::getConsSeq(string bin, ofstream& outSummary, int binNumber){ try{ string consSeq = ""; bool error = false; - - //the whole bin is the second column if no names file, otherwise build it - name = bin; - if (namefile != "") { name = ""; } - + int totalSize=0; + vector binNames; m->splitAtComma(bin, binNames); - - //get sequence strings for each name in the bin - vector seqs; - - set addedAlready; - int seqLength = 0; - for (int i = 0; i < binNames.size(); i++) { - - map::iterator it; - - it = nameMap.find(binNames[i]); - if (it == nameMap.end()) { - if (namefile == "") { m->mothurOut("[ERROR]: " + binNames[i] + " is not in your fasta file, please correct."); m->mothurOutEndLine(); error = true; } - else { m->mothurOut("[ERROR]: " + binNames[i] + " is not in your fasta or name file, please correct."); m->mothurOutEndLine(); error = true; } - break; - }else { - - //add sequence string to seqs vector to process below - string seq = fastaMap[it->second]; - seqs.push_back(seq); - - if (seqLength == 0) { seqLength = seq.length(); } - else if (seqLength != seq.length()) { m->mothurOut("[ERROR]: sequence are not the same length, please correct."); m->mothurOutEndLine(); error = true; break; } - - if (namefile != "") { - //did we add this line from name file already? 
- if (addedAlready.count(it->second) == 0) { - name += "," + nameFileMap[it->second]; - addedAlready.insert(it->second); - } - } - - } - } - - if (error) { m->control_pressed = true; return consSeq; } - - if (namefile != "") { name = name.substr(1); } - - vector< vector > percentages; percentages.resize(5); + + vector< vector > percentages; percentages.resize(5); for (int j = 0; j < percentages.size(); j++) { percentages[j].resize(seqLength, 0.0); } - - //get counts - for (int j = 0; j < seqLength; j++) { - - if (m->control_pressed) { return consSeq; } - - vector counts; counts.resize(5, 0); //A,T,G,C,Gap - int numDots = 0; - - for (int i = 0; i < seqs.size(); i++) { - - if (seqs[i][j] == '.') { numDots++; } - - char base = toupper(seqs[i][j]); - if (base == 'A') { counts[0]++; } - else if (base == 'T') { counts[1]++; } - else if (base == 'G') { counts[2]++; } - else if (base == 'C') { counts[3]++; } - else { counts[4]++; } - } - - char conBase = '.'; - if (numDots != seqs.size()) { conBase = getBase(counts, seqs.size()); } - - consSeq += conBase; - - percentages[0][j] = counts[0] / (float) seqs.size(); - percentages[1][j] = counts[1] / (float) seqs.size(); - percentages[2][j] = counts[2] / (float) seqs.size(); - percentages[3][j] = counts[3] / (float) seqs.size(); - percentages[4][j] = counts[4] / (float) seqs.size(); - + + if (countfile != "") { + //get counts + for (int j = 0; j < seqLength; j++) { + + if (m->control_pressed) { return consSeq; } + + vector counts; counts.resize(5, 0); //A,T,G,C,Gap + int numDots = 0; + totalSize = 0; + for (int i = 0; i < binNames.size(); i++) { + if (m->control_pressed) { return consSeq; } + + string thisSeq = ""; + map::iterator itFasta = fastaMap.find(binNames[i]); + if (itFasta != fastaMap.end()) { + thisSeq = itFasta->second; + }else { m->mothurOut("[ERROR]: " + binNames[i] + " is not in your fasta file, please correct."); m->mothurOutEndLine(); m->control_pressed = true; } + + int size = ct.getNumSeqs(binNames[i]); + if 
(size != 0) { + for (int k = 0; k < size; k++) { + if (thisSeq[j] == '.') { numDots++; } + + char base = toupper(thisSeq[j]); + if (base == 'A') { counts[0]++; } + else if (base == 'T') { counts[1]++; } + else if (base == 'G') { counts[2]++; } + else if (base == 'C') { counts[3]++; } + else { counts[4]++; } + totalSize++; + } + }else { m->mothurOut("[ERROR]: " + binNames[i] + " is not in your count file, please correct."); m->mothurOutEndLine(); m->control_pressed = true; } + } + char conBase = '.'; + if (numDots != totalSize) { conBase = getBase(counts, totalSize); } + + consSeq += conBase; + + percentages[0][j] = counts[0] / (float) totalSize; + percentages[1][j] = counts[1] / (float) totalSize; + percentages[2][j] = counts[2] / (float) totalSize; + percentages[3][j] = counts[3] / (float) totalSize; + percentages[4][j] = counts[4] / (float) totalSize; + } + + }else { + + //get sequence strings for each name in the bin + vector seqs; + for (int i = 0; i < binNames.size(); i++) { + + map::iterator it; + it = nameMap.find(binNames[i]); + if (it == nameMap.end()) { + if (namefile == "") { m->mothurOut("[ERROR]: " + binNames[i] + " is not in your fasta file, please correct."); m->mothurOutEndLine(); error = true; } + else { m->mothurOut("[ERROR]: " + binNames[i] + " is not in your fasta or name file, please correct."); m->mothurOutEndLine(); error = true; } + break; + }else { + //add sequence string to seqs vector to process below + map::iterator itFasta = fastaMap.find(it->second); + + if (itFasta != fastaMap.end()) { + string seq = itFasta->second; + seqs.push_back(seq); + }else { m->mothurOut("[ERROR]: file mismatch, aborting. 
\n"); } + } + } + + if (error) { m->control_pressed = true; return consSeq; } + totalSize = seqs.size(); + //get counts + for (int j = 0; j < seqLength; j++) { + + if (m->control_pressed) { return consSeq; } + + vector counts; counts.resize(5, 0); //A,T,G,C,Gap + int numDots = 0; + + for (int i = 0; i < seqs.size(); i++) { + + if (seqs[i][j] == '.') { numDots++; } + + char base = toupper(seqs[i][j]); + if (base == 'A') { counts[0]++; } + else if (base == 'T') { counts[1]++; } + else if (base == 'G') { counts[2]++; } + else if (base == 'C') { counts[3]++; } + else { counts[4]++; } + } + + char conBase = '.'; + if (numDots != seqs.size()) { conBase = getBase(counts, seqs.size()); } + + consSeq += conBase; + + percentages[0][j] = counts[0] / (float) seqs.size(); + percentages[1][j] = counts[1] / (float) seqs.size(); + percentages[2][j] = counts[2] / (float) seqs.size(); + percentages[3][j] = counts[3] / (float) seqs.size(); + percentages[4][j] = counts[4] / (float) seqs.size(); + + } } - + + + for (int j = 0; j < seqLength; j++) { - outSummary << (binNumber + 1) << '\t' << (j+1) << '\t' << percentages[0][j] << '\t'<< percentages[1][j] << '\t'<< percentages[2][j] << '\t' << percentages[3][j] << '\t' << percentages[4][j] << '\t' << seqs.size() << '\t' << consSeq[j] << endl; + outSummary << (binNumber + 1) << '\t' << (j+1) << '\t' << percentages[0][j] << '\t'<< percentages[1][j] << '\t'<< percentages[2][j] << '\t' << percentages[3][j] << '\t' << percentages[4][j] << '\t' << totalSize << '\t' << consSeq[j] << endl; } return consSeq; @@ -646,7 +698,8 @@ int ConsensusSeqsCommand::readFasta(){ ifstream in; m->openInputFile(fastafile, in); - + seqLength = 0; + while (!in.eof()) { if (m->control_pressed) { break; } @@ -657,7 +710,10 @@ int ConsensusSeqsCommand::readFasta(){ if (name != "") { fastaMap[name] = seq.getAligned(); nameMap[name] = name; //set nameMap incase no names file - nameFileMap[name] = name; + nameFileMap[name] = 1; + + if (seqLength == 0) { seqLength = 
seq.getAligned().length(); } + else if (seqLength != seq.getAligned().length()) { m->mothurOut("[ERROR]: sequence are not the same length, please correct."); m->mothurOutEndLine(); m->control_pressed = true; break; } } } @@ -688,7 +744,7 @@ int ConsensusSeqsCommand::readNames(){ it = nameMap.find(thisname); if (it != nameMap.end()) { //then this sequence was in the fastafile - nameFileMap[thisname] = repnames; //for later when outputting the new namesFile if the list file is unique + nameFileMap[thisname] = m->getNumNames(repnames); //for later when outputting the new namesFile if the list file is unique vector splitRepNames; m->splitAtComma(repnames, splitRepNames); diff --git a/consensusseqscommand.h b/consensusseqscommand.h index 1459b43..e0c9715 100644 --- a/consensusseqscommand.h +++ b/consensusseqscommand.h @@ -13,6 +13,7 @@ #include "command.hpp" #include "listvector.hpp" +#include "counttable.h" class ConsensusSeqsCommand : public Command { public: @@ -34,19 +35,20 @@ public: private: + CountTable ct; bool abort, allLines; - string fastafile, listfile, namefile, label, outputDir; + string fastafile, listfile, namefile, countfile, label, outputDir; set labels; vector outputNames; map fastaMap; map nameMap; - map nameFileMap; - int cutoff; + map nameFileMap; + int cutoff, seqLength; int readFasta(); int readNames(); int processList(ListVector*&); - string getConsSeq(string, ofstream&, string&, int); + string getConsSeq(string, ofstream&, int); char getBase(vector, int); }; diff --git a/countgroupscommand.cpp b/countgroupscommand.cpp index ccf8988..716dc90 100644 --- a/countgroupscommand.cpp +++ b/countgroupscommand.cpp @@ -16,6 +16,7 @@ vector CountGroupsCommand::setParameters(){ try { CommandParameter pshared("shared", "InputTypes", "", "", "sharedGroup", "sharedGroup", "none",false,false); parameters.push_back(pshared); CommandParameter pgroup("group", "InputTypes", "", "", "sharedGroup", "sharedGroup", "none",false,false); parameters.push_back(pgroup); + 
CommandParameter pcount("count", "InputTypes", "", "", "sharedGroup", "sharedGroup", "none",false,false); parameters.push_back(pcount); CommandParameter paccnos("accnos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(paccnos); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); @@ -34,7 +35,7 @@ vector CountGroupsCommand::setParameters(){ string CountGroupsCommand::getHelpString(){ try { string helpString = ""; - helpString += "The count.groups command counts sequences from a specific group or set of groups from the following file types: group or shared file.\n"; + helpString += "The count.groups command counts sequences from a specific group or set of groups from the following file types: group, count or shared file.\n"; helpString += "The count.groups command parameters are accnos, group, shared and groups. You must provide a group or shared file.\n"; helpString += "The accnos parameter allows you to provide a file containing the list of groups.\n"; helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like. You can separate group names with dashes.\n"; @@ -114,6 +115,14 @@ CountGroupsCommand::CountGroupsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["shared"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -138,9 +147,23 @@ CountGroupsCommand::CountGroupsCommand(string option) { groupfile = validParameter.validFile(parameters, "group", true); if (groupfile == "not open") { groupfile = ""; abort = true; } else if (groupfile == "not found") { groupfile = ""; } - else { m->setGroupFile(groupfile); } + else { m->setGroupFile(groupfile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { + m->setCountTableFile(countfile); + CountTable ct; + if (!ct.testGroups(countfile)) { m->mothurOut("[ERROR]: Your count file does not have any group information, aborting."); m->mothurOutEndLine(); abort=true; } + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + - if ((sharedfile == "") && (groupfile == "")) { + if ((sharedfile == "") && (groupfile == "") && (countfile == "")) { //give priority to shared, then group sharedfile = m->getSharedFile(); if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); } @@ -148,7 +171,11 @@ CountGroupsCommand::CountGroupsCommand(string option) { groupfile = m->getGroupFile(); if (groupfile != "") { m->mothurOut("Using " + groupfile + " as input file for the group parameter."); m->mothurOutEndLine(); } else { - m->mothurOut("You have no current groupfile or sharedfile and one is required."); m->mothurOutEndLine(); abort = true; + countfile = m->getCountTableFile(); + if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current groupfile, countfile or sharedfile and one is required."); m->mothurOutEndLine(); abort = true; + } 
} } } @@ -182,9 +209,36 @@ int CountGroupsCommand::execute(){ vector nameGroups = groupMap.getNamesOfGroups(); util.setGroups(Groups, nameGroups); + int total = 0; + for (int i = 0; i < Groups.size(); i++) { + int num = groupMap.getNumSeqs(Groups[i]); + total += num; + m->mothurOut(Groups[i] + " contains " + toString(num) + "."); m->mothurOutEndLine(); + } + + m->mothurOut("\nTotal seqs: " + toString(total) + "."); m->mothurOutEndLine(); + } + + if (m->control_pressed) { return 0; } + + if (countfile != "") { + CountTable ct; + ct.readTable(countfile); + + //make sure groups are valid + //takes care of user setting groupNames that are invalid or setting groups=all + SharedUtil util; + vector nameGroups = ct.getNamesOfGroups(); + util.setGroups(Groups, nameGroups); + + int total = 0; for (int i = 0; i < Groups.size(); i++) { - m->mothurOut(Groups[i] + " contains " + toString(groupMap.getNumSeqs(Groups[i])) + "."); m->mothurOutEndLine(); + int num = ct.getGroupCount(Groups[i]); + total += num; + m->mothurOut(Groups[i] + " contains " + toString(num) + "."); m->mothurOutEndLine(); } + + m->mothurOut("\nTotal seqs: " + toString(total) + "."); m->mothurOutEndLine(); } if (m->control_pressed) { return 0; } @@ -193,10 +247,15 @@ int CountGroupsCommand::execute(){ InputData input(sharedfile, "sharedfile"); vector lookup = input.getSharedRAbundVectors(); + int total = 0; for (int i = 0; i < lookup.size(); i++) { - m->mothurOut(lookup[i]->getGroup() + " contains " + toString(lookup[i]->getNumSeqs()) + "."); m->mothurOutEndLine(); + int num = lookup[i]->getNumSeqs(); + total += num; + m->mothurOut(lookup[i]->getGroup() + " contains " + toString(num) + "."); m->mothurOutEndLine(); delete lookup[i]; - } + } + + m->mothurOut("\nTotal seqs: " + toString(total) + "."); m->mothurOutEndLine(); } return 0; diff --git a/countgroupscommand.h b/countgroupscommand.h index dd0e0a2..d27a7f8 100644 --- a/countgroupscommand.h +++ b/countgroupscommand.h @@ -33,7 +33,7 @@ public: private: - 
string sharedfile, groupfile, outputDir, groups, accnosfile; + string sharedfile, groupfile, countfile, outputDir, groups, accnosfile; bool abort; vector Groups; }; diff --git a/countseqscommand.cpp b/countseqscommand.cpp index 210dd96..fa6fd4f 100644 --- a/countseqscommand.cpp +++ b/countseqscommand.cpp @@ -10,6 +10,7 @@ #include "countseqscommand.h" #include "groupmap.h" #include "sharedutilities.h" +#include "counttable.h" //********************************************************************************************************************** vector CountSeqsCommand::setParameters(){ @@ -34,7 +35,7 @@ vector CountSeqsCommand::setParameters(){ string CountSeqsCommand::getHelpString(){ try { string helpString = ""; - helpString += "The count.seqs aka. make.table command reads a name file and outputs a .count.table file. You may also provide a group file to get the counts broken down by group.\n"; + helpString += "The count.seqs aka. make.table command reads a name file and outputs a .count_table file. 
You may also provide a group file to get the counts broken down by group.\n"; helpString += "The groups parameter allows you to indicate which groups you want to include in the counts, by default all groups in your groupfile are used.\n"; helpString += "The large parameter indicates the name and group files are too large to fit in RAM.\n"; helpString += "When you use the groups parameter and a sequence does not represent any sequences from the groups you specify it is not included in the .count.summary file.\n"; @@ -58,7 +59,7 @@ string CountSeqsCommand::getOutputFileNameTag(string type, string inputName=""){ it = outputTypes.find(type); if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); } else { - if (type == "counttable") { outputFileName = "count.table"; } + if (type == "counttable") { outputFileName = "count_table"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } return outputFileName; @@ -175,7 +176,7 @@ int CountSeqsCommand::execute(){ int total = 0; if (!large) { total = processSmall(outputFileName); } else { total = processLarge(outputFileName); } - + if (m->control_pressed) { m->mothurRemove(outputFileName); return 0; } //set rabund file as new current rabundfile @@ -450,6 +451,26 @@ map CountSeqsCommand::processNameFile(string name) { in.close(); out.close(); + if (rest != "") { + vector pieces = m->splitWhiteSpace(rest); + + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { firstCol = pieces[i]; columnOne=false; } + else { secondCol = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { + //parse names into vector + vector theseNames; + m->splitAtComma(secondCol, theseNames); + for (int i = 0; i < theseNames.size(); i++) { out << theseNames[i] << '\t' << count << endl; } + indexToNames[count] = firstCol; + pairDone = false; + count++; + } + } + + } + return indexToNames; } catch(exception& e) { 
@@ -502,6 +523,26 @@ map CountSeqsCommand::getGroupNames(string filename, set& n } in.close(); out.close(); + + if (rest != "") { + vector pieces = m->splitWhiteSpace(rest); + + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { firstCol = pieces[i]; columnOne=false; } + else { secondCol = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { + it = groupIndex.find(secondCol); + if (it == groupIndex.end()) { //add group, assigning the group and number so we can use vectors above + groupIndex[secondCol] = count; + count++; + } + out << firstCol << '\t' << groupIndex[secondCol] << endl; + namesOfGroups.insert(secondCol); + pairDone = false; + } + } + } for (it = groupIndex.begin(); it != groupIndex.end(); it++) { indexToGroups[it->second] = it->first; } diff --git a/counttable.cpp b/counttable.cpp index a664228..2ab0e34 100644 --- a/counttable.cpp +++ b/counttable.cpp @@ -8,7 +8,199 @@ #include "counttable.h" +/************************************************************/ +int CountTable::createTable(set& n, map& g, set& gs) { + try { + int numGroups = 0; + groups.clear(); + totalGroups.clear(); + indexGroupMap.clear(); + indexNameMap.clear(); + counts.clear(); + for (set::iterator it = gs.begin(); it != gs.end(); it++) { groups.push_back(*it); hasGroups = true; } + numGroups = groups.size(); + totalGroups.resize(numGroups, 0); + + //sort groups to keep consistent with how we store the groups in groupmap + sort(groups.begin(), groups.end()); + for (int i = 0; i < groups.size(); i++) { indexGroupMap[groups[i]] = i; } + m->setAllGroups(groups); + + uniques = 0; + total = 0; + for (set::iterator it = n.begin(); it != n.end(); it++) { + + if (m->control_pressed) { break; } + + string seqName = *it; + + vector groupCounts; groupCounts.resize(numGroups, 0); + map::iterator itGroup = g.find(seqName); + + if (itGroup != g.end()) { + groupCounts[indexGroupMap[itGroup->second]] = 1; + totalGroups[indexGroupMap[itGroup->second]]++; + }else { 
m->mothurOut("[ERROR]: Your group file does not contain " + seqName + ". Please correct."); m->mothurOutEndLine(); } + + map::iterator it2 = indexNameMap.find(seqName); + if (it2 == indexNameMap.end()) { + if (hasGroups) { counts.push_back(groupCounts); } + indexNameMap[seqName] = uniques; + totals.push_back(1); + total++; + uniques++; + } + } + + if (hasGroups) { + for (int i = 0; i < totalGroups.size(); i++) { + if (totalGroups[i] == 0) { m->mothurOut("\nRemoving group: " + groups[i] + " because all sequences have been removed.\n"); removeGroup(groups[i]); i--; } + } + } + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "createTable"); + exit(1); + } +} +/************************************************************/ +bool CountTable::testGroups(string file) { + try { + m = MothurOut::getInstance(); hasGroups = false; total = 0; + ifstream in; + m->openInputFile(file, in); + + string headers = m->getline(in); m->gobble(in); + vector columnHeaders = m->splitWhiteSpace(headers); + if (columnHeaders.size() > 2) { hasGroups = true; } + return hasGroups; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "readTable"); + exit(1); + } +} +/************************************************************/ +int CountTable::createTable(string namefile, string groupfile, bool createGroup) { + try { + + if (namefile == "") { m->mothurOut("[ERROR]: namefile cannot be blank when creating a count table.\n"); m->control_pressed = true; } + + GroupMap* groupMap; + int numGroups = 0; + groups.clear(); + totalGroups.clear(); + indexGroupMap.clear(); + indexNameMap.clear(); + counts.clear(); + map originalGroupIndexes; + + if (groupfile != "") { + hasGroups = true; + groupMap = new GroupMap(groupfile); groupMap->readMap(); + numGroups = groupMap->getNumGroups(); + groups = groupMap->getNamesOfGroups(); + totalGroups.resize(numGroups, 0); + }else if(createGroup) { + hasGroups = true; + numGroups = 1; + groups.push_back("Group1"); + 
totalGroups.resize(numGroups, 0); + } + //sort groups to keep consistent with how we store the groups in groupmap + sort(groups.begin(), groups.end()); + for (int i = 0; i < groups.size(); i++) { indexGroupMap[groups[i]] = i; } + m->setAllGroups(groups); + + bool error = false; + string name; + uniques = 0; + total = 0; + + + //open input file + ifstream in; + m->openInputFile(namefile, in); + + int total = 0; + while (!in.eof()) { + if (m->control_pressed) { break; } + + string firstCol, secondCol; + in >> firstCol; m->gobble(in); in >> secondCol; m->gobble(in); + + vector names; + m->splitAtChar(secondCol, names, ','); + + map groupCounts; + int thisTotal = 0; + if (groupfile != "") { + //set to 0 + for (int i = 0; i < groups.size(); i++) { groupCounts[groups[i]] = 0; } + + //get counts for each of the users groups + for (int i = 0; i < names.size(); i++) { + string group = groupMap->getGroup(names[i]); + + if (group == "not found") { m->mothurOut("[ERROR]: " + names[i] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); error=true; } + else { + map::iterator it = groupCounts.find(group); + + //if not found, then this sequence is not from a group we care about + if (it != groupCounts.end()) { + it->second++; + thisTotal++; + } + } + } + }else if (createGroup) { + groupCounts["Group1"]=0; + for (int i = 0; i < names.size(); i++) { + string group = "Group1"; + groupCounts["Group1"]++; thisTotal++; + } + }else { thisTotal = names.size(); } + + //if group info, then read it + vector thisGroupsCount; thisGroupsCount.resize(numGroups, 0); + for (int i = 0; i < numGroups; i++) { + thisGroupsCount[i] = groupCounts[groups[i]]; + totalGroups[i] += thisGroupsCount[i]; + } + + map::iterator it = indexNameMap.find(firstCol); + if (it == indexNameMap.end()) { + if (hasGroups) { counts.push_back(thisGroupsCount); } + indexNameMap[firstCol] = uniques; + totals.push_back(thisTotal); + total += thisTotal; + uniques++; + }else { + error = true; + 
m->mothurOut("[ERROR]: Your count table contains more than 1 sequence named " + firstCol + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); + } + } + in.close(); + + if (error) { m->control_pressed = true; } + else { //check for zero groups + if (hasGroups) { + for (int i = 0; i < totalGroups.size(); i++) { + if (totalGroups[i] == 0) { m->mothurOut("\nRemoving group: " + groups[i] + " because all sequences have been removed.\n"); removeGroup(groups[i]); i--; } + } + } + } + if (groupfile != "") { delete groupMap; } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "createTable"); + exit(1); + } +} /************************************************************/ int CountTable::readTable(string file) { try { @@ -64,6 +256,13 @@ int CountTable::readTable(string file) { in.close(); if (error) { m->control_pressed = true; } + else { //check for zero groups + if (hasGroups) { + for (int i = 0; i < totalGroups.size(); i++) { + if (totalGroups[i] == 0) { m->mothurOut("\nRemoving group: " + groups[i] + " because all sequences have been removed.\n"); removeGroup(groups[i]); i--; } + } + } + } return 0; } @@ -73,6 +272,68 @@ int CountTable::readTable(string file) { } } /************************************************************/ +int CountTable::printTable(string file) { + try { + ofstream out; + m->openOutputFile(file, out); + out << "Representative_Sequence\ttotal\t"; + for (int i = 0; i < groups.size(); i++) { out << groups[i] << '\t'; } + out << endl; + + for (map::iterator itNames = indexNameMap.begin(); itNames != indexNameMap.end(); itNames++) { + out << itNames->first << '\t' << totals[itNames->second] << '\t'; + if (hasGroups) { + + for (int i = 0; i < groups.size(); i++) { + out << counts[itNames->second][i] << '\t'; + } + } + out << endl; + } + out.close(); + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "printTable"); + exit(1); + } +} 
+/************************************************************/ +int CountTable::printHeaders(ofstream& out) { + try { + out << "Representative_Sequence\ttotal\t"; + for (int i = 0; i < groups.size(); i++) { out << groups[i] << '\t'; } + out << endl; + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "printHeaders"); + exit(1); + } +} +/************************************************************/ +int CountTable::printSeq(ofstream& out, string seqName) { + try { + map::iterator it = indexNameMap.find(seqName); + if (it == indexNameMap.end()) { + m->mothurOut("[ERROR]: " + seqName + " is not in your count table. Please correct.\n"); m->control_pressed = true; + }else { + out << it->first << '\t' << totals[it->second] << '\t'; + if (hasGroups) { + for (int i = 0; i < groups.size(); i++) { + out << counts[it->second][i] << '\t'; + } + } + out << endl; + } + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "printSeq"); + exit(1); + } +} +/************************************************************/ //group counts for a seq vector CountTable::getGroupCounts(string seqName) { try { @@ -138,6 +399,179 @@ int CountTable::getGroupCount(string seqName, string groupName) { exit(1); } } +/************************************************************/ +//set the number of sequences for the seq for the group +int CountTable::setAbund(string seqName, string groupName, int num) { + try { + if (hasGroups) { + map::iterator it = indexGroupMap.find(groupName); + if (it == indexGroupMap.end()) { + m->mothurOut("[ERROR]: " + groupName + " is not in your count table. Please correct.\n"); m->control_pressed = true; + }else { + map::iterator it2 = indexNameMap.find(seqName); + if (it2 == indexNameMap.end()) { + m->mothurOut("[ERROR]: " + seqName + " is not in your count table. 
Please correct.\n"); m->control_pressed = true; + }else { + int oldCount = counts[it2->second][it->second]; + counts[it2->second][it->second] = num; + totalGroups[it->second] += (num - oldCount); + total += (num - oldCount); + totals[it2->second] += (num - oldCount); + } + } + }else{ m->mothurOut("[ERROR]: Your count table does not have group info. Please correct.\n"); m->control_pressed = true; } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "set"); + exit(1); + } +} +/************************************************************/ +//add group +int CountTable::addGroup(string groupName) { + try { + bool sanity = m->inUsersGroups(groupName, groups); + if (sanity) { m->mothurOut("[ERROR]: " + groupName + " is already in the count table, cannot add again.\n"); m->control_pressed = true; return 0; } + + groups.push_back(groupName); + if (!hasGroups) { counts.resize(uniques); } + + for (int i = 0; i < counts.size(); i++) { counts[i].push_back(0); } + totalGroups.push_back(0); + indexGroupMap[groupName] = groups.size()-1; + map originalGroupMap = indexGroupMap; + + //important to play well with others, :) + sort(groups.begin(), groups.end()); + + //fix indexGroupMap && totalGroups + vector newTotals; newTotals.resize(groups.size(), 0); + for (int i = 0; i < groups.size(); i++) { + indexGroupMap[groups[i]] = i; + //find original spot of group[i] + int index = originalGroupMap[groups[i]]; + newTotals[i] = totalGroups[index]; + } + totalGroups = newTotals; + + //fix counts vectors + for (int i = 0; i < counts.size(); i++) { + vector newCounts; newCounts.resize(groups.size(), 0); + for (int j = 0; j < groups.size(); j++) { + //find original spot of group[i] + int index = originalGroupMap[groups[j]]; + newCounts[j] = counts[i][index]; + } + counts[i] = newCounts; + } + hasGroups = true; + m->setAllGroups(groups); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "addGroup"); + exit(1); + } +} 
+/************************************************************/ +//remove group +int CountTable::removeGroup(string groupName) { + try { + if (hasGroups) { + //save for later in case removing a group means we need to remove a seq. + map reverse; + for (map::iterator it = indexNameMap.begin(); it !=indexNameMap.end(); it++) { reverse[it->second] = it->first; } + + map::iterator it = indexGroupMap.find(groupName); + if (it == indexGroupMap.end()) { + m->mothurOut("[ERROR]: " + groupName + " is not in your count table. Please correct.\n"); m->control_pressed = true; + }else { + int indexOfGroupToRemove = it->second; + map currentGroupIndex = indexGroupMap; + vector newGroups; + for (int i = 0; i < groups.size(); i++) { + if (groups[i] != groupName) { + newGroups.push_back(groups[i]); + indexGroupMap[groups[i]] = newGroups.size()-1; + } + } + indexGroupMap.erase(groupName); + groups = newGroups; + totalGroups.erase(totalGroups.begin()+indexOfGroupToRemove); + + int thisIndex = 0; + map newIndexNameMap; + for (int i = 0; i < counts.size(); i++) { + int num = counts[i][indexOfGroupToRemove]; + counts[i].erase(counts[i].begin()+indexOfGroupToRemove); + totals[i] -= num; + total -= num; + if (totals[i] == 0) { //your sequences are only from the group we want to remove, then remove you. 
+ counts.erase(counts.begin()+i); + totals.erase(totals.begin()+i); + uniques--; + i--; + } + newIndexNameMap[reverse[thisIndex]] = i; + thisIndex++; + } + indexNameMap = newIndexNameMap; + + if (groups.size() == 0) { hasGroups = false; } + } + }else { m->mothurOut("[ERROR]: your count table does not contain group information, can not remove group " + groupName + ".\n"); m->control_pressed = true; } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "removeGroup"); + exit(1); + } +} +/************************************************************/ +//vector of groups for the seq +vector CountTable::getGroups(string seqName) { + try { + vector thisGroups; + if (hasGroups) { + vector thisCounts = getGroupCounts(seqName); + for (int i = 0; i < thisCounts.size(); i++) { + if (thisCounts[i] != 0) { thisGroups.push_back(groups[i]); } + } + }else{ m->mothurOut("[ERROR]: Your count table does not have group info. Please correct.\n"); m->control_pressed = true; } + + return thisGroups; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "getGroups"); + exit(1); + } +} +/************************************************************/ +//total number of seqs represented by seq +int CountTable::renameSeq(string oldSeqName, string newSeqName) { + try { + + map::iterator it = indexNameMap.find(oldSeqName); + if (it == indexNameMap.end()) { + m->mothurOut("[ERROR]: " + oldSeqName + " is not in your count table. 
Please correct.\n"); m->control_pressed = true; + }else { + int index = it->second; + indexNameMap.erase(it); + indexNameMap[newSeqName] = index; + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "renameSeq"); + exit(1); + } +} + /************************************************************/ //total number of seqs represented by seq int CountTable::getNumSeqs(string seqName) { @@ -174,6 +608,100 @@ int CountTable::get(string seqName) { exit(1); } } +/************************************************************/ +//add seqeunce without group info +int CountTable::push_back(string seqName) { + try { + map::iterator it = indexNameMap.find(seqName); + if (it == indexNameMap.end()) { + if (hasGroups) { m->mothurOut("[ERROR]: Your count table has groups and I have no group information for " + seqName + "."); m->mothurOutEndLine(); m->control_pressed = true; } + indexNameMap[seqName] = uniques; + totals.push_back(1); + total++; + uniques++; + }else { + m->mothurOut("[ERROR]: Your count table contains more than 1 sequence named " + seqName + ", sequence names must be unique. 
Please correct."); m->mothurOutEndLine(); m->control_pressed = true; + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "push_back"); + exit(1); + } +} +/************************************************************/ +//remove sequence +int CountTable::remove(string seqName) { + try { + map::iterator it = indexNameMap.find(seqName); + if (it != indexNameMap.end()) { + uniques--; + if (hasGroups){ //remove this sequences counts from group totals + for (int i = 0; i < totalGroups.size(); i++) { totalGroups[i] -= counts[it->second][i]; counts[it->second][i] = 0; } + } + int thisTotal = totals[it->second]; totals[it->second] = 0; + total -= thisTotal; + indexNameMap.erase(it); + }else { + m->mothurOut("[ERROR]: Your count table contains does not include " + seqName + ", cannot remove."); m->mothurOutEndLine(); m->control_pressed = true; + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "push_back"); + exit(1); + } +} +/************************************************************/ +//add seqeunce without group info +int CountTable::push_back(string seqName, int thisTotal) { + try { + map::iterator it = indexNameMap.find(seqName); + if (it == indexNameMap.end()) { + if (hasGroups) { m->mothurOut("[ERROR]: Your count table has groups and I have no group information for " + seqName + "."); m->mothurOutEndLine(); m->control_pressed = true; } + indexNameMap[seqName] = uniques; + totals.push_back(thisTotal); + total+=thisTotal; + uniques++; + }else { + m->mothurOut("[ERROR]: Your count table contains more than 1 sequence named " + seqName + ", sequence names must be unique. 
Please correct."); m->mothurOutEndLine(); m->control_pressed = true; + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "push_back"); + exit(1); + } +} +/************************************************************/ +//add sequence with group info +int CountTable::push_back(string seqName, vector groupCounts) { + try { + map::iterator it = indexNameMap.find(seqName); + if (it == indexNameMap.end()) { + if ((hasGroups) && (groupCounts.size() != getNumGroups())) { m->mothurOut("[ERROR]: Your count table has a " + toString(getNumGroups()) + " groups and " + seqName + " has " + toString(groupCounts.size()) + ", please correct."); m->mothurOutEndLine(); m->control_pressed = true; } + int thisTotal = 0; + for (int i = 0; i < getNumGroups(); i++) { totalGroups[i] += groupCounts[i]; thisTotal += groupCounts[i]; } + if (hasGroups) { counts.push_back(groupCounts); } + indexNameMap[seqName] = uniques; + totals.push_back(thisTotal); + total+= thisTotal; + uniques++; + }else { + m->mothurOut("[ERROR]: Your count table contains more than 1 sequence named " + seqName + ", sequence names must be unique. 
Please correct."); m->mothurOutEndLine(); m->control_pressed = true; + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "push_back"); + exit(1); + } +} + /************************************************************/ //create ListVector from uniques ListVector CountTable::getListVector() { @@ -208,7 +736,46 @@ vector CountTable::getNamesOfSeqs() { } } /************************************************************/ -//returns names of seqs +//returns the names of all unique sequences in file mapped to their seqCounts +map CountTable::getNameMap() { + try { + map names; + for (map::iterator it = indexNameMap.begin(); it != indexNameMap.end(); it++) { + names[it->first] = totals[it->second]; + } + + return names; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "getNameMap"); + exit(1); + } +} +/************************************************************/ +//returns the names of all unique sequences in file +vector CountTable::getNamesOfSeqs(string group) { + try { + vector names; + if (hasGroups) { + map::iterator it = indexGroupMap.find(group); + if (it == indexGroupMap.end()) { + m->mothurOut("[ERROR]: " + group + " is not in your count table. Please correct.\n"); m->control_pressed = true; + }else { + for (map::iterator it2 = indexNameMap.begin(); it2 != indexNameMap.end(); it2++) { + if (counts[it2->second][it->second] != 0) { names.push_back(it2->first); } + } + } + }else{ m->mothurOut("[ERROR]: Your count table does not have group info. 
Please correct.\n"); m->control_pressed = true; } + + return names; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "getNamesOfSeqs"); + exit(1); + } +} +/************************************************************/ +//merges counts of seq1 and seq2, saving in seq1 int CountTable::mergeCounts(string seq1, string seq2) { try { map::iterator it = indexNameMap.find(seq1); @@ -220,17 +787,12 @@ int CountTable::mergeCounts(string seq1, string seq2) { m->mothurOut("[ERROR]: " + seq2 + " is not in your count table. Please correct.\n"); m->control_pressed = true; }else { //merge data - for (int i = 0; i < groups.size(); i++) { - counts[it->second][i] += counts[it2->second][i]; - counts[it2->second][i] = 0; - } + for (int i = 0; i < groups.size(); i++) { counts[it->second][i] += counts[it2->second][i]; } totals[it->second] += totals[it2->second]; - totals[it2->second] = 0; uniques--; indexNameMap.erase(it2); } } - return 0; } catch(exception& e) { @@ -238,6 +800,25 @@ int CountTable::mergeCounts(string seq1, string seq2) { exit(1); } } +/************************************************************/ +int CountTable::copy(CountTable* ct) { + try { + vector thisGroups = ct->getNamesOfGroups(); + for (int i = 0; i < thisGroups.size(); i++) { addGroup(thisGroups[i]); } + vector names = ct->getNamesOfSeqs(); + + for (int i = 0; i < names.size(); i++) { + vector thisCounts = ct->getGroupCounts(names[i]); + push_back(names[i], thisCounts); + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "copy"); + exit(1); + } +} /************************************************************/ diff --git a/counttable.h b/counttable.h index 8baff30..34c941b 100644 --- a/counttable.h +++ b/counttable.h @@ -38,32 +38,55 @@ #include "mothurout.h" #include "listvector.hpp" +#include "groupmap.h" class CountTable { public: - CountTable() { m = MothurOut::getInstance(); hasGroups = false; total = 0; } + CountTable() { m = MothurOut::getInstance(); hasGroups = false; 
total = 0; uniques = 0; } ~CountTable() {} - int readTable(string); + //reads and creates smart enough to eliminate groups with zero counts + int createTable(set&, map&, set&); //seqNames, seqName->group, groupNames + int createTable(string, string, bool); //namefile, groupfile, createGroup + int readTable(string); + + int printTable(string); + int printHeaders(ofstream&); + int printSeq(ofstream&, string); + bool testGroups(string file); //used to check if file has group data without reading it. + int copy(CountTable*); bool hasGroupInfo() { return hasGroups; } int getNumGroups() { return groups.size(); } vector getNamesOfGroups() { return groups; } //returns group names, if no group info vector is blank. + int addGroup(string); + int removeGroup(string); + + int renameSeq(string, string); //used to change name of sequence for use with trees + int setAbund(string, string, int); //set abundance number of seqs for that group for that seq + int push_back(string); //add a sequence + int push_back(string, int); //add a sequence + int push_back(string, vector); //add a sequence with group info + int remove(string); //remove seq + int get(string); //returns unique sequence index for reading distance matrices like NameAssignment + int size() { return indexNameMap.size(); } + vector getGroups(string); //returns vector of groups represented by this sequences vector getGroupCounts(string); //returns group counts for a seq passed in, if no group info is in file vector is blank. Order is the same as the groups returned by getGroups function. 
int getGroupCount(string, string); //returns number of seqs for that group for that seq int getGroupCount(string); // returns total seqs for that group - int getNumSeqs(string); //returns total seqs for that seq + int getNumSeqs(string); //returns total seqs for that seq, 0 if not found int getNumSeqs() { return total; } //return total number of seqs int getNumUniqueSeqs() { return uniques; } //return number of unique/representative seqs int getGroupIndex(string); //returns index in getGroupCounts vector of specific group + vector getNamesOfSeqs(); + vector getNamesOfSeqs(string); int mergeCounts(string, string); //combines counts for 2 seqs, saving under the first name passed in. - int get(string); //returns unique sequence index for reading distance matrices like NameAssignment ListVector getListVector(); - int size() { return indexNameMap.size(); } + map getNameMap(); private: string filename; diff --git a/decisiontree.cpp b/decisiontree.cpp new file mode 100644 index 0000000..99853f3 --- /dev/null +++ b/decisiontree.cpp @@ -0,0 +1,399 @@ +// +// decisiontree.cpp +// Mothur +// +// Created by Sarah Westcott on 10/1/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. 
+// + +#include "decisiontree.hpp" + +DecisionTree::DecisionTree(vector< vector > baseDataSet, + vector globalDiscardedFeatureIndices, + OptimumFeatureSubsetSelector optimumFeatureSubsetSelector, + string treeSplitCriterion) : AbstractDecisionTree(baseDataSet, + globalDiscardedFeatureIndices, + optimumFeatureSubsetSelector, + treeSplitCriterion), variableImportanceList(numFeatures, 0){ + try { + m = MothurOut::getInstance(); + createBootStrappedSamples(); + buildDecisionTree(); + } + catch(exception& e) { + m->errorOut(e, "DecisionTree", "DecisionTree"); + exit(1); + } +} + +/***********************************************************************/ + +int DecisionTree::calcTreeVariableImportanceAndError() { + try { + + int numCorrect; + double treeErrorRate; + calcTreeErrorRate(numCorrect, treeErrorRate); + + if (m->control_pressed) {return 0; } + + for (int i = 0; i < numFeatures; i++) { + if (m->control_pressed) {return 0; } + // NOTE: only shuffle the features, never shuffle the output vector + // so i = 0 and i will be alwaays <= (numFeatures - 1) as the index at numFeatures will denote + // the feature vector + vector< vector > randomlySampledTestData = randomlyShuffleAttribute(bootstrappedTestSamples, i); + + int numCorrectAfterShuffle = 0; + for (int j = 0; j < randomlySampledTestData.size(); j++) { + if (m->control_pressed) {return 0; } + vector shuffledSample = randomlySampledTestData[j]; + int actualSampleOutputClass = shuffledSample[numFeatures]; + int predictedSampleOutputClass = evaluateSample(shuffledSample); + if (actualSampleOutputClass == predictedSampleOutputClass) { numCorrectAfterShuffle++; } + } + variableImportanceList[i] += (numCorrect - numCorrectAfterShuffle); + } + + // TODO: do we need to save the variableRanks in the DecisionTree, do we need it later? 
+ vector< vector > variableRanks; + for (int i = 0; i < variableImportanceList.size(); i++) { + if (m->control_pressed) {return 0; } + if (variableImportanceList[i] > 0) { + // TODO: is there a way to optimize the follow line's code? + vector variableRank(2, 0); + variableRank[0] = i; variableRank[1] = variableImportanceList[i]; + variableRanks.push_back(variableRank); + } + } + VariableRankDescendingSorter variableRankDescendingSorter; + sort(variableRanks.begin(), variableRanks.end(), variableRankDescendingSorter); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "DecisionTree", "calcTreeVariableImportanceAndError"); + exit(1); + } + +} +/***********************************************************************/ + +// TODO: there must be a way to optimize this function +int DecisionTree::evaluateSample(vector testSample) { + try { + RFTreeNode *node = rootNode; + while (true) { + if (m->control_pressed) {return 0; } + if (node->checkIsLeaf()) { return node->getOutputClass(); } + int sampleSplitFeatureValue = testSample[node->getSplitFeatureIndex()]; + if (sampleSplitFeatureValue < node->getSplitFeatureValue()) { node = node->getLeftChildNode(); } + else { node = node->getRightChildNode(); } + } + return 0; + } + catch(exception& e) { + m->errorOut(e, "DecisionTree", "evaluateSample"); + exit(1); + } + +} +/***********************************************************************/ + +int DecisionTree::calcTreeErrorRate(int& numCorrect, double& treeErrorRate){ + try { + numCorrect = 0; + for (int i = 0; i < bootstrappedTestSamples.size(); i++) { + if (m->control_pressed) {return 0; } + + vector testSample = bootstrappedTestSamples[i]; + int testSampleIndex = bootstrappedTestSampleIndices[i]; + + int actualSampleOutputClass = testSample[numFeatures]; + int predictedSampleOutputClass = evaluateSample(testSample); + + if (actualSampleOutputClass == predictedSampleOutputClass) { numCorrect++; } + + outOfBagEstimates[testSampleIndex] = predictedSampleOutputClass; 
+ } + + treeErrorRate = 1 - ((double)numCorrect / (double)bootstrappedTestSamples.size()); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "DecisionTree", "calcTreeErrorRate"); + exit(1); + } +} + +/***********************************************************************/ + +// TODO: optimize the algo, instead of transposing two time, we can extarct the feature, +// shuffle it and then re-insert in the original place, thus iproving runnting time +//This function randomize abundances for a given OTU/feature. +vector< vector > DecisionTree::randomlyShuffleAttribute(vector< vector > samples, int featureIndex) { + try { + // NOTE: we need (numFeatures + 1) featureVecotors, the last extra vector is actually outputVector + vector< vector > shuffledSample = samples; + vector featureVectors(samples.size(), 0); + + for (int j = 0; j < samples.size(); j++) { + if (m->control_pressed) { return shuffledSample; } + featureVectors[j] = samples[j][featureIndex]; + } + + random_shuffle(featureVectors.begin(), featureVectors.end()); + + for (int j = 0; j < samples.size(); j++) { + if (m->control_pressed) {return shuffledSample; } + shuffledSample[j][featureIndex] = featureVectors[j]; + } + + return shuffledSample; + } + catch(exception& e) { + m->errorOut(e, "DecisionTree", "randomlyShuffleAttribute"); + exit(1); + } +} +/***********************************************************************/ + +int DecisionTree::purgeTreeNodesDataRecursively(RFTreeNode* treeNode) { + try { + treeNode->bootstrappedTrainingSamples.clear(); + treeNode->bootstrappedFeatureVectors.clear(); + treeNode->bootstrappedOutputVector.clear(); + treeNode->localDiscardedFeatureIndices.clear(); + treeNode->globalDiscardedFeatureIndices.clear(); + + if (treeNode->leftChildNode != NULL) { purgeTreeNodesDataRecursively(treeNode->leftChildNode); } + if (treeNode->rightChildNode != NULL) { purgeTreeNodesDataRecursively(treeNode->rightChildNode); } + return 0; + } + catch(exception& e) { + m->errorOut(e, 
"DecisionTree", "purgeTreeNodesDataRecursively"); + exit(1); + } +} +/***********************************************************************/ + +void DecisionTree::buildDecisionTree(){ + try { + + int generation = 0; + rootNode = new RFTreeNode(bootstrappedTrainingSamples, globalDiscardedFeatureIndices, numFeatures, numSamples, numOutputClasses, generation); + + splitRecursively(rootNode); + } + catch(exception& e) { + m->errorOut(e, "DecisionTree", "buildDecisionTree"); + exit(1); + } +} + +/***********************************************************************/ + +int DecisionTree::splitRecursively(RFTreeNode* rootNode) { + try { + + if (rootNode->getNumSamples() < 2){ + rootNode->setIsLeaf(true); + rootNode->setOutputClass(rootNode->getBootstrappedTrainingSamples()[0][rootNode->getNumFeatures()]); + return 0; + } + + int classifiedOutputClass; + bool isAlreadyClassified = checkIfAlreadyClassified(rootNode, classifiedOutputClass); + if (isAlreadyClassified == true){ + rootNode->setIsLeaf(true); + rootNode->setOutputClass(classifiedOutputClass); + return 0; + } + if (m->control_pressed) {return 0;} + vector featureSubsetIndices = selectFeatureSubsetRandomly(globalDiscardedFeatureIndices, rootNode->getLocalDiscardedFeatureIndices()); + rootNode->setFeatureSubsetIndices(featureSubsetIndices); + if (m->control_pressed) {return 0;} + + findAndUpdateBestFeatureToSplitOn(rootNode); + + if (m->control_pressed) {return 0;} + + vector< vector > leftChildSamples; + vector< vector > rightChildSamples; + getSplitPopulation(rootNode, leftChildSamples, rightChildSamples); + + if (m->control_pressed) {return 0;} + + // TODO: need to write code to clear this memory + RFTreeNode* leftChildNode = new RFTreeNode(leftChildSamples, globalDiscardedFeatureIndices, numFeatures, (int)leftChildSamples.size(), numOutputClasses, rootNode->getGeneration() + 1); + RFTreeNode* rightChildNode = new RFTreeNode(rightChildSamples, globalDiscardedFeatureIndices, numFeatures, 
(int)rightChildSamples.size(), numOutputClasses, rootNode->getGeneration() + 1); + + rootNode->setLeftChildNode(leftChildNode); + leftChildNode->setParentNode(rootNode); + + rootNode->setRightChildNode(rightChildNode); + rightChildNode->setParentNode(rootNode); + + // TODO: This recursive split can be parrallelized later + splitRecursively(leftChildNode); + if (m->control_pressed) {return 0;} + + splitRecursively(rightChildNode); + return 0; + + } + catch(exception& e) { + m->errorOut(e, "DecisionTree", "splitRecursively"); + exit(1); + } +} +/***********************************************************************/ + +int DecisionTree::findAndUpdateBestFeatureToSplitOn(RFTreeNode* node){ + try { + + vector< vector > bootstrappedFeatureVectors = node->getBootstrappedFeatureVectors(); + if (m->control_pressed) {return 0;} + vector bootstrappedOutputVector = node->getBootstrappedOutputVector(); + if (m->control_pressed) {return 0;} + vector featureSubsetIndices = node->getFeatureSubsetIndices(); + if (m->control_pressed) {return 0;} + + vector featureSubsetEntropies; + vector featureSubsetSplitValues; + vector featureSubsetIntrinsicValues; + vector featureSubsetGainRatios; + + for (int i = 0; i < featureSubsetIndices.size(); i++) { + if (m->control_pressed) {return 0;} + + int tryIndex = featureSubsetIndices[i]; + + double featureMinEntropy; + int featureSplitValue; + double featureIntrinsicValue; + + getMinEntropyOfFeature(bootstrappedFeatureVectors[tryIndex], bootstrappedOutputVector, featureMinEntropy, featureSplitValue, featureIntrinsicValue); + if (m->control_pressed) {return 0;} + + featureSubsetEntropies.push_back(featureMinEntropy); + featureSubsetSplitValues.push_back(featureSplitValue); + featureSubsetIntrinsicValues.push_back(featureIntrinsicValue); + + double featureInformationGain = node->getOwnEntropy() - featureMinEntropy; + double featureGainRatio = (double)featureInformationGain / (double)featureIntrinsicValue; + 
featureSubsetGainRatios.push_back(featureGainRatio); + + } + + vector::iterator minEntropyIterator = min_element(featureSubsetEntropies.begin(), featureSubsetEntropies.end()); + vector::iterator maxGainRatioIterator = max_element(featureSubsetGainRatios.begin(), featureSubsetGainRatios.end()); + double featureMinEntropy = *minEntropyIterator; + //double featureMaxGainRatio = *maxGainRatioIterator; + + double bestFeatureSplitEntropy = featureMinEntropy; + int bestFeatureToSplitOnIndex = -1; + if (treeSplitCriterion == "gainRatio"){ + bestFeatureToSplitOnIndex = (int)(maxGainRatioIterator - featureSubsetGainRatios.begin()); + // if using 'gainRatio' measure, then featureMinEntropy must be re-updated, as the index + // for 'featureMaxGainRatio' would be different + bestFeatureSplitEntropy = featureSubsetEntropies[bestFeatureToSplitOnIndex]; + } + else { bestFeatureToSplitOnIndex = (int)(minEntropyIterator - featureSubsetEntropies.begin()); } + + int bestFeatureSplitValue = featureSubsetSplitValues[bestFeatureToSplitOnIndex]; + + node->setSplitFeatureIndex(featureSubsetIndices[bestFeatureToSplitOnIndex]); + node->setSplitFeatureValue(bestFeatureSplitValue); + node->setSplitFeatureEntropy(bestFeatureSplitEntropy); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "DecisionTree", "findAndUpdateBestFeatureToSplitOn"); + exit(1); + } +} +/***********************************************************************/ +vector DecisionTree::selectFeatureSubsetRandomly(vector globalDiscardedFeatureIndices, vector localDiscardedFeatureIndices){ + try { + + vector featureSubsetIndices; + + vector combinedDiscardedFeatureIndices; + combinedDiscardedFeatureIndices.insert(combinedDiscardedFeatureIndices.end(), globalDiscardedFeatureIndices.begin(), globalDiscardedFeatureIndices.end()); + combinedDiscardedFeatureIndices.insert(combinedDiscardedFeatureIndices.end(), localDiscardedFeatureIndices.begin(), localDiscardedFeatureIndices.end()); + + 
sort(combinedDiscardedFeatureIndices.begin(), combinedDiscardedFeatureIndices.end()); + + int numberOfRemainingSuitableFeatures = (int)(numFeatures - combinedDiscardedFeatureIndices.size()); + int currentFeatureSubsetSize = numberOfRemainingSuitableFeatures < optimumFeatureSubsetSize ? numberOfRemainingSuitableFeatures : optimumFeatureSubsetSize; + + while (featureSubsetIndices.size() < currentFeatureSubsetSize) { + + if (m->control_pressed) { return featureSubsetIndices; } + + // TODO: optimize rand() call here + int randomIndex = rand() % numFeatures; + vector::iterator it = find(featureSubsetIndices.begin(), featureSubsetIndices.end(), randomIndex); + if (it == featureSubsetIndices.end()){ // NOT FOUND + vector::iterator it2 = find(combinedDiscardedFeatureIndices.begin(), combinedDiscardedFeatureIndices.end(), randomIndex); + if (it2 == combinedDiscardedFeatureIndices.end()){ // NOT FOUND AGAIN + featureSubsetIndices.push_back(randomIndex); + } + } + } + sort(featureSubsetIndices.begin(), featureSubsetIndices.end()); + + //#ifdef DEBUG_LEVEL_3 + // PRINT_VAR(featureSubsetIndices); + //#endif + + return featureSubsetIndices; + } + catch(exception& e) { + m->errorOut(e, "DecisionTree", "selectFeatureSubsetRandomly"); + exit(1); + } +} +/***********************************************************************/ + +// TODO: printTree() needs a check if correct +int DecisionTree::printTree(RFTreeNode* treeNode, string caption){ + try { + string tabs = ""; + for (int i = 0; i < treeNode->getGeneration(); i++) { tabs += " "; } + // for (int i = 0; i < treeNode->getGeneration() - 1; i++) { tabs += "| "; } + // if (treeNode->getGeneration() != 0) { tabs += "|--"; } + + if (treeNode != NULL && treeNode->checkIsLeaf() == false){ + m->mothurOut(tabs + caption + " [ gen: " + toString(treeNode->getGeneration()) + " ] ( " + toString(treeNode->getSplitFeatureValue()) + " < X" + toString(treeNode->getSplitFeatureIndex()) +" )\n"); + + printTree(treeNode->getLeftChildNode(), 
"leftChild"); + printTree(treeNode->getRightChildNode(), "rightChild"); + }else { + m->mothurOut(tabs + caption + " [ gen: " + toString(treeNode->getGeneration()) + " ] ( classified to: " + toString(treeNode->getOutputClass()) + ", samples: " + toString(treeNode->getNumSamples()) + " )\n"); + } + return 0; + } + catch(exception& e) { + m->errorOut(e, "DecisionTree", "printTree"); + exit(1); + } +} +/***********************************************************************/ +void DecisionTree::deleteTreeNodesRecursively(RFTreeNode* treeNode) { + try { + if (treeNode == NULL) { return; } + deleteTreeNodesRecursively(treeNode->leftChildNode); + deleteTreeNodesRecursively(treeNode->rightChildNode); + delete treeNode; + } + catch(exception& e) { + m->errorOut(e, "DecisionTree", "deleteTreeNodesRecursively"); + exit(1); + } +} +/***********************************************************************/ + diff --git a/decisiontree.hpp b/decisiontree.hpp new file mode 100755 index 0000000..d4441ed --- /dev/null +++ b/decisiontree.hpp @@ -0,0 +1,59 @@ + // + // decisiontree.hpp + // rrf-fs-prototype + // + // Created by Abu Zaher Faridee on 5/28/12. + // Copyright (c) 2012 Schloss Lab. All rights reserved. 
+ // + +#ifndef rrf_fs_prototype_decisiontree_hpp +#define rrf_fs_prototype_decisiontree_hpp + +#include "macros.h" +#include "rftreenode.hpp" +#include "abstractdecisiontree.hpp" + +/***********************************************************************/ + +struct VariableRankDescendingSorter { + bool operator() (vector first, vector second){ return first[1] > second[1]; } +}; +struct VariableRankDescendingSorterDouble { + bool operator() (vector first, vector second){ return first[1] > second[1]; } +}; +/***********************************************************************/ + +class DecisionTree: public AbstractDecisionTree{ + + friend class RandomForest; + +public: + + DecisionTree(vector< vector > baseDataSet, + vector globalDiscardedFeatureIndices, + OptimumFeatureSubsetSelector optimumFeatureSubsetSelector, + string treeSplitCriterion); + virtual ~DecisionTree(){ deleteTreeNodesRecursively(rootNode); } + + int calcTreeVariableImportanceAndError(); + int evaluateSample(vector testSample); + int calcTreeErrorRate(int& numCorrect, double& treeErrorRate); + vector< vector > randomlyShuffleAttribute(vector< vector > samples, int featureIndex); + void purgeDataSetsFromTree() { purgeTreeNodesDataRecursively(rootNode); } + int purgeTreeNodesDataRecursively(RFTreeNode* treeNode); + + +private: + + void buildDecisionTree(); + int splitRecursively(RFTreeNode* rootNode); + int findAndUpdateBestFeatureToSplitOn(RFTreeNode* node); + vector selectFeatureSubsetRandomly(vector globalDiscardedFeatureIndices, vector localDiscardedFeatureIndices); + int printTree(RFTreeNode* treeNode, string caption); + void deleteTreeNodesRecursively(RFTreeNode* treeNode); + + vector variableImportanceList; + map outOfBagEstimates; +}; + +#endif diff --git a/deconvolutecommand.cpp b/deconvolutecommand.cpp index bab5a63..90a40ce 100644 --- a/deconvolutecommand.cpp +++ b/deconvolutecommand.cpp @@ -14,7 +14,8 @@ vector DeconvoluteCommand::setParameters(){ try { CommandParameter pfasta("fasta", 
"InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -31,7 +32,7 @@ vector DeconvoluteCommand::setParameters(){ string DeconvoluteCommand::getHelpString(){ try { string helpString = ""; - helpString += "The unique.seqs command reads a fastafile and creates a namesfile.\n"; + helpString += "The unique.seqs command reads a fastafile and creates a name or count file.\n"; helpString += "It creates a file where the first column is the groupname and the second column is a list of sequence names who have the same sequence. \n"; helpString += "If the sequence is unique the second column will just contain its name. \n"; helpString += "The unique.seqs command parameters are fasta and name. 
fasta is required, unless there is a valid current fasta file.\n"; @@ -56,6 +57,7 @@ string DeconvoluteCommand::getOutputFileNameTag(string type, string inputName="" else { if (type == "fasta") { outputFileName = "unique" + m->getExtension(inputName); } else if (type == "name") { outputFileName = "names"; } + else if (type == "count") { outputFileName = "count_table"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } return outputFileName; @@ -73,6 +75,7 @@ DeconvoluteCommand::DeconvoluteCommand(){ vector tempOutNames; outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "DeconvoluteCommand", "DeconvoluteCommand"); @@ -106,6 +109,7 @@ DeconvoluteCommand::DeconvoluteCommand(string option) { vector tempOutNames; outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -127,6 +131,14 @@ DeconvoluteCommand::DeconvoluteCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -149,11 +161,21 @@ DeconvoluteCommand::DeconvoluteCommand(string option) { if (oldNameMapFName == "not open") { oldNameMapFName = ""; abort = true; } else if (oldNameMapFName == "not found"){ oldNameMapFName = ""; } else { m->setNameFile(oldNameMapFName); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; countfile = ""; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } - if (oldNameMapFName == "") { - vector files; files.push_back(inFastaName); - parser.getNameFile(files); - } + if ((countfile != "") && (oldNameMapFName != "")) { m->mothurOut("When executing a unique.seqs command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } + + + if (countfile == "") { + if (oldNameMapFName == "") { + vector files; files.push_back(inFastaName); + parser.getNameFile(files); + } + } } @@ -171,6 +193,7 @@ int DeconvoluteCommand::execute() { //prepare filenames and open files string outNameFile = outputDir + m->getRootName(m->getSimpleName(inFastaName)) + getOutputFileNameTag("name"); + string outCountFile = outputDir + m->getRootName(m->getSimpleName(inFastaName)) + getOutputFileNameTag("count"); string outFastaFile = outputDir + m->getRootName(m->getSimpleName(inFastaName)) + getOutputFileNameTag("fasta", inFastaName); map nameMap; @@ -179,6 +202,11 @@ int DeconvoluteCommand::execute() { m->readNames(oldNameMapFName, nameMap); if (oldNameMapFName == outNameFile){ outNameFile = outputDir + m->getRootName(m->getSimpleName(inFastaName)) + "unique." + getOutputFileNameTag("name"); } } + CountTable ct; + if (countfile != "") { + ct.readTable(countfile); + if (countfile == outCountFile){ outCountFile = outputDir + m->getRootName(m->getSimpleName(inFastaName)) + "unique." 
+ getOutputFileNameTag("count"); } + } if (m->control_pressed) { return 0; } @@ -222,7 +250,10 @@ int DeconvoluteCommand::execute() { sequenceStrings[seq.getAligned()] = itNames->second; nameFileOrder.push_back(seq.getAligned()); } - }else { sequenceStrings[seq.getAligned()] = seq.getName(); nameFileOrder.push_back(seq.getAligned()); } + }else if (countfile != "") { + ct.getNumSeqs(seq.getName()); //checks to make sure seq is in table + sequenceStrings[seq.getAligned()] = seq.getName(); nameFileOrder.push_back(seq.getAligned()); + }else { sequenceStrings[seq.getAligned()] = seq.getName(); nameFileOrder.push_back(seq.getAligned()); } }else { //this is a dup if (oldNameMapFName != "") { itNames = nameMap.find(seq.getName()); @@ -232,7 +263,12 @@ int DeconvoluteCommand::execute() { }else { sequenceStrings[seq.getAligned()] += "," + itNames->second; } - }else { sequenceStrings[seq.getAligned()] += "," + seq.getName(); } + }else if (countfile != "") { + int num = ct.getNumSeqs(seq.getName()); //checks to make sure seq is in table + if (num != 0) { //its in the table + ct.mergeCounts(itStrings->second, seq.getName()); //merges counts and saves in uniques name + } + }else { sequenceStrings[seq.getAligned()] += "," + seq.getName(); } } count++; @@ -252,34 +288,35 @@ int DeconvoluteCommand::execute() { //print new names file ofstream outNames; - m->openOutputFile(outNameFile, outNames); + if (countfile == "") { m->openOutputFile(outNameFile, outNames); outputNames.push_back(outNameFile); outputTypes["name"].push_back(outNameFile); } + else { m->openOutputFile(outCountFile, outNames); ct.printHeaders(outNames); outputTypes["count"].push_back(outCountFile); outputNames.push_back(outCountFile); } for (int i = 0; i < nameFileOrder.size(); i++) { - //for (itStrings = sequenceStrings.begin(); itStrings != sequenceStrings.end(); itStrings++) { - if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outFastaFile); outNames.close(); m->mothurRemove(outNameFile); return 0; } 
+ if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outFastaFile); outNames.close(); for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } itStrings = sequenceStrings.find(nameFileOrder[i]); if (itStrings != sequenceStrings.end()) { - //get rep name - int pos = (itStrings->second).find_first_of(','); - - if (pos == string::npos) { // only reps itself - outNames << itStrings->second << '\t' << itStrings->second << endl; - }else { - outNames << (itStrings->second).substr(0, pos) << '\t' << itStrings->second << endl; - } + if (countfile == "") { + //get rep name + int pos = (itStrings->second).find_first_of(','); + + if (pos == string::npos) { // only reps itself + outNames << itStrings->second << '\t' << itStrings->second << endl; + }else { + outNames << (itStrings->second).substr(0, pos) << '\t' << itStrings->second << endl; + } + }else { ct.printSeq(outNames, itStrings->second); } }else{ m->mothurOut("[ERROR]: mismatch in namefile print."); m->mothurOutEndLine(); m->control_pressed = true; } } outNames.close(); - if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outFastaFile); m->mothurRemove(outNameFile); return 0; } + if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outFastaFile); for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); - m->mothurOut(outFastaFile); m->mothurOutEndLine(); - m->mothurOut(outNameFile); m->mothurOutEndLine(); - outputNames.push_back(outFastaFile); outputNames.push_back(outNameFile); outputTypes["fasta"].push_back(outFastaFile); outputTypes["name"].push_back(outNameFile); + outputNames.push_back(outFastaFile); outputTypes["fasta"].push_back(outFastaFile); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } m->mothurOutEndLine(); //set fasta file as new current fastafile @@ -293,6 
+330,11 @@ int DeconvoluteCommand::execute() { if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); } } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } return 0; } diff --git a/deconvolutecommand.h b/deconvolutecommand.h index 7d4cb50..673ffc9 100644 --- a/deconvolutecommand.h +++ b/deconvolutecommand.h @@ -11,6 +11,7 @@ #include "command.hpp" #include "fastamap.h" +#include "counttable.h" /* The unique.seqs command reads a fasta file, finds the duplicate sequences and outputs a names file containing 2 columns. The first being the groupname and the second the list of identical sequence names. */ @@ -37,7 +38,7 @@ public: private: - string inFastaName, oldNameMapFName, outputDir; + string inFastaName, oldNameMapFName, outputDir, countfile; vector outputNames; bool abort; diff --git a/deuniquetreecommand.cpp b/deuniquetreecommand.cpp index 662282b..d334f8f 100644 --- a/deuniquetreecommand.cpp +++ b/deuniquetreecommand.cpp @@ -161,7 +161,8 @@ int DeuniqueTreeCommand::execute() { TreeReader* reader = new TreeReader(treefile, "", namefile); vector T = reader->getTrees(); - map nameMap = reader->getNameMap(); + map nameMap; + m->readNames(namefile, nameMap); delete reader; //print new Tree @@ -172,7 +173,7 @@ int DeuniqueTreeCommand::execute() { T[0]->print(out, nameMap); out.close(); - delete (T[0]->getTreeMap()); + delete (T[0]->getCountTable()); for (int i = 0; i < T.size(); i++) { delete T[i]; } //set phylip file as new current phylipfile diff --git a/flowdata.cpp b/flowdata.cpp index 1420f84..1fe7d7f 100644 --- a/flowdata.cpp +++ b/flowdata.cpp @@ -43,13 +43,15 @@ bool FlowData::getNext(ifstream& flowFile){ try { flowFile >> seqName >> endFlow; - //cout << "in Flowdata " + seqName << endl; - for(int i=0;i> flowData[i]; } - //cout << "in Flowdata read " 
<< seqName + " done" << endl; - updateEndFlow(); - translateFlow(); - - m->gobble(flowFile); + if (seqName.length() != 0) { + //cout << "in Flowdata " + seqName << endl; + for(int i=0;i> flowData[i]; } + //cout << "in Flowdata read " << seqName + " done" << endl; + updateEndFlow(); + translateFlow(); + m->gobble(flowFile); + }else{ m->mothurOut("Error in reading your flowfile, at position " + toString(flowFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); } + if(flowFile){ return 1; } else { return 0; } } diff --git a/getgroupscommand.cpp b/getgroupscommand.cpp index fe6f571..69f4403 100644 --- a/getgroupscommand.cpp +++ b/getgroupscommand.cpp @@ -18,8 +18,9 @@ vector GetGroupsCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "FNGLT",false,false); parameters.push_back(pfasta); CommandParameter pshared("shared", "InputTypes", "", "", "none", "sharedGroup", "none",false,false); parameters.push_back(pshared); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "sharedGroup", "FNGLT",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "sharedGroup", "FNGLT",false,false); parameters.push_back(pgroup); CommandParameter pdesign("design", "InputTypes", "", "", "none", "sharedGroup", "FNGLT",false,false); parameters.push_back(pdesign); CommandParameter plist("list", "InputTypes", "", "", "none", "none", "FNGLT",false,false); parameters.push_back(plist); CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FNGLT",false,false); 
parameters.push_back(ptaxonomy); @@ -43,7 +44,7 @@ string GetGroupsCommand::getHelpString(){ string helpString = ""; helpString += "The get.groups command selects sequences from a specfic group or set of groups from the following file types: fasta, name, group, list, taxonomy, design or shared file.\n"; helpString += "It outputs a file containing the sequences in the those specified groups, or a sharedfile containing only those groups.\n"; - helpString += "The get.groups command parameters are accnos, fasta, name, group, list, taxonomy, shared, design and groups. The group parameter is required, unless you have a current group file, or are using a shared file.\n"; + helpString += "The get.groups command parameters are accnos, fasta, name, group, list, taxonomy, shared, design and groups. The group or count parameter is required, unless you have a current group or count file, or are using a shared file.\n"; helpString += "You must also provide an accnos containing the list of groups to get or set the groups parameter to the groups you wish to select.\n"; helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like. 
You can separate group names with dashes.\n"; helpString += "The get.groups command should be in the following format: get.groups(accnos=yourAccnos, fasta=yourFasta, group=yourGroupFile).\n"; @@ -71,6 +72,7 @@ string GetGroupsCommand::getOutputFileNameTag(string type, string inputName=""){ else if (type == "taxonomy") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "name") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "group") { outputFileName = "pick" + m->getExtension(inputName); } + else if (type == "count") { outputFileName = "pick.count_table"; } else if (type == "list") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "shared") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "design") { outputFileName = "pick" + m->getExtension(inputName); } @@ -97,6 +99,7 @@ GetGroupsCommand::GetGroupsCommand(){ outputTypes["list"] = tempOutNames; outputTypes["shared"] = tempOutNames; outputTypes["design"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "GetGroupsCommand", "GetGroupsCommand"); @@ -135,6 +138,7 @@ GetGroupsCommand::GetGroupsCommand(string option) { outputTypes["list"] = tempOutNames; outputTypes["shared"] = tempOutNames; outputTypes["design"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the output directory command factory will send this info to us in the output parameter @@ -208,6 +212,14 @@ GetGroupsCommand::GetGroupsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["design"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -227,11 +239,6 @@ GetGroupsCommand::GetGroupsCommand(string option) { else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } - groupfile = validParameter.validFile(parameters, "group", true); - if (groupfile == "not open") { groupfile = ""; abort = true; } - else if (groupfile == "not found") { groupfile = ""; } - else { m->setGroupFile(groupfile); } - listfile = validParameter.validFile(parameters, "list", true); if (listfile == "not open") { abort = true; } else if (listfile == "not found") { listfile = ""; } @@ -263,8 +270,22 @@ GetGroupsCommand::GetGroupsCommand(string option) { if (designfile == "not open") { designfile = ""; abort = true; } else if (designfile == "not found") { designfile = ""; } else { m->setDesignFile(designfile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + - if ((sharedfile == "") && (groupfile == "") && (designfile == "")) { + if ((sharedfile == "") && (groupfile == "") && (designfile == "") && (countfile == "")) { //is there are current file available for any of these? 
if ((namefile != "") || (fastafile != "") || (listfile != "") || (taxfile != "")) { //give priority to group, then shared @@ -274,7 +295,11 @@ GetGroupsCommand::GetGroupsCommand(string option) { sharedfile = m->getSharedFile(); if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); } else { - m->mothurOut("You have no current groupfile or sharedfile and one is required."); m->mothurOutEndLine(); abort = true; + countfile = m->getCountTableFile(); + if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current groupfile, countfile or sharedfile and one is required."); m->mothurOutEndLine(); abort = true; + } } } }else { @@ -288,7 +313,12 @@ GetGroupsCommand::GetGroupsCommand(string option) { designfile = m->getDesignFile(); if (designfile != "") { m->mothurOut("Using " + designfile + " as input file for the design parameter."); m->mothurOutEndLine(); } else { - m->mothurOut("You have no current groupfile or sharedfile or designfile and one is required."); m->mothurOutEndLine(); abort = true; + countfile = m->getCountTableFile(); + if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current groupfile, designfile, countfile or sharedfile and one is required."); m->mothurOutEndLine(); abort = true; + } + } } } @@ -297,13 +327,15 @@ GetGroupsCommand::GetGroupsCommand(string option) { if ((accnosfile == "") && (Groups.size() == 0)) { m->mothurOut("You must provide an accnos file or specify groups using the groups parameter."); m->mothurOutEndLine(); abort = true; } - if ((fastafile == "") && (namefile == "") && (groupfile == "") && (designfile == "") && (sharedfile == "") && (listfile == "") && (taxfile == "")) { m->mothurOut("You must provide at least one of the following: 
fasta, name, taxonomy, group, shared, design or list."); m->mothurOutEndLine(); abort = true; } - if ((groupfile == "") && ((namefile != "") || (fastafile != "") || (listfile != "") || (taxfile != ""))) { m->mothurOut("If using a fasta, name, taxonomy, group or list, then you must provide a group file."); m->mothurOutEndLine(); abort = true; } - - if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){ - vector files; files.push_back(fastafile); files.push_back(taxfile); - parser.getNameFile(files); - } + if ((fastafile == "") && (namefile == "") && (countfile == "") && (groupfile == "") && (designfile == "") && (sharedfile == "") && (listfile == "") && (taxfile == "")) { m->mothurOut("You must provide at least one of the following: fasta, name, taxonomy, group, shared, design, count or list."); m->mothurOutEndLine(); abort = true; } + if (((groupfile == "") && (countfile == "")) && ((namefile != "") || (fastafile != "") || (listfile != "") || (taxfile != ""))) { m->mothurOut("If using a fasta, name, taxonomy, group or list, then you must provide a group or count file."); m->mothurOutEndLine(); abort = true; } + + if (countfile == "") { + if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){ + vector files; files.push_back(fastafile); files.push_back(taxfile); + parser.getNameFile(files); + } + } } } @@ -331,6 +363,7 @@ int GetGroupsCommand::execute(){ SharedUtil* util = new SharedUtil(); vector gNamesOfGroups = groupMap->getNamesOfGroups(); util->setGroups(Groups, gNamesOfGroups); + m->setGroups(Groups); groupMap->setNamesOfGroups(gNamesOfGroups); delete util; @@ -338,7 +371,23 @@ int GetGroupsCommand::execute(){ fillNames(); delete groupMap; - } + }else if (countfile != ""){ + if ((fastafile != "") || (listfile != "") || (taxfile != "")) { + m->mothurOut("\n[NOTE]: The count file should contain only unique names, so mothur assumes your fasta, list and taxonomy files also contain only uniques.\n\n"); + } + CountTable ct; + 
ct.readTable(countfile); + if (!ct.hasGroupInfo()) { m->mothurOut("[ERROR]: your count file does not contain group info, aborting.\n"); return 0; } + + vector gNamesOfGroups = ct.getNamesOfGroups(); + SharedUtil util; + util.setGroups(Groups, gNamesOfGroups); + m->setGroups(Groups); + for (int i = 0; i < Groups.size(); i++) { + vector thisGroupsSeqs = ct.getNamesOfSeqs(Groups[i]); + for (int j = 0; j < thisGroupsSeqs.size(); j++) { names.insert(thisGroupsSeqs[j]); } + } + } if (m->control_pressed) { return 0; } @@ -346,6 +395,7 @@ int GetGroupsCommand::execute(){ if (namefile != "") { readName(); } if (fastafile != "") { readFasta(); } if (groupfile != "") { readGroup(); } + if (countfile != "") { readCount(); } if (listfile != "") { readList(); } if (taxfile != "") { readTax(); } if (sharedfile != "") { readShared(); } @@ -396,6 +446,11 @@ int GetGroupsCommand::execute(){ if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setDesignFile(current); } } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } } return 0; @@ -742,6 +797,82 @@ int GetGroupsCommand::readGroup(){ } } //********************************************************************************************************************** +int GetGroupsCommand::readCount(){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(countfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile); + + ofstream out; + m->openOutputFile(outputFileName, out); + + ifstream in; + m->openInputFile(countfile, in); + + bool wroteSomething = false; + int selectedCount = 0; + + string headers = m->getline(in); m->gobble(in); + vector columnHeaders = m->splitWhiteSpace(headers); + + vector groups; + map 
originalGroupIndexes; + map GroupIndexes; + set indexOfGroupsChosen; + for (int i = 2; i < columnHeaders.size(); i++) { groups.push_back(columnHeaders[i]); originalGroupIndexes[i-2] = columnHeaders[i]; } + //sort groups to keep consistent with how we store the groups in groupmap + sort(groups.begin(), groups.end()); + for (int i = 0; i < groups.size(); i++) { GroupIndexes[groups[i]] = i; } + sort(Groups.begin(), Groups.end()); + out << "Representative_Sequence\ttotal\t"; + for (int i = 0; i < Groups.size(); i++) { out << Groups[i] << '\t'; indexOfGroupsChosen.insert(GroupIndexes[Groups[i]]); } + out << endl; + + string name; int oldTotal; + while (!in.eof()) { + + if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; } + + in >> name; m->gobble(in); in >> oldTotal; m->gobble(in); + if (m->debug) { m->mothurOut("[DEBUG]: " + name + '\t' + toString(oldTotal) + "\n"); } + + if (names.count(name) != 0) { + //if group info, then read it + vector selectedCounts; int thisTotal = 0; int temp; + for (int i = 0; i < groups.size(); i++) { + int thisIndex = GroupIndexes[originalGroupIndexes[i]]; + in >> temp; m->gobble(in); + if (indexOfGroupsChosen.count(thisIndex) != 0) { //we want this group + selectedCounts.push_back(temp); thisTotal += temp; + } + } + + out << name << '\t' << thisTotal << '\t'; + for (int i = 0; i < selectedCounts.size(); i++) { out << selectedCounts[i] << '\t'; } + out << endl; + + wroteSomething = true; + selectedCount+= thisTotal; + }else { m->getline(in); } + + m->gobble(in); + } + in.close(); + out.close(); + + if (wroteSomething == false) { m->mothurOut("Your file does NOT contain sequences from the groups you wish to get."); m->mothurOutEndLine(); } + outputTypes["count"].push_back(outputFileName); outputNames.push_back(outputFileName); + + m->mothurOut("Selected " + toString(selectedCount) + " sequences from your count file."); m->mothurOutEndLine(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, 
"GetGroupsCommand", "readCount"); + exit(1); + } +} +//********************************************************************************************************************** int GetGroupsCommand::readDesign(){ try { string thisOutputDir = outputDir; diff --git a/getgroupscommand.h b/getgroupscommand.h index 80230b4..6bb6088 100644 --- a/getgroupscommand.h +++ b/getgroupscommand.h @@ -40,7 +40,7 @@ private: map uniqueToRedundant; //if a namefile is given and the first column name is not selected //then the other files need to change the unique name in their file to match. //only add the names that need to be changed to keep the map search quick - string accnosfile, fastafile, namefile, groupfile, listfile, designfile, taxfile, outputDir, groups, sharedfile; + string accnosfile, countfile, fastafile, namefile, groupfile, listfile, designfile, taxfile, outputDir, groups, sharedfile; bool abort; vector outputNames, Groups; GroupMap* groupMap; @@ -48,6 +48,7 @@ private: int readFasta(); int readName(); int readGroup(); + int readCount(); int readList(); int readTax(); int fillNames(); diff --git a/getlineagecommand.cpp b/getlineagecommand.cpp index 1aba0fe..645655d 100644 --- a/getlineagecommand.cpp +++ b/getlineagecommand.cpp @@ -10,13 +10,15 @@ #include "getlineagecommand.h" #include "sequence.hpp" #include "listvector.hpp" +#include "counttable.h" //********************************************************************************************************************** vector GetLineageCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "FNGLT", 
"none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "FNGLT", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "FNGLT", "none",false,false); parameters.push_back(pgroup); CommandParameter plist("list", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(plist); CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "FNGLT", "none",false,true); parameters.push_back(ptaxonomy); CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(palignreport); @@ -38,9 +40,9 @@ vector GetLineageCommand::setParameters(){ string GetLineageCommand::getHelpString(){ try { string helpString = ""; - helpString += "The get.lineage command reads a taxonomy file and any of the following file types: fasta, name, group, list or alignreport file.\n"; + helpString += "The get.lineage command reads a taxonomy file and any of the following file types: fasta, name, group, count, list or alignreport file.\n"; helpString += "It outputs a file containing only the sequences from the taxonomy file that are from the taxon requested.\n"; - helpString += "The get.lineage command parameters are taxon, fasta, name, group, list, taxonomy, alignreport and dups. You must provide taxonomy unless you have a valid current taxonomy file.\n"; + helpString += "The get.lineage command parameters are taxon, fasta, name, group, count, list, taxonomy, alignreport and dups. You must provide taxonomy unless you have a valid current taxonomy file.\n"; helpString += "The dups parameter allows you to add the entire line from a name file if you add any name from the line. default=false. 
\n"; helpString += "The taxon parameter allows you to select the taxons you would like to get and is required.\n"; helpString += "You may enter your taxons with confidence scores, doing so will get only those sequences that belong to the taxonomy and whose cofidence scores is above the scores you give.\n"; @@ -70,6 +72,7 @@ string GetLineageCommand::getOutputFileNameTag(string type, string inputName="") if (type == "fasta") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "taxonomy") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "name") { outputFileName = "pick" + m->getExtension(inputName); } + else if (type == "count") { outputFileName = "pick.count_table"; } else if (type == "group") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "list") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "alignreport") { outputFileName = "pick.align.report"; } @@ -94,6 +97,7 @@ GetLineageCommand::GetLineageCommand(){ outputTypes["group"] = tempOutNames; outputTypes["alignreport"] = tempOutNames; outputTypes["list"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "GetLineageCommand", "GetLineageCommand"); @@ -131,6 +135,7 @@ GetLineageCommand::GetLineageCommand(string option) { outputTypes["group"] = tempOutNames; outputTypes["alignreport"] = tempOutNames; outputTypes["list"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -187,6 +192,14 @@ GetLineageCommand::GetLineageCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. 
if (path == "") { parameters["taxonomy"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -230,6 +243,19 @@ GetLineageCommand::GetLineageCommand(string option) { else { temp = "false"; usedDups = ""; } } dups = m->isTrue(temp); + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } taxons = validParameter.validFile(parameters, "taxon", false); if (taxons == "not found") { taxons = ""; m->mothurOut("No taxons given, please correct."); m->mothurOutEndLine(); abort = true; } @@ -240,12 +266,14 @@ GetLineageCommand::GetLineageCommand(string option) { } m->splitAtChar(taxons, listOfTaxons, '-'); - if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy or listfile."); m->mothurOutEndLine(); abort = true; } + if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (countfile == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, count, alignreport, taxonomy or listfile."); m->mothurOutEndLine(); abort = true; } - if ((namefile == "") 
&& ((fastafile != "") || (taxfile != ""))){ - vector files; files.push_back(fastafile); files.push_back(taxfile); - parser.getNameFile(files); - } + if (countfile == "") { + if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){ + vector files; files.push_back(fastafile); files.push_back(taxfile); + parser.getNameFile(files); + } + } } } @@ -262,11 +290,18 @@ int GetLineageCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } if (m->control_pressed) { return 0; } + + if (countfile != "") { + if ((fastafile != "") || (listfile != "") || (taxfile != "")) { + m->mothurOut("\n[NOTE]: The count file should contain only unique names, so mothur assumes your fasta, list and taxonomy files also contain only uniques.\n\n"); + } + } //read through the correct file and output lines you want to keep if (taxfile != "") { readTax(); } //fills the set of names to get if (namefile != "") { readName(); } if (fastafile != "") { readFasta(); } + if (countfile != "") { readCount(); } if (groupfile != "") { readGroup(); } if (alignfile != "") { readAlign(); } if (listfile != "") { readList(); } @@ -305,7 +340,12 @@ int GetLineageCommand::execute(){ itTypes = outputTypes.find("taxonomy"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); } - } + } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } } return 0; @@ -353,7 +393,7 @@ int GetLineageCommand::readFasta(){ in.close(); out.close(); - if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); } + if (wroteSomething == false) { m->mothurOut("Your file contains does not contain any sequences from " + taxons + "."); m->mothurOutEndLine(); } outputNames.push_back(outputFileName); 
outputTypes["fasta"].push_back(outputFileName); return 0; @@ -365,6 +405,60 @@ int GetLineageCommand::readFasta(){ } } //********************************************************************************************************************** +int GetLineageCommand::readCount(){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(countfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile); + + ofstream out; + m->openOutputFile(outputFileName, out); + + ifstream in; + m->openInputFile(countfile, in); + + bool wroteSomething = false; + + string headers = m->getline(in); m->gobble(in); + out << headers << endl; + + string name, rest; int thisTotal; + while (!in.eof()) { + + if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; } + + in >> name; m->gobble(in); + in >> thisTotal; m->gobble(in); + rest = m->getline(in); m->gobble(in); + if (m->debug) { m->mothurOut("[DEBUG]: " + name + '\t' + rest + "\n"); } + + if (names.count(name) != 0) { + out << name << '\t' << thisTotal << '\t' << rest << endl; + wroteSomething = true; + } + } + in.close(); + out.close(); + + //check for groups that have been eliminated + CountTable ct; + if (ct.testGroups(outputFileName)) { + ct.readTable(outputFileName); + ct.printTable(outputFileName); + } + + + if (wroteSomething == false) { m->mothurOut("Your file contains does not contain any sequences from " + taxons + "."); m->mothurOutEndLine(); } + outputTypes["count"].push_back(outputFileName); outputNames.push_back(outputFileName); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "GetLineageCommand", "readCount"); + exit(1); + } +} +//********************************************************************************************************************** int GetLineageCommand::readList(){ try { string thisOutputDir = outputDir; @@ -425,7 +519,7 @@ int 
GetLineageCommand::readList(){ in.close(); out.close(); - if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); } + if (wroteSomething == false) { m->mothurOut("Your file contains does not contain any sequences from " + taxons + "."); m->mothurOutEndLine(); } outputNames.push_back(outputFileName); outputTypes["list"].push_back(outputFileName); return 0; @@ -510,7 +604,7 @@ int GetLineageCommand::readName(){ in.close(); out.close(); - if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); } + if (wroteSomething == false) { m->mothurOut("Your file contains does not contain any sequences from " + taxons + "."); m->mothurOutEndLine(); } outputNames.push_back(outputFileName); outputTypes["name"].push_back(outputFileName); return 0; @@ -558,7 +652,7 @@ int GetLineageCommand::readGroup(){ in.close(); out.close(); - if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); } + if (wroteSomething == false) { m->mothurOut("Your file contains does not contain any sequences from " + taxons + "."); m->mothurOutEndLine(); } outputNames.push_back(outputFileName); outputTypes["group"].push_back(outputFileName); return 0; @@ -606,15 +700,17 @@ int GetLineageCommand::readTax(){ in >> name; //read from first column in >> tax; //read from second column + string noQuotesTax = m->removeQuotes(tax); + for (int j = 0; j < listOfTaxons.size(); j++) { - string newtax = tax; + string newtax = noQuotesTax; //if the users file contains confidence scores we want to ignore them when searching for the taxons, unless the taxon has them if (!taxonsHasConfidence[j]) { - int hasConfidences = tax.find_first_of('('); + int hasConfidences = noQuotesTax.find_first_of('('); if (hasConfidences != string::npos) { - newtax = tax; + newtax = noQuotesTax; 
m->removeConfidences(newtax); } @@ -627,7 +723,7 @@ int GetLineageCommand::readTax(){ break; } }else{//if listOfTaxons[i] has them and you don't them remove taxons - int hasConfidences = tax.find_first_of('('); + int hasConfidences = noQuotesTax.find_first_of('('); if (hasConfidences == string::npos) { int pos = newtax.find(noConfidenceTaxons[j]); @@ -641,10 +737,10 @@ int GetLineageCommand::readTax(){ }else { //both have confidences so we want to make sure the users confidences are greater then or equal to the taxons //first remove confidences from both and see if the taxonomy exists - string noNewTax = tax; - int hasConfidences = tax.find_first_of('('); + string noNewTax = noQuotesTax; + int hasConfidences = noQuotesTax.find_first_of('('); if (hasConfidences != string::npos) { - noNewTax = tax; + noNewTax = noQuotesTax; m->removeConfidences(noNewTax); } @@ -814,7 +910,7 @@ int GetLineageCommand::readAlign(){ in.close(); out.close(); - if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); } + if (wroteSomething == false) { m->mothurOut("Your file contains does not contain any sequences from " + taxons + "."); m->mothurOutEndLine(); } outputNames.push_back(outputFileName); outputTypes["alignreport"].push_back(outputFileName); return 0; diff --git a/getlineagecommand.h b/getlineagecommand.h index 0ab042b..99bc0fa 100644 --- a/getlineagecommand.h +++ b/getlineagecommand.h @@ -36,11 +36,12 @@ class GetLineageCommand : public Command { private: set names; vector outputNames, listOfTaxons; - string fastafile, namefile, groupfile, alignfile, listfile, taxfile, outputDir, taxons; + string fastafile, namefile, groupfile, alignfile, countfile, listfile, taxfile, outputDir, taxons; bool abort, dups; int readFasta(); int readName(); + int readCount(); int readGroup(); int readAlign(); int readList(); diff --git a/getoturepcommand.cpp b/getoturepcommand.cpp index 4967f24..9f4dd54 100644 --- 
a/getoturepcommand.cpp +++ b/getoturepcommand.cpp @@ -41,9 +41,10 @@ vector GetOTURepCommand::setParameters(){ try { CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist); CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pfasta); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pphylip); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "ColumnName",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "ColumnName",false,false); parameters.push_back(pcolumn); CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); @@ -68,9 +69,9 @@ vector GetOTURepCommand::setParameters(){ string GetOTURepCommand::getHelpString(){ try { string helpString = ""; - helpString += "The get.oturep command parameters are phylip, column, list, fasta, name, group, large, weighted, cutoff, precision, groups, sorted and label. 
The list parameter is required, as well as phylip or column and name, unless you have valid current files.\n"; + helpString += "The get.oturep command parameters are phylip, column, list, fasta, name, group, count, large, weighted, cutoff, precision, groups, sorted and label. The list parameter is required, as well as phylip or column and name, unless you have valid current files.\n"; helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n"; - helpString += "The phylip or column parameter is required, but only one may be used. If you use a column file the name filename is required. \n"; + helpString += "The phylip or column parameter is required, but only one may be used. If you use a column file the name or count filename is required. \n"; helpString += "If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n"; helpString += "The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n"; helpString += "Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n"; @@ -106,6 +107,7 @@ string GetOTURepCommand::getOutputFileNameTag(string type, string inputName=""){ else { if (type == "fasta") { outputFileName = "rep.fasta"; } else if (type == "name") { outputFileName = "rep.names"; } + else if (type == "count") { outputFileName = "rep.count_table"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } return outputFileName; @@ -123,6 +125,7 @@ GetOTURepCommand::GetOTURepCommand(){ vector tempOutNames; outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "GetOTURepCommand", 
"GetOTURepCommand"); @@ -157,6 +160,7 @@ GetOTURepCommand::GetOTURepCommand(string option) { vector tempOutNames; outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -210,6 +214,14 @@ GetOTURepCommand::GetOTURepCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["group"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -245,6 +257,24 @@ GetOTURepCommand::GetOTURepCommand(string option) { if (namefile == "not open") { abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + + hasGroups = false; + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not found") { countfile = ""; } + else if (countfile == "not open") { abort = true; countfile = ""; } + else { + m->setCountTableFile(countfile); + ct.readTable(countfile); + if (ct.hasGroupInfo()) { hasGroups = true; } + } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } if ((phylipfile == "") && (columnfile == "")) { //is there are current file available for either of these? 
//give priority to column, then phylip @@ -261,14 +291,18 @@ GetOTURepCommand::GetOTURepCommand(string option) { }else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; } if (columnfile != "") { - if (namefile == "") { + if ((namefile == "") && (countfile == "")) { namefile = m->getNameFile(); if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); } else { - m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); - abort = true; + countfile = m->getCountTableFile(); + if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); } + else { + m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine(); + abort = true; + } } - } + } } //check for optional parameter and set defaults @@ -292,15 +326,15 @@ GetOTURepCommand::GetOTURepCommand(string option) { sorted = ""; } - if ((sorted == "group") && (groupfile == "")) { - m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine(); + if ((sorted == "group") && ((groupfile == "")&& !hasGroups)) { + m->mothurOut("You must provide a groupfile or have a count file with group info to sort by group. 
I will not sort."); m->mothurOutEndLine(); sorted = ""; } groups = validParameter.validFile(parameters, "groups", false); if (groups == "not found") { groups = ""; } else { - if (groupfile == "") { + if ((groupfile == "") && (!hasGroups)) { m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine(); abort = true; }else { @@ -340,106 +374,9 @@ int GetOTURepCommand::execute(){ int error; list = NULL; - if (!large) { - //read distance files - if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); } - else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); } - else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; } - - readMatrix->setCutoff(cutoff); - - if(namefile != ""){ - nameMap = new NameAssignment(namefile); - nameMap->readMap(); - }else{ nameMap = NULL; } - - readMatrix->read(nameMap); - - if (m->control_pressed) { delete readMatrix; return 0; } - - list = readMatrix->getListVector(); - - SparseDistanceMatrix* matrix = readMatrix->getDMatrix(); - - // Create a data structure to quickly access the distance information. - // It consists of a vector of distance maps, where each map contains - // all distances of a certain sequence. 
Vector and maps are accessed - // via the index of a sequence in the distance matrix - seqVec = vector(list->size()); - for (int i = 0; i < matrix->seqVec.size(); i++) { - for (int j = 0; j < matrix->seqVec[i].size(); j++) { - if (m->control_pressed) { delete readMatrix; return 0; } - //already added everyone else in row - if (i < matrix->seqVec[i][j].index) { seqVec[i][matrix->seqVec[i][j].index] = matrix->seqVec[i][j].dist; } - } - } - //add dummy map for unweighted calc - SeqMap dummy; - seqVec.push_back(dummy); - - delete matrix; - delete readMatrix; - delete nameMap; - - if (m->control_pressed) { return 0; } - }else { - //process file and set up indexes - if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); } - else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); } - else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; } - - formatMatrix->setCutoff(cutoff); - - if(namefile != ""){ - nameMap = new NameAssignment(namefile); - nameMap->readMap(); - }else{ nameMap = NULL; } - - formatMatrix->read(nameMap); - - if (m->control_pressed) { delete formatMatrix; return 0; } - - list = formatMatrix->getListVector(); - - distFile = formatMatrix->getFormattedFileName(); - - //positions in file where the distances for each sequence begin - //rowPositions[1] = position in file where distance related to sequence 1 start. 
- rowPositions = formatMatrix->getRowPositions(); - rowPositions.push_back(-1); //dummy row for unweighted calc - - delete formatMatrix; - delete nameMap; - - //openfile for getMap to use - m->openInputFile(distFile, inRow); - - if (m->control_pressed) { inRow.close(); m->mothurRemove(distFile); return 0; } - } - - - //list bin 0 = first name read in distance matrix, list bin 1 = second name read in distance matrix - if (list != NULL) { - vector names; - string binnames; - //map names to rows in sparsematrix - for (int i = 0; i < list->size(); i++) { - names.clear(); - binnames = list->get(i); - - m->splitAtComma(binnames, names); - - for (int j = 0; j < names.size(); j++) { - nameToIndex[names[j]] = i; - } - } - } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); } - + readDist(); - if (m->control_pressed) { - if (large) { inRow.close(); m->mothurRemove(distFile); } - return 0; - } + if (m->control_pressed) { if (large) { inRow.close(); m->mothurRemove(distFile); } return 0; } if (groupfile != "") { //read in group map info. @@ -448,13 +385,18 @@ int GetOTURepCommand::execute(){ if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. 
Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = ""; } if (Groups.size() != 0) { - SharedUtil* util = new SharedUtil(); + SharedUtil util; vector gNamesOfGroups = groupMap->getNamesOfGroups(); - util->setGroups(Groups, gNamesOfGroups, "getoturep"); + util.setGroups(Groups, gNamesOfGroups, "getoturep"); groupMap->setNamesOfGroups(gNamesOfGroups); - delete util; } - } + }else if (hasGroups) { + if (Groups.size() != 0) { + SharedUtil util; + vector gNamesOfGroups = ct.getNamesOfGroups(); + util.setGroups(Groups, gNamesOfGroups, "getoturep"); + } + } //done with listvector from matrix if (list != NULL) { delete list; } @@ -595,6 +537,11 @@ int GetOTURepCommand::execute(){ if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); } } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); @@ -608,7 +555,116 @@ int GetOTURepCommand::execute(){ exit(1); } } +//********************************************************************************************************************** +int GetOTURepCommand::readDist() { + try { + + if (!large) { + //read distance files + if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); } + else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); } + else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; } + + readMatrix->setCutoff(cutoff); + + NameAssignment* nameMap = NULL; + if(namefile != ""){ + nameMap = new NameAssignment(namefile); + nameMap->readMap(); + readMatrix->read(nameMap); + }else if (countfile != "") { + readMatrix->read(&ct); + } + + if (m->control_pressed) { delete readMatrix; return 0; } + + list = readMatrix->getListVector(); + SparseDistanceMatrix* matrix = 
readMatrix->getDMatrix(); + + // Create a data structure to quickly access the distance information. + // It consists of a vector of distance maps, where each map contains + // all distances of a certain sequence. Vector and maps are accessed + // via the index of a sequence in the distance matrix + seqVec = vector(list->size()); + for (int i = 0; i < matrix->seqVec.size(); i++) { + for (int j = 0; j < matrix->seqVec[i].size(); j++) { + if (m->control_pressed) { delete readMatrix; return 0; } + //already added everyone else in row + if (i < matrix->seqVec[i][j].index) { seqVec[i][matrix->seqVec[i][j].index] = matrix->seqVec[i][j].dist; } + } + } + //add dummy map for unweighted calc + SeqMap dummy; + seqVec.push_back(dummy); + + delete matrix; + delete readMatrix; + delete nameMap; + + if (m->control_pressed) { return 0; } + }else { + //process file and set up indexes + if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); } + else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); } + else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; } + + formatMatrix->setCutoff(cutoff); + + NameAssignment* nameMap = NULL; + if(namefile != ""){ + nameMap = new NameAssignment(namefile); + nameMap->readMap(); + readMatrix->read(nameMap); + }else if (countfile != "") { + readMatrix->read(&ct); + } + + if (m->control_pressed) { delete formatMatrix; return 0; } + + list = formatMatrix->getListVector(); + distFile = formatMatrix->getFormattedFileName(); + + //positions in file where the distances for each sequence begin + //rowPositions[1] = position in file where distance related to sequence 1 start. 
+ rowPositions = formatMatrix->getRowPositions(); + rowPositions.push_back(-1); //dummy row for unweighted calc + + delete formatMatrix; + delete nameMap; + + //openfile for getMap to use + m->openInputFile(distFile, inRow); + + if (m->control_pressed) { inRow.close(); m->mothurRemove(distFile); return 0; } + } + + + //list bin 0 = first name read in distance matrix, list bin 1 = second name read in distance matrix + if (list != NULL) { + vector names; + string binnames; + //map names to rows in sparsematrix + for (int i = 0; i < list->size(); i++) { + names.clear(); + binnames = list->get(i); + + m->splitAtComma(binnames, names); + + for (int j = 0; j < names.size(); j++) { + nameToIndex[names[j]] = i; + } + } + } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); } + if (m->control_pressed) { if (large) { inRow.close(); m->mothurRemove(distFile); }return 0; } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "GetOTURepCommand", "execute"); + exit(1); + } +} //********************************************************************************************************************** void GetOTURepCommand::readNamesFile() { try { @@ -677,32 +733,38 @@ void GetOTURepCommand::readNamesFile(bool w) { } } //********************************************************************************************************************** -string GetOTURepCommand::findRep(vector names) { +string GetOTURepCommand::findRep(vector names, string group) { try{ // if only 1 sequence in bin or processing the "unique" label, then // the first sequence of the OTU is the representative one if ((names.size() == 1)) { return names[0]; }else{ - vector seqIndex(names.size()); - vector max_dist(names.size()); - vector total_dist(names.size()); + vector seqIndex; //(names.size()); map::iterator itNameFile; map::iterator itNameIndex; //fill seqIndex and initialize sums for (size_t i = 0; i < names.size(); i++) { if (weighted) { - seqIndex[i] = nameToIndex[names[i]]; + 
seqIndex.push_back(nameToIndex[names[i]]); + if (countfile != "") { //if countfile is not blank then we can assume the list file contains only uniques, otherwise we assume list file contains everyone. + int numRep = 0; + if (group != "") { numRep = ct.getGroupCount(names[i], group); } + else { numRep = ct.getGroupCount(names[i]); } + for (int j = 1; j < numRep; j++) { //don't add yourself again + seqIndex.push_back(nameToIndex[names[i]]); + } + } }else { if (namefile == "") { itNameIndex = nameToIndex.find(names[i]); if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique - if (large) { seqIndex[i] = (rowPositions.size()-1); } - else { seqIndex[i] = (seqVec.size()-1); } + if (large) { seqIndex.push_back((rowPositions.size()-1)); } + else { seqIndex.push_back((seqVec.size()-1)); } }else { - seqIndex[i] = itNameIndex->second; + seqIndex.push_back(itNameIndex->second); } }else { @@ -715,17 +777,18 @@ string GetOTURepCommand::findRep(vector names) { string name2 = itNameFile->second; if (name1 == name2) { //then you are unique so add your real dists - seqIndex[i] = nameToIndex[names[i]]; + seqIndex.push_back(nameToIndex[names[i]]); }else { //add dummy - if (large) { seqIndex[i] = (rowPositions.size()-1); } - else { seqIndex[i] = (seqVec.size()-1); } + if (large) { seqIndex.push_back((rowPositions.size()-1)); } + else { seqIndex.push_back((seqVec.size()-1)); } } } } } - max_dist[i] = 0.0; - total_dist[i] = 0.0; } + + vector max_dist(seqIndex.size(), 0.0); + vector total_dist(seqIndex.size(), 0.0); // loop through all entries in seqIndex SeqMap::iterator it; @@ -795,19 +858,33 @@ int GetOTURepCommand::process(ListVector* processList) { map filehandles; if (Groups.size() == 0) { //you don't want to use groups - outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." 
+ getOutputFileNameTag("name"); - m->openOutputFile(outputNamesFile, newNamesOutput); - outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile); + outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "."; + if (countfile == "") { + outputNamesFile += getOutputFileNameTag("name"); + outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile); + }else { + outputNamesFile += getOutputFileNameTag("count"); + outputNames.push_back(outputNamesFile); outputTypes["count"].push_back(outputNamesFile); + } outputNameFiles[outputNamesFile] = processList->getLabel(); + m->openOutputFile(outputNamesFile, newNamesOutput); + newNamesOutput << "noGroup" << endl; }else{ //you want to use groups ofstream* temp; for (int i=0; igetRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + "." + getOutputFileNameTag("name"); + outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + "."; + if (countfile == "") { + outputNamesFile += getOutputFileNameTag("name"); + outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile); + }else { + outputNamesFile += getOutputFileNameTag("count"); + outputNames.push_back(outputNamesFile); outputTypes["count"].push_back(outputNamesFile); + } m->openOutputFile(outputNamesFile, *(temp)); - outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile); + *(temp) << Groups[i] << endl; outputNameFiles[outputNamesFile] = processList->getLabel() + "." 
+ Groups[i]; } } @@ -832,7 +909,7 @@ int GetOTURepCommand::process(ListVector* processList) { m->splitAtComma(temp, namesInBin); if (Groups.size() == 0) { - nameRep = findRep(namesInBin); + nameRep = findRep(namesInBin, ""); newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl; }else{ map > NamesInGroup; @@ -841,20 +918,25 @@ int GetOTURepCommand::process(ListVector* processList) { } for (int j=0; jgetGroup(namesInBin[j]); - - if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; } - - if (m->inUsersGroups(thisgroup, Groups)) { //add this name to correct group - NamesInGroup[thisgroup].push_back(namesInBin[j]); - } + if (groupfile != "") { + string thisgroup = groupMap->getGroup(namesInBin[j]); + if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; } + + //add this name to correct group + if (m->inUsersGroups(thisgroup, Groups)) { NamesInGroup[thisgroup].push_back(namesInBin[j]); } + }else { + vector thisSeqsGroups = ct.getGroups(namesInBin[j]); + for (int k = 0; k < thisSeqsGroups.size(); k++) { + if (m->inUsersGroups(thisSeqsGroups[k], Groups)) { NamesInGroup[thisSeqsGroups[k]].push_back(namesInBin[j]); } + } + } } //get rep for each group in otu for (int j=0; jopenOutputFile(tempNameFile, out2); - + ifstream in; m->openInputFile(filename, in); int i = 0; + string tempGroup = ""; + in >> tempGroup; m->gobble(in); + + CountTable thisCt; + if (countfile != "") { + thisCt.readTable(countfile); + if (tempGroup != "noGroup") { out2 << "Representative_Sequence\ttotal\t" << tempGroup << endl; } + } + + int thistotal = 0; while (!in.eof()) { string rep, binnames; in >> i >> rep >> binnames; m->gobble(in); - out2 << rep << '\t' << binnames << endl; vector names; m->splitAtComma(binnames, names); int binsize = names.size(); - + + if (countfile 
== "") { out2 << rep << '\t' << binnames << endl; } + else { + if (tempGroup == "noGroup") { + for (int j = 0; j < names.size(); j++) { + if (names[j] != rep) { thisCt.mergeCounts(rep, names[j]); } + } + binsize = thisCt.getNumSeqs(rep); + }else { + int total = 0; + for (int j = 0; j < names.size(); j++) { total += thisCt.getGroupCount(names[j], tempGroup); } + out2 << rep << '\t' << total << '\t' << total << endl; + binsize = total; + } + } + thistotal += binsize; //if you have a groupfile string group = ""; + map groups; + map::iterator groupIt; if (groupfile != "") { - map groups; - map::iterator groupIt; - //find the groups that are in this bin - for (size_t i = 0; i < names.size(); i++) { + for (int i = 0; i < names.size(); i++) { string groupName = groupMap->getGroup(names[i]); if (groupName == "not found") { m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine(); @@ -937,7 +1042,21 @@ int GetOTURepCommand::processFastaNames(string filename, string label) { } //rip off last dash group = group.substr(0, group.length()-1); - }else{ group = ""; } + }else if (hasGroups) { + map groups; + for (int i = 0; i < names.size(); i++) { + vector thisSeqsGroups = ct.getGroups(names[i]); + for (int j = 0; j < thisSeqsGroups.size(); j++) { groups[thisSeqsGroups[j]] = thisSeqsGroups[j]; } + } + //turn the groups into a string + for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) { + group += groupIt->first + "-"; + } + //rip off last dash + group = group.substr(0, group.length()-1); + //cout << group << endl; + } + else{ group = ""; } //print out name and sequence for that bin @@ -947,7 +1066,7 @@ int GetOTURepCommand::processFastaNames(string filename, string label) { if (sorted == "") { //print them out rep = rep + "\t" + toString(i+1); rep = rep + "|" + toString(binsize); - if (groupfile != "") { + if (group != "") { rep = rep + "|" + group; } out << ">" << rep << endl; @@ -973,7 +1092,7 @@ int 
GetOTURepCommand::processFastaNames(string filename, string label) { string sequence = fasta->getSequence(reps[i].name); string outputName = reps[i].name + "\t" + toString(reps[i].bin); outputName = outputName + "|" + toString(reps[i].size); - if (groupfile != "") { + if (reps[i].group != "") { outputName = outputName + "|" + reps[i].group; } out << ">" << outputName << endl; @@ -984,9 +1103,11 @@ int GetOTURepCommand::processFastaNames(string filename, string label) { in.close(); out.close(); out2.close(); - + m->mothurRemove(filename); rename(tempNameFile.c_str(), filename.c_str()); + + if ((countfile != "") && (tempGroup == "noGroup")) { thisCt.printTable(filename); } return 0; @@ -1012,10 +1133,35 @@ int GetOTURepCommand::processNames(string filename, string label) { int i = 0; string rep, binnames; + + string tempGroup = ""; + in >> tempGroup; m->gobble(in); + + CountTable thisCt; + if (countfile != "") { + thisCt.readTable(countfile); + if (tempGroup != "noGroup") { out2 << "Representative_Sequence\ttotal\t" << tempGroup << endl; } + } + while (!in.eof()) { if (m->control_pressed) { break; } in >> i >> rep >> binnames; m->gobble(in); - out2 << rep << '\t' << binnames << endl; + + if (countfile == "") { out2 << rep << '\t' << binnames << endl; } + else { + vector names; + m->splitAtComma(binnames, names); + if (tempGroup == "noGroup") { + for (int j = 0; j < names.size(); j++) { + if (names[j] != rep) { thisCt.mergeCounts(rep, names[j]); } + } + }else { + int total = 0; + for (int j = 0; j < names.size(); j++) { total += thisCt.getGroupCount(names[j], tempGroup); } + out2 << rep << '\t' << total << '\t' << total << endl; + } + } + } in.close(); out2.close(); @@ -1023,6 +1169,8 @@ int GetOTURepCommand::processNames(string filename, string label) { m->mothurRemove(filename); rename(tempNameFile.c_str(), filename.c_str()); + if ((countfile != "") && (tempGroup == "noGroup")) { thisCt.printTable(filename); } + return 0; } catch(exception& e) { diff --git 
a/getoturepcommand.h b/getoturepcommand.h index d19a396..3906329 100644 --- a/getoturepcommand.h +++ b/getoturepcommand.h @@ -18,6 +18,7 @@ #include "groupmap.h" #include "readmatrix.hpp" #include "formatmatrix.h" +#include "counttable.h" typedef map SeqMap; @@ -60,10 +61,11 @@ private: ReadMatrix* readMatrix; FormatMatrix* formatMatrix; NameAssignment* nameMap; - string filename, fastafile, listfile, namefile, groupfile, label, sorted, phylipfile, columnfile, distFile, format, outputDir, groups; + CountTable ct; + string filename, fastafile, listfile, namefile, groupfile, label, sorted, phylipfile, countfile, columnfile, distFile, format, outputDir, groups; ofstream out; ifstream in, inNames, inRow; - bool abort, allLines, groupError, large, weighted; + bool abort, allLines, groupError, large, weighted, hasGroups; set labels; //holds labels to be used map nameToIndex; //maps sequence name to index in sparsematrix map nameFileMap; @@ -79,9 +81,10 @@ private: void readNamesFile(bool); int process(ListVector*); SeqMap getMap(int); - string findRep(vector); // returns the name of the "representative" sequence of given bin or subset of a bin, for groups + string findRep(vector, string); // returns the name of the "representative" sequence of given bin or subset of a bin, for groups int processNames(string, string); int processFastaNames(string, string); + int readDist(); }; #endif diff --git a/getseqscommand.cpp b/getseqscommand.cpp index ccabafb..6b16111 100644 --- a/getseqscommand.cpp +++ b/getseqscommand.cpp @@ -10,13 +10,15 @@ #include "getseqscommand.h" #include "sequence.hpp" #include "listvector.hpp" +#include "counttable.h" //********************************************************************************************************************** vector GetSeqsCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", 
"", "none", "FNGLT", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "FNGLT", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "FNGLT", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "FNGLT", "none",false,false); parameters.push_back(pgroup); CommandParameter plist("list", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(plist); CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(ptaxonomy); CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(palignreport); @@ -40,7 +42,7 @@ vector GetSeqsCommand::setParameters(){ string GetSeqsCommand::getHelpString(){ try { string helpString = ""; - helpString += "The get.seqs command reads an .accnos file and any of the following file types: fasta, name, group, list, taxonomy, quality or alignreport file.\n"; + helpString += "The get.seqs command reads an .accnos file and any of the following file types: fasta, name, group, count, list, taxonomy, quality or alignreport file.\n"; helpString += "It outputs a file containing only the sequences in the .accnos file.\n"; helpString += "The get.seqs command parameters are accnos, fasta, name, group, list, taxonomy, qfile, alignreport and dups. You must provide accnos unless you have a valid current accnos file, and at least one of the other parameters.\n"; helpString += "The dups parameter allows you to add the entire line from a name file if you add any name from the line. default=false. 
\n"; @@ -68,6 +70,7 @@ GetSeqsCommand::GetSeqsCommand(){ outputTypes["alignreport"] = tempOutNames; outputTypes["list"] = tempOutNames; outputTypes["qfile"] = tempOutNames; + outputTypes["count"] = tempOutNames; outputTypes["accnosreport"] = tempOutNames; } catch(exception& e) { @@ -88,6 +91,7 @@ string GetSeqsCommand::getOutputFileNameTag(string type, string inputName=""){ if (type == "fasta") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "taxonomy") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "name") { outputFileName = "pick" + m->getExtension(inputName); } + else if (type == "count") { outputFileName = "pick.count_table"; } else if (type == "group") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "list") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "qfile") { outputFileName = "pick" + m->getExtension(inputName); } @@ -135,6 +139,7 @@ GetSeqsCommand::GetSeqsCommand(string option) { outputTypes["list"] = tempOutNames; outputTypes["qfile"] = tempOutNames; outputTypes["accnosreport"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -215,6 +220,14 @@ GetSeqsCommand::GetSeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["qfile"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -270,17 +283,32 @@ GetSeqsCommand::GetSeqsCommand(string option) { if (accnosfile2 == "not open") { abort = true; } else if (accnosfile2 == "not found") { accnosfile2 = ""; } + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + string usedDups = "true"; string temp = validParameter.validFile(parameters, "dups", false); if (temp == "not found") { temp = "true"; usedDups = ""; } dups = m->isTrue(temp); - if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (qualfile == "") && (accnosfile2 == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy, quality or listfile."); m->mothurOutEndLine(); abort = true; } - - if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){ - vector files; files.push_back(fastafile); files.push_back(taxfile); - parser.getNameFile(files); - } + if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (qualfile == "") && (accnosfile2 == "") && (countfile == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, count, alignreport, taxonomy, quality or listfile."); m->mothurOutEndLine(); abort = true; } + + if (countfile == "") { + if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){ + vector files; 
files.push_back(fastafile); files.push_back(taxfile); + parser.getNameFile(files); + } + } } } @@ -300,11 +328,18 @@ int GetSeqsCommand::execute(){ names = m->readAccnos(accnosfile); if (m->control_pressed) { return 0; } + + if (countfile != "") { + if ((fastafile != "") || (listfile != "") || (taxfile != "")) { + m->mothurOut("\n[NOTE]: The count file should contain only unique names, so mothur assumes your fasta, list and taxonomy files also contain only uniques.\n\n"); + } + } //read through the correct file and output lines you want to keep if (namefile != "") { readName(); } if (fastafile != "") { readFasta(); } if (groupfile != "") { readGroup(); } + if (countfile != "") { readCount(); } if (alignfile != "") { readAlign(); } if (listfile != "") { readList(); } if (taxfile != "") { readTax(); } @@ -354,6 +389,10 @@ int GetSeqsCommand::execute(){ if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); } } + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } } return 0; @@ -493,6 +532,64 @@ int GetSeqsCommand::readQual(){ exit(1); } } +//********************************************************************************************************************** +int GetSeqsCommand::readCount(){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(countfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile); + + ofstream out; + m->openOutputFile(outputFileName, out); + + ifstream in; + m->openInputFile(countfile, in); + + bool wroteSomething = false; + int selectedCount = 0; + + string headers = m->getline(in); m->gobble(in); + out << headers << endl; + + string name, rest; int thisTotal; + while (!in.eof()) { + + if (m->control_pressed) { in.close(); out.close(); 
m->mothurRemove(outputFileName); return 0; } + + in >> name; m->gobble(in); + in >> thisTotal; m->gobble(in); + rest = m->getline(in); m->gobble(in); + if (m->debug) { m->mothurOut("[DEBUG]: " + name + '\t' + rest + "\n"); } + + if (names.count(name) != 0) { + out << name << '\t' << thisTotal << '\t' << rest << endl; + wroteSomething = true; + selectedCount+= thisTotal; + } + } + in.close(); + out.close(); + + //check for groups that have been eliminated + CountTable ct; + if (ct.testGroups(outputFileName)) { + ct.readTable(outputFileName); + ct.printTable(outputFileName); + } + + if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); } + outputTypes["count"].push_back(outputFileName); outputNames.push_back(outputFileName); + + m->mothurOut("Selected " + toString(selectedCount) + " sequences from your count file."); m->mothurOutEndLine(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "GetSeqsCommand", "readCount"); + exit(1); + } +} + //********************************************************************************************************************** int GetSeqsCommand::readList(){ try { @@ -528,19 +625,16 @@ int GetSeqsCommand::readList(){ //parse out names that are in accnos file string binnames = list.get(i); + vector bnames; + m->splitAtComma(binnames, bnames); string newNames = ""; - while (binnames.find_first_of(',') != -1) { - string name = binnames.substr(0,binnames.find_first_of(',')); - binnames = binnames.substr(binnames.find_first_of(',')+1, binnames.length()); - + for (int i = 0; i < bnames.size(); i++) { + string name = bnames[i]; //if that name is in the .accnos file, add it if (names.count(name) != 0) { newNames += name + ","; selectedCount++; if (m->debug) { sanity["list"].insert(name); } } } - //get last name - if (names.count(binnames) != 0) { newNames += binnames + ","; selectedCount++; if (m->debug) { sanity["list"].insert(binnames); } } - //if there 
are names in this bin add to new list if (newNames != "") { newNames = newNames.substr(0, newNames.length()-1); //rip off extra comma diff --git a/getseqscommand.h b/getseqscommand.h index c71b5f2..60e471e 100644 --- a/getseqscommand.h +++ b/getseqscommand.h @@ -35,7 +35,7 @@ class GetSeqsCommand : public Command { private: set names; vector outputNames; - string accnosfile, accnosfile2, fastafile, namefile, groupfile, alignfile, listfile, taxfile, qualfile, outputDir; + string accnosfile, accnosfile2, fastafile, namefile, countfile, groupfile, alignfile, listfile, taxfile, qualfile, outputDir; bool abort, dups; //for debug @@ -44,6 +44,7 @@ class GetSeqsCommand : public Command { int readFasta(); int readName(); int readGroup(); + int readCount(); int readAlign(); int readList(); int readTax(); diff --git a/groupmap.cpp b/groupmap.cpp index 612b236..fb2495c 100644 --- a/groupmap.cpp +++ b/groupmap.cpp @@ -44,6 +44,7 @@ int GroupMap::readMap() { if (pairDone) { setNamesOfGroups(seqGroup); + if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); } it = groupmap.find(seqName); if (it != groupmap.end()) { error = 1; m->mothurOut("Your groupfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); } @@ -57,6 +58,30 @@ int GroupMap::readMap() { } fileHandle.close(); + if (rest != "") { + vector pieces = m->splitWhiteSpace(rest); + + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { seqName = pieces[i]; columnOne=false; } + else { seqGroup = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { + setNamesOfGroups(seqGroup); + + if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); } + + it = groupmap.find(seqName); + + if (it != groupmap.end()) { error = 1; m->mothurOut("Your groupfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. 
Please correct."); m->mothurOutEndLine(); } + else { + groupmap[seqName] = seqGroup; //store data in map + seqsPerGroup[seqGroup]++; //increment number of seqs in that group + } + pairDone = false; + } + } + } + m->setAllGroups(namesOfGroups); return error; } @@ -88,6 +113,8 @@ int GroupMap::readDesignMap() { if (pairDone) { setNamesOfGroups(seqGroup); + if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); } + it = groupmap.find(seqName); if (it != groupmap.end()) { error = 1; m->mothurOut("Your designfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); } @@ -101,6 +128,31 @@ int GroupMap::readDesignMap() { } fileHandle.close(); + if (rest != "") { + vector pieces = m->splitWhiteSpace(rest); + + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { seqName = pieces[i]; columnOne=false; } + else { seqGroup = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { + setNamesOfGroups(seqGroup); + + if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); } + + it = groupmap.find(seqName); + + if (it != groupmap.end()) { error = 1; m->mothurOut("Your designfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. 
Please correct."); m->mothurOutEndLine(); } + else { + groupmap[seqName] = seqGroup; //store data in map + seqsPerGroup[seqGroup]++; //increment number of seqs in that group + } + pairDone = false; + } + } + + } + m->setAllGroups(namesOfGroups); return error; } @@ -110,6 +162,79 @@ int GroupMap::readDesignMap() { } } /************************************************************/ +int GroupMap::readMap(string filename) { + try { + groupFileName = filename; + m->openInputFile(filename, fileHandle); + index = 0; + string seqName, seqGroup; + int error = 0; + string rest = ""; + char buffer[4096]; + bool pairDone = false; + bool columnOne = true; + + while (!fileHandle.eof()) { + if (m->control_pressed) { fileHandle.close(); return 1; } + + fileHandle.read(buffer, 4096); + vector pieces = m->splitWhiteSpace(rest, buffer, fileHandle.gcount()); + + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { seqName = pieces[i]; columnOne=false; } + else { seqGroup = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { + setNamesOfGroups(seqGroup); + + if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); } + + it = groupmap.find(seqName); + + if (it != groupmap.end()) { error = 1; m->mothurOut("Your group file contains more than 1 sequence named " + seqName + ", sequence names must be unique. 
Please correct."); m->mothurOutEndLine(); } + else { + groupmap[seqName] = seqGroup; //store data in map + seqsPerGroup[seqGroup]++; //increment number of seqs in that group + } + pairDone = false; + } + } + } + fileHandle.close(); + + if (rest != "") { + vector pieces = m->splitWhiteSpace(rest); + + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { seqName = pieces[i]; columnOne=false; } + else { seqGroup = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { + setNamesOfGroups(seqGroup); + + if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); } + + it = groupmap.find(seqName); + + if (it != groupmap.end()) { error = 1; m->mothurOut("Your group file contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); } + else { + groupmap[seqName] = seqGroup; //store data in map + seqsPerGroup[seqGroup]++; //increment number of seqs in that group + } + pairDone = false; + } + } + } + + m->setAllGroups(namesOfGroups); + return error; + } + catch(exception& e) { + m->errorOut(e, "GroupMap", "readMap"); + exit(1); + } +} +/************************************************************/ int GroupMap::readDesignMap(string filename) { try { groupFileName = filename; @@ -135,6 +260,8 @@ int GroupMap::readDesignMap(string filename) { if (pairDone) { setNamesOfGroups(seqGroup); + if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); } + it = groupmap.find(seqName); if (it != groupmap.end()) { error = 1; m->mothurOut("Your designfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. 
Please correct."); m->mothurOutEndLine(); } @@ -148,6 +275,30 @@ int GroupMap::readDesignMap(string filename) { } fileHandle.close(); + if (rest != "") { + vector pieces = m->splitWhiteSpace(rest); + + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { seqName = pieces[i]; columnOne=false; } + else { seqGroup = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { + setNamesOfGroups(seqGroup); + + if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); } + + it = groupmap.find(seqName); + + if (it != groupmap.end()) { error = 1; m->mothurOut("Your designfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); } + else { + groupmap[seqName] = seqGroup; //store data in map + seqsPerGroup[seqGroup]++; //increment number of seqs in that group + } + pairDone = false; + } + } + } + m->setAllGroups(namesOfGroups); return error; } diff --git a/groupmap.h b/groupmap.h index 567165d..d698495 100644 --- a/groupmap.h +++ b/groupmap.h @@ -21,6 +21,7 @@ public: GroupMap(string); ~GroupMap(); int readMap(); + int readMap(string); int readDesignMap(); int readDesignMap(string); int getNumGroups(); diff --git a/hcluster.cpp b/hcluster.cpp index 6cd4531..f8f4809 100644 --- a/hcluster.cpp +++ b/hcluster.cpp @@ -10,7 +10,6 @@ #include "hcluster.h" #include "rabundvector.hpp" #include "listvector.hpp" -#include "sparsematrix.hpp" /***********************************************************************/ HCluster::HCluster(RAbundVector* rav, ListVector* lv, string ms, string d, NameAssignment* n, float c) : rabund(rav), list(lv), method(ms), distfile(d), nameMap(n), cutoff(c) { diff --git a/heatmapsimcommand.cpp b/heatmapsimcommand.cpp index 3de10e6..8a4a12b 100644 --- a/heatmapsimcommand.cpp +++ b/heatmapsimcommand.cpp @@ -25,7 +25,8 @@ vector HeatMapSimCommand::setParameters(){ try { CommandParameter pshared("shared", "InputTypes", "", "", 
"PhylipColumnShared", "PhylipColumnShared", "none",false,false); parameters.push_back(pshared); CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnShared", "PhylipColumnShared", "none",false,false); parameters.push_back(pphylip); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount); CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnShared", "PhylipColumnShared", "ColumnName",false,false); parameters.push_back(pcolumn); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); @@ -48,9 +49,8 @@ string HeatMapSimCommand::getHelpString(){ try { string helpString = ""; ValidCalculators validCalculator; - helpString += "The heatmap.sim command parameters are shared, phylip, column, name, groups, calc, fontsize and label. shared or phylip or column and name are required unless valid current files exist.\n"; - helpString += "There are two ways to use the heatmap.sim command. The first is with the read.otu command. \n"; - helpString += "With the read.otu command you may use the groups, label and calc parameters. \n"; + helpString += "The heatmap.sim command parameters are shared, phylip, column, name, count, groups, calc, fontsize and label. shared or phylip or column and name are required unless valid current files exist.\n"; + helpString += "There are two ways to use the heatmap.sim command. The first is with a shared file, and you may use the groups, label and calc parameter. 
\n"; helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like included in your heatmap.\n"; helpString += "The group names are separated by dashes. The label parameter allows you to select what distance levels you would like a heatmap created for, and is also separated by dashes.\n"; helpString += "The fontsize parameter allows you to adjust the font size of the picture created, default=24.\n"; @@ -174,6 +174,14 @@ HeatMapSimCommand::HeatMapSimCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["shared"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } //required parameters @@ -197,6 +205,12 @@ HeatMapSimCommand::HeatMapSimCommand(string option) { else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; countfile = ""; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((countfile != "") && (namefile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } //error checking on files if ((sharedfile == "") && ((phylipfile == "") && (columnfile == ""))) { @@ -224,8 +238,12 @@ HeatMapSimCommand::HeatMapSimCommand(string option) { namefile = m->getNameFile(); if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); } else { - m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); - abort = 
true; + countfile = m->getCountTableFile(); + if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); } + else { + m->mothurOut("You need to provide a name or count file if you are going to use the column format."); m->mothurOutEndLine(); + abort = true; + } } } } @@ -520,20 +538,28 @@ int HeatMapSimCommand::runCommandDist() { in.close(); }else { //read names file - NameAssignment* nameMap = new NameAssignment(namefile); - nameMap->readMap(); - - //put names in order in vector - for (int i = 0; i < nameMap->size(); i++) { - names.push_back(nameMap->get(i)); - } - - //resize matrix - matrix.resize(nameMap->size()); - for (int i = 0; i < nameMap->size(); i++) { - matrix[i].resize(nameMap->size(), 0.0); - } - + NameAssignment* nameMap; + CountTable ct; + if (namefile != "") { + nameMap = new NameAssignment(namefile); + nameMap->readMap(); + + //put names in order in vector + for (int i = 0; i < nameMap->size(); i++) { + names.push_back(nameMap->get(i)); + } + }else if (countfile != "") { + nameMap = NULL; + ct.readTable(countfile); + names = ct.getNamesOfSeqs(); + } + + //resize matrix + matrix.resize(names.size()); + for (int i = 0; i < names.size(); i++) { + matrix[i].resize(names.size(), 0.0); + } + //read column file string first, second; double dist; @@ -544,19 +570,26 @@ int HeatMapSimCommand::runCommandDist() { if (m->control_pressed) { return 0; } - map::iterator itA = nameMap->find(first); - map::iterator itB = nameMap->find(second); - - if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + first + "' was not found in the names file, please correct\n"); exit(1); } - if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + second + "' was not found in the names file, please correct\n"); exit(1); } - - //save distance - matrix[itA->second][itB->second] = dist; - matrix[itB->second][itA->second] = dist; + if (namefile != "") { + map::iterator itA = nameMap->find(first); + 
map::iterator itB = nameMap->find(second); + + if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + first + "' was not found in the names file, please correct\n"); exit(1); } + if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + second + "' was not found in the names file, please correct\n"); exit(1); } + + //save distance + matrix[itA->second][itB->second] = dist; + matrix[itB->second][itA->second] = dist; + }else if (countfile != "") { + int itA = ct.get(first); + int itB = ct.get(second); + matrix[itA][itB] = dist; + matrix[itB][itA] = dist; + } } in.close(); - delete nameMap; + if (namefile != "") { delete nameMap; } } diff --git a/heatmapsimcommand.h b/heatmapsimcommand.h index 7b74880..2c3a470 100644 --- a/heatmapsimcommand.h +++ b/heatmapsimcommand.h @@ -43,7 +43,7 @@ private: OptionParser* parser; bool abort, allLines; set labels; //holds labels to be used - string format, groups, label, calc, sharedfile, phylipfile, columnfile, namefile, outputDir, inputfile; + string format, groups, label, calc, sharedfile, phylipfile, columnfile, countfile, namefile, outputDir, inputfile; vector Estimators, Groups, outputNames; int fontsize; diff --git a/indicatorcommand.cpp b/indicatorcommand.cpp index f98620b..dc9f121 100644 --- a/indicatorcommand.cpp +++ b/indicatorcommand.cpp @@ -287,17 +287,22 @@ int IndicatorCommand::execute(){ string groupfile = ""; m->setTreeFile(treefile); Tree* tree = new Tree(treefile); delete tree; //extracts names from tree to make faked out groupmap - treeMap = new TreeMap(); + ct = new CountTable(); bool mismatch = false; - - for (int i = 0; i < m->Treenames.size(); i++) { - //sanity check - is this a group that is not in the sharedfile? + + set nameMap; + map groupMap; + set gps; + for (int i = 0; i < m->Treenames.size(); i++) { + nameMap.insert(m->Treenames[i]); + //sanity check - is this a group that is not in the sharedfile? 
if (designfile == "") { + if (i == 0) { gps.insert("Group1"); } if (!(m->inUsersGroups(m->Treenames[i], m->getAllGroups()))) { m->mothurOut("[ERROR]: " + m->Treenames[i] + " is not a group in your shared or relabund file."); m->mothurOutEndLine(); mismatch = true; } - treeMap->addSeq(m->Treenames[i], "Group1"); + groupMap[m->Treenames[i]] = "Group1"; }else{ vector myGroups; myGroups.push_back(m->Treenames[i]); vector myNames = designMap->getNamesSeqs(myGroups); @@ -308,9 +313,10 @@ int IndicatorCommand::execute(){ mismatch = true; } } - treeMap->addSeq(m->Treenames[i], "Group1"); + groupMap[m->Treenames[i]] = "Group1"; } - } + } + ct->createTable(nameMap, groupMap, gps); if ((designfile != "") && (m->Treenames.size() != Groups.size())) { cout << Groups.size() << '\t' << m->Treenames.size() << endl; m->mothurOut("[ERROR]: You design file does not match your tree, aborting."); m->mothurOutEndLine(); mismatch = true; } @@ -318,14 +324,14 @@ int IndicatorCommand::execute(){ if (designfile != "") { delete designMap; } if (sharedfile != "") { for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } } else { for (int i = 0; i < lookupFloat.size(); i++) { delete lookupFloat[i]; } } - delete treeMap; + delete ct; return 0; } read = new ReadNewickTree(treefile); - int readOk = read->read(treeMap); + int readOk = read->read(ct); - if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete treeMap; delete read; return 0; } + if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete ct; delete read; return 0; } vector T = read->getTrees(); @@ -335,19 +341,18 @@ int IndicatorCommand::execute(){ if (designfile != "") { delete designMap; } if (sharedfile != "") { for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } } else { for (int i = 0; i < lookupFloat.size(); i++) { delete lookupFloat[i]; } } - for (int i = 0; i < T.size(); i++) { delete T[i]; } delete treeMap; return 0; + for (int i = 0; i < T.size(); i++) { 
delete T[i]; } delete ct; return 0; } - map nameMap; - T[0]->assembleTree(nameMap); + T[0]->assembleTree(); /***************************************************/ // create ouptut tree - respecting pickedGroups // /***************************************************/ - Tree* outputTree = new Tree(m->getNumGroups(), treeMap); + Tree* outputTree = new Tree(m->getNumGroups(), ct); outputTree->getSubTree(T[0], m->getGroups()); - outputTree->assembleTree(nameMap); + outputTree->assembleTree(); //no longer need original tree, we have output tree to use and label for (int i = 0; i < T.size(); i++) { delete T[i]; } @@ -356,14 +361,14 @@ int IndicatorCommand::execute(){ if (designfile != "") { delete designMap; } if (sharedfile != "") { for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } } else { for (int i = 0; i < lookupFloat.size(); i++) { delete lookupFloat[i]; } } - delete outputTree; delete treeMap; return 0; + delete outputTree; delete ct; return 0; } /***************************************************/ // get indicator species values // /***************************************************/ GetIndicatorSpecies(outputTree); - delete outputTree; delete treeMap; + delete outputTree; delete ct; }else { //run with design file only //get indicator species diff --git a/indicatorcommand.h b/indicatorcommand.h index 2c36c35..3c24dfb 100644 --- a/indicatorcommand.h +++ b/indicatorcommand.h @@ -12,7 +12,7 @@ #include "command.hpp" #include "readtree.h" -#include "treemap.h" +#include "counttable.h" #include "sharedrabundvector.h" #include "sharedrabundfloatvector.h" #include "inputdata.h" @@ -36,7 +36,7 @@ public: private: ReadTree* read; - TreeMap* treeMap; + CountTable* ct; GroupMap* designMap; string treefile, sharedfile, relabundfile, groups, label, inputFileName, outputDir, designfile; bool abort; diff --git a/kmernode.cpp b/kmernode.cpp new file mode 100755 index 0000000..c087cac --- /dev/null +++ b/kmernode.cpp @@ -0,0 +1,209 @@ +/* + * kmerNode.cpp + * 
bayesian + * + * Created by Pat Schloss on 10/11/11. + * Copyright 2011 Patrick D. Schloss. All rights reserved. + * + */ + +#include "kmernode.h" + + +/**********************************************************************************************************************/ + +KmerNode::KmerNode(string s, int l, int n) : TaxonomyNode(s, l), kmerSize(n) { + try { + int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 }; + + numPossibleKmers = power4s[kmerSize]; + numUniqueKmers = 0; + + kmerVector.assign(numPossibleKmers, 0); + } + catch(exception& e) { + m->errorOut(e, "KmerNode", "KmerNode"); + exit(1); + } +} + +/**********************************************************************************************************************/ + +void KmerNode::loadSequence(vector& kmerProfile){ + try { + for(int i=0;icontrol_pressed) { break; } + if(kmerVector[i] == 0 && kmerProfile[i] != 0) { numUniqueKmers++; } + + kmerVector[i] += kmerProfile[i]; + } + + numSeqs++; + } + catch(exception& e) { + m->errorOut(e, "KmerNode", "loadSequence"); + exit(1); + } +} + +/**********************************************************************************************************************/ + +string KmerNode::getKmerBases(int kmerNumber){ + try { + // Here we convert the kmer number into the kmer in terms of bases. 
+ // + // Example: Score = 915 (for a 6-mer) + // Base6 = (915 / 4^0) % 4 = 915 % 4 = 3 => T [T] + // Base5 = (915 / 4^1) % 4 = 228 % 4 = 0 => A [AT] + // Base4 = (915 / 4^2) % 4 = 57 % 4 = 1 => C [CAT] + // Base3 = (915 / 4^3) % 4 = 14 % 4 = 2 => G [GCAT] + // Base2 = (915 / 4^4) % 4 = 3 % 4 = 3 => T [TGCAT] + // Base1 = (915 / 4^5) % 4 = 0 % 4 = 0 => A [ATGCAT] -> this checks out with the previous method + + int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 }; + + string kmer = ""; + + if(kmerNumber == power4s[kmerSize]){//pow(4.,7)){ // if the kmer number is the same as the maxKmer then it must + for(int i=0;icontrol_pressed) { return kmer; } + int nt = (int)(kmerNumber / (float)power4s[i]) % 4; // the '%' operator returns the remainder + if(nt == 0) { kmer = 'A' + kmer; } // from int-based division ] + else if(nt == 1){ kmer = 'C' + kmer; } + else if(nt == 2){ kmer = 'G' + kmer; } + else if(nt == 3){ kmer = 'T' + kmer; } + } + } + return kmer; + } + catch(exception& e) { + m->errorOut(e, "KmerNode", "getKmerBases"); + exit(1); + } +} + +/**************************************************************************************************/ + +void KmerNode::addThetas(vector newTheta, int newNumSeqs){ + try { + for(int i=0;icontrol_pressed) { break; } + kmerVector[i] += newTheta[i]; + } + + // if(alignLength == 0){ + // alignLength = (int)newTheta.size(); + // theta.resize(alignLength); + // columnCounts.resize(alignLength); + // } + // + // for(int i=0;ierrorOut(e, "KmerNode", "addThetas"); + exit(1); + } +} + +/**********************************************************************************************************************/ + +int KmerNode::getNumUniqueKmers(){ + try { + if(numUniqueKmers == 0){ + + for(int i=0;icontrol_pressed) { return numUniqueKmers; } + if(kmerVector[i] != 0){ + numUniqueKmers++; + } + + } + + } + + return numUniqueKmers; + + } + catch(exception& e) { + m->errorOut(e, "KmerNode", 
"getNumUniqueKmers"); + exit(1); + } +} + +/**********************************************************************************************************************/ + +void KmerNode::printTheta(){ + try { + m->mothurOut(name + "\n"); + for(int i=0;imothurOut(getKmerBases(i) + '\t' + toString(kmerVector[i]) + "\n"); + } + } + m->mothurOutEndLine(); + } + catch(exception& e) { + m->errorOut(e, "KmerNode", "printTheta"); + exit(1); + } + +} +/**************************************************************************************************/ + +double KmerNode::getSimToConsensus(vector& queryKmerProfile){ + try { + double present = 0; + + for(int i=0;icontrol_pressed) { return present; } + if(queryKmerProfile[i] != 0 && kmerVector[i] != 0){ + present++; + } + } + + return present / double(queryKmerProfile.size() - kmerSize + 1); + } + catch(exception& e) { + m->errorOut(e, "KmerNode", "getSimToConsensus"); + exit(1); + } +} + +/**********************************************************************************************************************/ + +double KmerNode::getPxGivenkj_D_j(vector& queryKmerProfile) { + try { + double sumLogProb = 0.0000; + double alpha = 1.0 / (double)totalSeqs; //flat prior + // double alpha = pow((1.0 / (double)numUniqueKmers), numSeqs)+0.0001; //non-flat prior + + for(int i=0;icontrol_pressed) { return sumLogProb; } + if(queryKmerProfile[i] != 0){ //numUniqueKmers needs to be the value from Root; + sumLogProb += log((kmerVector[i] + alpha) / (numSeqs + numUniqueKmers * alpha)); + } + + } + return sumLogProb; + } + catch(exception& e) { + m->errorOut(e, "KmerNode", "getPxGivenkj_D_j"); + exit(1); + } + +} + +/**********************************************************************************************************************/ diff --git a/kmernode.h b/kmernode.h new file mode 100755 index 0000000..e15fb1d --- /dev/null +++ b/kmernode.h @@ -0,0 +1,45 @@ +#ifndef KMERNODE +#define KMERNODE + +/* + * kmerNode.h + * bayesian + * + * Created by 
Pat Schloss on 10/11/11. + * Copyright 2011 Patrick D. Schloss. All rights reserved. + * + */ + + +#include "taxonomynode.h" + +/**********************************************************************************************************************/ + +class KmerNode : public TaxonomyNode { + +public: + KmerNode(string, int, int); + void loadSequence(vector&); + void printTheta(); + double getPxGivenkj_D_j(vector&); + double getSimToConsensus(vector&); + void checkTheta(){}; + void setNumUniqueKmers(int num) { numUniqueKmers = num; } + int getNumUniqueKmers(); + void addThetas(vector, int); + vector getTheta() { return kmerVector; } + + +private: + string getKmerBases(int); + int kmerSize; // value of k + int numPossibleKmers; // 4^kmerSize + int numUniqueKmers; // number of unique kmers seen in a group ~ O_kj + int numKmers; // number of kmers in a sequence + vector kmerVector; // counts of kmers across all sequences in a node +}; + +/**********************************************************************************************************************/ + +#endif + diff --git a/kmertree.cpp b/kmertree.cpp new file mode 100755 index 0000000..fbf2bfb --- /dev/null +++ b/kmertree.cpp @@ -0,0 +1,386 @@ +// +// kmerTree.cpp +// pdsBayesian +// +// Created by Patrick Schloss on 4/3/12. +// Copyright (c) 2012 University of Michigan. All rights reserved. 
+// + +#include "kmernode.h" +#include "kmertree.h" + +/**************************************************************************************************/ + +KmerTree::KmerTree(string referenceFileName, string taxonomyFileName, int k, int cutoff) : Classify(), confidenceThreshold(cutoff), kmerSize(k){ + try { + KmerNode* newNode = new KmerNode("Root", 0, kmerSize); + tree.push_back(newNode); // the tree is stored as a vector of elements of type TaxonomyNode + + int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 }; + numPossibleKmers = power4s[kmerSize]; + + string refTaxonomy; + + readTaxonomy(taxonomyFileName); + + ifstream referenceFile; + m->openInputFile(referenceFileName, referenceFile); + bool error = false; + while(!referenceFile.eof()){ + + if (m->control_pressed) { break; } + + Sequence seq(referenceFile); m->gobble(referenceFile); + + if (seq.getName() != "") { + map::iterator it = taxonomy.find(seq.getName()); + + if (it != taxonomy.end()) { + refTaxonomy = it->second; // lookup the taxonomy string for the current reference sequence + vector kmerProfile = ripKmerProfile(seq.getUnaligned()); //convert to kmer vector + addTaxonomyToTree(seq.getName(), refTaxonomy, kmerProfile); + }else { + m->mothurOut(seq.getName() + " is in your reference file, but not in your taxonomy file, please correct.\n"); error = true; + } + } + } + referenceFile.close(); + + if (error) { m->control_pressed = true; } + + numTaxa = (int)tree.size(); + numLevels = 0; + for(int i=0;igetLevel(); + if(level > numLevels){ numLevels = level; } + } + numLevels++; + + aggregateThetas(); + + int dbSize = tree[0]->getNumSeqs(); + + for(int i=0;icheckTheta(); + tree[i]->setNumUniqueKmers(tree[0]->getNumUniqueKmers()); + tree[i]->setTotalSeqs(dbSize); + } + } + catch(exception& e) { + m->errorOut(e, "KmerTree", "KmerTree"); + exit(1); + } +} + 
+/**************************************************************************************************/ + +KmerTree::~KmerTree(){ + + for(int i=0;i KmerTree::ripKmerProfile(string sequence){ + try { + // assume all input sequences are unaligned + + int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 }; + + int nKmers = (int)sequence.length() - kmerSize + 1; + + vector kmerProfile(numPossibleKmers + 1, 0); + + for(int i=0;icontrol_pressed) { break; } + + int kmer = 0; + for(int j=0;jerrorOut(e, "KmerTree", "ripKmerProfile"); + exit(1); + } +} + +/**************************************************************************************************/ + +int KmerTree::addTaxonomyToTree(string seqName, string taxonomy, vector& sequence){ + try { + KmerNode* newNode; + string taxonName = ""; + int treePosition = 0; // the root is element 0 + + + int level = 1; + + for(int i=0;icontrol_pressed) { break; } + if(taxonomy[i] == ';'){ // looking for semicolons... + + if (taxonName == "") { m->mothurOut(seqName + " has an error in the taxonomy. This may be due to a ;;"); m->mothurOutEndLine(); m->control_pressed = true; } + + int newIndex = tree[treePosition]->getChildIndex(taxonName);// look to see if your current node already + // has a child with the new taxonName + if(newIndex != -1) { treePosition = newIndex; } // if you've seen it before, jump to that + else { // position in the tree + int newChildIndex = (int)tree.size(); // otherwise, we'll have to create one... + tree[treePosition]->makeChild(taxonName, newChildIndex); + + newNode = new KmerNode(taxonName, level, kmerSize); + + newNode->setParent(treePosition); + + tree.push_back(newNode); + treePosition = newChildIndex; + } + + // sequence data to that node to update that node's theta - seems slow... 
+ taxonName = ""; // clear out the taxon name that we will build as we look + level++; + + } // for a semicolon + else{ + taxonName += taxonomy[i]; // keep adding letters until we reach a semicolon + } + } + + tree[treePosition]->loadSequence(sequence); // now that we've gotten to the correct node, add the + + return 0; + } + catch(exception& e) { + m->errorOut(e, "KmerTree", "addTaxonomyToTree"); + exit(1); + } + +} + +/**************************************************************************************************/ + +int KmerTree::aggregateThetas(){ + try { + vector > levelMatrix(numLevels+1); + + for(int i=0;icontrol_pressed) { return 0; } + levelMatrix[tree[i]->getLevel()].push_back(i); + } + + for(int i=numLevels-1;i>0;i--) { + if (m->control_pressed) { return 0; } + + for(int j=0;jgetParent()]->addThetas(holder->getTheta(), holder->getNumSeqs()); + } + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "KmerTree", "aggregateThetas"); + exit(1); + } +} + +/**************************************************************************************************/ + +int KmerTree::getMinRiskIndexKmer(vector& sequence, vector& taxaIndices, vector& probabilities){ + try { + int numProbs = (int)probabilities.size(); + + vector G(numProbs, 0.2); //a random sequence will, on average, be 20% similar to any other sequence; not sure that this holds up for kmers; whatever. 
+ vector risk(numProbs, 0); + + for(int i=1;icontrol_pressed) { return 0; } + G[i] = tree[taxaIndices[i]]->getSimToConsensus(sequence); + } + + double minRisk = 1e6; + int minRiskIndex = 0; + + for(int i=0;icontrol_pressed) { return 0; } + for(int j=0;jerrorOut(e, "KmerTree", "getMinRiskIndexKmer"); + exit(1); + } +} + +/**************************************************************************************************/ + +int KmerTree::sanityCheck(vector >& indices, vector& maxIndices){ + try { + int finalLevel = (int)indices.size()-1; + + for(int position=1;positioncontrol_pressed) { return 0; } + int predictedParent = tree[indices[position][maxIndices[position]]]->getParent(); + int actualParent = indices[position-1][maxIndices[position-1]]; + + if(predictedParent != actualParent){ + finalLevel = position - 1; + return finalLevel; + } + } + return finalLevel; + } + catch(exception& e) { + m->errorOut(e, "KmerTree", "sanityCheck"); + exit(1); + } +} + +/**************************************************************************************************/ +string KmerTree::getTaxonomy(Sequence* thisSeq){ + try { + string seqName = thisSeq->getName(); string querySequence = thisSeq->getAligned(); string taxonProbabilityString = ""; + string unalignedSeq = thisSeq->getUnaligned(); + + double logPOutlier = (querySequence.length() - kmerSize + 1) * log(1.0/(double)tree[0]->getNumUniqueKmers()); + + vector queryProfile = ripKmerProfile(unalignedSeq); //convert to kmer vector + + vector > pXgivenKj_D_j(numLevels); + vector > indices(numLevels); + for(int i=0;icontrol_pressed) { return taxonProbabilityString; } + pXgivenKj_D_j[i].push_back(logPOutlier); + indices[i].push_back(-1); + } + + for(int i=0;icontrol_pressed) { return taxonProbabilityString; } + pXgivenKj_D_j[tree[i]->getLevel()].push_back(tree[i]->getPxGivenkj_D_j(queryProfile)); + indices[tree[i]->getLevel()].push_back(i); + } + + vector sumLikelihood(numLevels, 0); + vector bestPosterior(numLevels, 0); + vector 
maxIndex(numLevels, 0); + int maxPosteriorIndex; + + //let's find the best level and taxa within that level + for(int i=0;icontrol_pressed) { return taxonProbabilityString; } + + int numTaxaInLevel = (int)indices[i].size(); + + vector posteriors(numTaxaInLevel, 0); + sumLikelihood[i] = getLogExpSum(pXgivenKj_D_j[i], maxPosteriorIndex); + + maxPosteriorIndex = 0; + for(int j=0;j posteriors[maxPosteriorIndex]){ + maxPosteriorIndex = j; + } + + } + + maxIndex[i] = getMinRiskIndexKmer(queryProfile, indices[i], posteriors); + + maxIndex[i] = maxPosteriorIndex; + bestPosterior[i] = posteriors[maxIndex[i]]; + } + + // vector pX_level(numLevels, 0); + // + // for(int i=0;igetNumSeqs(); + // } + // + // int max_pLevel_X_index = -1; + // double pX_level_sum = getLogExpSum(pX_level, max_pLevel_X_index); + // double max_pLevel_X = exp(pX_level[max_pLevel_X_index] - pX_level_sum); + // + // vector pLevel_X(numLevels, 0); + // for(int i=0;icontrol_pressed) { return taxonProbabilityString; } + int confidenceScore = (int) (bestPosterior[i] * 100); + if (confidenceScore >= confidenceThreshold) { + if(indices[i][maxIndex[i]] != -1){ + taxonProbabilityString += tree[indices[i][maxIndex[i]]]->getName() + "(" + toString(confidenceScore) + ");"; + simpleTax += tree[indices[i][maxIndex[i]]]->getName() + ";"; + + // levelProbabilityOutput << tree[indices[i][maxIndex[i]]]->getName() << '(' << setprecision(6) << pLevel_X[i] << ");"; + } + else{ + taxonProbabilityString += "unclassified" + '(' + toString(confidenceScore) + ");"; + // levelProbabilityOutput << "unclassified" << '(' << setprecision(6) << pLevel_X[i] << ");"; + simpleTax += "unclassified;"; + } + }else { break; } + savedspot = i; + } + + + + for(int i=savedspot+1;icontrol_pressed) { return taxonProbabilityString; } + taxonProbabilityString += "unclassified(0);"; + simpleTax += "unclassified;"; + } + + return taxonProbabilityString; + } + catch(exception& e) { + m->errorOut(e, "KmerTree", "getTaxonomy"); + exit(1); + } +} + + 
+/**************************************************************************************************/ + diff --git a/kmertree.h b/kmertree.h new file mode 100755 index 0000000..f7c10ef --- /dev/null +++ b/kmertree.h @@ -0,0 +1,37 @@ +// +// kmerTree.h +// pdsBayesian +// +// Created by Patrick Schloss on 4/3/12. +// Copyright (c) 2012 University of Michigan. All rights reserved. +// + +#ifndef pdsBayesian_kmerTree_h +#define pdsBayesian_kmerTree_h + +#include "classify.h" + +class KmerNode; + +class KmerTree : public Classify { + +public: + KmerTree(string, string, int, int); + ~KmerTree(); + + string getTaxonomy(Sequence*); + +private: + int addTaxonomyToTree(string, string, vector&); + vector ripKmerProfile(string); + int getMinRiskIndexKmer(vector&, vector&, vector&); + int aggregateThetas(); + int sanityCheck(vector >&, vector&); + + int kmerSize; + int numPossibleKmers, confidenceThreshold; + vector tree; + +}; + +#endif diff --git a/knn.cpp b/knn.cpp index 837fa6d..81b21b2 100644 --- a/knn.cpp +++ b/knn.cpp @@ -14,6 +14,7 @@ Knn::Knn(string tfile, string tempFile, string method, int kmerSize, float gapOp : Classify(), num(n), search(method) { try { threadID = tid; + shortcuts = true; //create search database and names vector generateDatabaseAndNames(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch); diff --git a/listseqscommand.cpp b/listseqscommand.cpp index bfbb078..7c3f07f 100644 --- a/listseqscommand.cpp +++ b/listseqscommand.cpp @@ -10,6 +10,7 @@ #include "listseqscommand.h" #include "sequence.hpp" #include "listvector.hpp" +#include "counttable.h" //********************************************************************************************************************** @@ -17,6 +18,7 @@ vector ListSeqsCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "FNGLT", "FNGLT", "none",false,false); parameters.push_back(pfasta); CommandParameter pname("name", "InputTypes", "", "", "FNGLT", "FNGLT", 
"none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "FNGLT", "FNGLT", "none",false,false); parameters.push_back(pcount); CommandParameter pgroup("group", "InputTypes", "", "", "FNGLT", "FNGLT", "none",false,false); parameters.push_back(pgroup); CommandParameter plist("list", "InputTypes", "", "", "FNGLT", "FNGLT", "none",false,false); parameters.push_back(plist); CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "FNGLT", "FNGLT", "none",false,false); parameters.push_back(ptaxonomy); @@ -37,8 +39,8 @@ vector ListSeqsCommand::setParameters(){ string ListSeqsCommand::getHelpString(){ try { string helpString = ""; - helpString += "The list.seqs command reads a fasta, name, group, list, taxonomy or alignreport file and outputs a .accnos file containing sequence names.\n"; - helpString += "The list.seqs command parameters are fasta, name, group, list, taxonomy and alignreport. You must provide one of these parameters.\n"; + helpString += "The list.seqs command reads a fasta, name, group, count, list, taxonomy or alignreport file and outputs a .accnos file containing sequence names.\n"; + helpString += "The list.seqs command parameters are fasta, name, group, count, list, taxonomy and alignreport. You must provide one of these parameters.\n"; helpString += "The list.seqs command should be in the following format: list.seqs(fasta=yourFasta).\n"; helpString += "Example list.seqs(fasta=amazon.fasta).\n"; helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n"; @@ -164,6 +166,14 @@ ListSeqsCommand::ListSeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. 
if (path == "") { parameters["taxonomy"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } //check for required parameters @@ -195,8 +205,13 @@ ListSeqsCommand::ListSeqsCommand(string option) { if (taxfile == "not open") { abort = true; } else if (taxfile == "not found") { taxfile = ""; } else { m->setTaxonomyFile(taxfile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } - if ((fastafile == "") && (namefile == "") && (listfile == "") && (groupfile == "") && (alignfile == "") && (taxfile == "")) { m->mothurOut("You must provide a file."); m->mothurOutEndLine(); abort = true; } + if ((countfile == "") && (fastafile == "") && (namefile == "") && (listfile == "") && (groupfile == "") && (alignfile == "") && (taxfile == "")) { m->mothurOut("You must provide a file."); m->mothurOutEndLine(); abort = true; } int okay = 1; if (outputDir != "") { okay++; } @@ -225,6 +240,7 @@ int ListSeqsCommand::execute(){ else if (alignfile != "") { inputFileName = alignfile; readAlign(); } else if (listfile != "") { inputFileName = listfile; readList(); } else if (taxfile != "") { inputFileName = taxfile; readTax(); } + else if (countfile != "") { inputFileName = countfile; readCount(); } if (m->control_pressed) { outputTypes.clear(); return 0; } @@ -293,12 +309,6 @@ int ListSeqsCommand::readFasta(){ Sequence currSeq(in); name = currSeq.getName(); - //if (lastName == "") { lastName = name; } - //if (name != lastName) { count = 1; } - // lastName = name; - - //Sequence newSeq(name+"_"+toString(count), currSeq.getAligned()); - //newSeq.printSequence(out); if 
(name != "") { names.push_back(name); } @@ -404,7 +414,24 @@ int ListSeqsCommand::readGroup(){ exit(1); } } - +//********************************************************************************************************************** +int ListSeqsCommand::readCount(){ + try { + CountTable ct; + ct.readTable(countfile); + + if (m->control_pressed) { return 0; } + + names = ct.getNamesOfSeqs(); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ListSeqsCommand", "readCount"); + exit(1); + } +} //********************************************************************************************************************** //alignreport file has a column header line then all other lines contain 16 columns. we just want the first column since that contains the name int ListSeqsCommand::readAlign(){ diff --git a/listseqscommand.h b/listseqscommand.h index 1a31a6d..8e4cce3 100644 --- a/listseqscommand.h +++ b/listseqscommand.h @@ -34,7 +34,7 @@ class ListSeqsCommand : public Command { private: vector names, outputNames; - string fastafile, namefile, groupfile, alignfile, inputFileName, outputDir, listfile, taxfile; + string fastafile, namefile, groupfile, countfile, alignfile, inputFileName, outputDir, listfile, taxfile; bool abort; int readFasta(); @@ -43,6 +43,7 @@ class ListSeqsCommand : public Command { int readAlign(); int readList(); int readTax(); + int readCount(); }; diff --git a/macros.h b/macros.h new file mode 100755 index 0000000..f95acbe --- /dev/null +++ b/macros.h @@ -0,0 +1,32 @@ +// +// macros.h +// rrf-fs-prototype +// +// Created by Abu Zaher Faridee on 5/28/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. 
+// + +#ifndef rrf_fs_prototype_macros_h +#define rrf_fs_prototype_macros_h + +#include "mothurout.h" + +/***********************************************************************/ +class OptimumFeatureSubsetSelector{ +public: + OptimumFeatureSubsetSelector(string selectionType = "log2"): selectionType(selectionType){ + } + + int getOptimumFeatureSubsetSize(int numFeatures){ + + if (selectionType == "log2"){ return (int)ceil(log2(numFeatures)); } + else if (selectionType == "squareRoot"){ return (int)ceil(sqrt(numFeatures)); } + return -1; + } +private: + string selectionType; +}; + +/***********************************************************************/ + +#endif diff --git a/makebiomcommand.cpp b/makebiomcommand.cpp index 9e8d3e3..68e70ee 100644 --- a/makebiomcommand.cpp +++ b/makebiomcommand.cpp @@ -549,15 +549,16 @@ vector MakeBiomCommand::getMetaData(vector& lookup) if (m->control_pressed) { return metadata; } //if there is a bin label use it otherwise make one - string binLabel = binTag; - string sbinNumber = otuLabels[i]; - if (sbinNumber.length() < snumBins.length()) { - int diff = snumBins.length() - sbinNumber.length(); - for (int h = 0; h < diff; h++) { binLabel += "0"; } - } - binLabel += sbinNumber; - - labelTaxMap[binLabel] = taxs[i]; + if (m->isContainingOnlyDigits(otuLabels[i])) { + string binLabel = binTag; + string sbinNumber = otuLabels[i]; + if (sbinNumber.length() < snumBins.length()) { + int diff = snumBins.length() - sbinNumber.length(); + for (int h = 0; h < diff; h++) { binLabel += "0"; } + } + binLabel += sbinNumber; + labelTaxMap[binLabel] = taxs[i]; + }else { labelTaxMap[otuLabels[i]] = taxs[i]; } } diff --git a/makecontigscommand.cpp b/makecontigscommand.cpp index 691d706..4ae25ce 100644 --- a/makecontigscommand.cpp +++ b/makecontigscommand.cpp @@ -13,7 +13,15 @@ vector MakeContigsCommand::setParameters(){ try { CommandParameter pfasta("ffastq", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); 
CommandParameter prfasta("rfastq", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(prfasta); - CommandParameter palign("align", "Multiple", "needleman-gotoh", "needleman", "", "", "",false,false); parameters.push_back(palign); + CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos); + CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs); + CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs); + CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs); + CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs); + CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs); + + CommandParameter palign("align", "Multiple", "needleman-gotoh", "needleman", "", "", "",false,false); parameters.push_back(palign); + CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles); CommandParameter pmatch("match", "Number", "", "1.0", "", "", "",false,false); parameters.push_back(pmatch); CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pmismatch); CommandParameter pgapopen("gapopen", "Number", "", "-2.0", "", "", "",false,false); parameters.push_back(pgapopen); @@ -37,15 +45,22 @@ string MakeContigsCommand::getHelpString(){ try { string helpString = ""; helpString += "The make.contigs command reads a forward fastq file and a reverse fastq file and outputs new fasta and quality files.\n"; - helpString += "The make.contigs command parameters are ffastq, rfastq, align, match, mismatch, gapopen, gapextend and processors.\n"; + helpString += "If an oligos file is provided barcodes and primers will be trimmed, and 
a group file will be created.\n"; + helpString += "The make.contigs command parameters are ffastq, rfastq, oligos, tdiffs, bdiffs, ldiffs, sdiffs, pdiffs, align, match, mismatch, gapopen, gapextend, allfiles and processors.\n"; helpString += "The ffastq and rfastq parameters are required.\n"; helpString += "The align parameter allows you to specify the alignment method to use. Your options are: gotoh and needleman. The default is needleman.\n"; + helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n"; + helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n"; + helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n"; + helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n"; + helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n"; helpString += "The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n"; helpString += "The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.\n"; helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n"; helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -1.0.\n"; helpString += "The threshold parameter allows you to set a quality scores threshold. In the case where we are trying to decide whether to keep a base or remove it because the base is compared to a gap in the other fragment, if the base has a quality score below the threshold we eliminate it. 
Default=40.\n"; helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"; + helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n"; helpString += "The make.contigs command should be in the following format: \n"; helpString += "make.contigs(ffastq=yourForwardFastqFile, rfastq=yourReverseFastqFile, align=yourAlignmentMethod) \n"; helpString += "Note: No spaces between parameter labels (i.e. ffastq), '=' and parameters (i.e.yourForwardFastqFile).\n"; @@ -68,6 +83,7 @@ string MakeContigsCommand::getOutputFileNameTag(string type, string inputName="" else { if (type == "fasta") { outputFileName = "contigs.fasta"; } else if (type == "qfile") { outputFileName = "contigs.qual"; } + else if (type == "group") { outputFileName = "groups"; } else if (type == "mismatch") { outputFileName = "contigs.mismatch"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } @@ -86,6 +102,7 @@ MakeContigsCommand::MakeContigsCommand(){ vector tempOutNames; outputTypes["fasta"] = tempOutNames; outputTypes["qfile"] = tempOutNames; + outputTypes["group"] = tempOutNames; outputTypes["mismatch"] = tempOutNames; } catch(exception& e) { @@ -121,6 +138,7 @@ MakeContigsCommand::MakeContigsCommand(string option) { outputTypes["fasta"] = tempOutNames; outputTypes["qfile"] = tempOutNames; outputTypes["mismatch"] = tempOutNames; + outputTypes["group"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter @@ -143,6 +161,14 @@ MakeContigsCommand::MakeContigsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. 
if (path == "") { parameters["rfastq"] = inputDir + it->second; } } + + it = parameters.find("oligos"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["oligos"] = inputDir + it->second; } + } } ffastqfile = validParameter.validFile(parameters, "ffastq", true); @@ -153,6 +179,11 @@ MakeContigsCommand::MakeContigsCommand(string option) { if (rfastqfile == "not open") { rfastqfile = ""; abort = true; } else if (rfastqfile == "not found") { rfastqfile = ""; abort=true; m->mothurOut("The rfastq parameter is required.\n"); } + oligosfile = validParameter.validFile(parameters, "oligos", true); + if (oligosfile == "not found") { oligosfile = ""; } + else if(oligosfile == "not open") { abort = true; } + else { m->setOligosFile(oligosfile); } + //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(ffastqfile); } @@ -182,6 +213,26 @@ MakeContigsCommand::MakeContigsCommand(string option) { temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); m->mothurConvert(temp, processors); + + temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, bdiffs); + + temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, pdiffs); + + temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, ldiffs); + + temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, sdiffs); + + temp = 
validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); } + m->mothurConvert(temp, tdiffs); + + if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; } + + temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; } + allFiles = m->isTrue(temp); align = validParameter.validFile(parameters, "align", false); if (align == "not found"){ align = "needleman"; } if ((align != "needleman") && (align != "gotoh")) { m->mothurOut(align + " is not a valid alignment method. Options are needleman or gotoh. I will use needleman."); m->mothurOutEndLine(); align = "needleman"; } @@ -239,6 +290,12 @@ int MakeContigsCommand::execute(){ if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { currentQual = (itTypes->second)[0]; m->setQualFile(currentQual); } } + + string currentGroup = ""; + itTypes = outputTypes.find("group"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { currentGroup = (itTypes->second)[0]; m->setGroupFile(currentGroup); } + } //output files created by command m->mothurOutEndLine(); @@ -246,7 +303,6 @@ int MakeContigsCommand::execute(){ for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } m->mothurOutEndLine(); - return 0; } catch(exception& e) { @@ -700,6 +756,262 @@ bool MakeContigsCommand::checkReads(fastqRead& forward, fastqRead& reverse){ exit(1); } } +//*************************************************************************************************************** +//illumina data requires paired forward and reverse data +//BARCODE atgcatgc atgcatgc groupName +//PRIMER atgcatgc atgcatgc groupName +//PRIMER atgcatgc atgcatgc +bool MakeContigsCommand::getOligos(vector >& fastaFileNames, vector >& qualFileNames){ + try { + ifstream in; + m->openInputFile(oligosfile, in); + + ofstream test; + + string type, foligo, 
roligo, group; + + int indexPrimer = 0; + int indexBarcode = 0; + set uniquePrimers; + set uniqueBarcodes; + + while(!in.eof()){ + + in >> type; + + if (m->debug) { m->mothurOut("[DEBUG]: reading type - " + type + ".\n"); } + + if(type[0] == '#'){ + while (!in.eof()) { char c = in.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there + m->gobble(in); + } + else{ + m->gobble(in); + //make type case insensitive + for(int i=0;i> foligo; + + if (m->debug) { m->mothurOut("[DEBUG]: reading - " + foligo + ".\n"); } + + for(int i=0;igobble(in); + + in >> roligo; + + for(int i=0;imothurOut("primer pair " + newPrimer.forward + " " + newPrimer.reverse + " is in your oligos file already."); m->mothurOutEndLine(); } + else { uniquePrimers.insert(tempPair); } + + if (m->debug) { if (group != "") { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); }else{ m->mothurOut("[DEBUG]: no group for primer pair " + newPrimer.forward + " " + newPrimer.reverse + ".\n"); } } + + primers[indexPrimer]=newPrimer; indexPrimer++; + primerNameVector.push_back(group); + }else if(type == "BARCODE"){ + m->gobble(in); + + in >> roligo; + + for(int i=0;idebug) { m->mothurOut("[DEBUG]: barcode pair " + newPair.forward + " " + newPair.reverse + ", and group = " + group + ".\n"); } + + //check for repeat barcodes + string tempPair = foligo+roligo; + if (uniqueBarcodes.count(tempPair) != 0) { m->mothurOut("barcode pair " + newPair.forward + " " + newPair.reverse + " is in your oligos file already, disregarding."); m->mothurOutEndLine(); } + else { uniqueBarcodes.insert(tempPair); } + + barcodes[indexBarcode]=newPair; indexBarcode++; + barcodeNameVector.push_back(group); + }else if(type == "LINKER"){ + linker.push_back(foligo); + }else if(type == "SPACER"){ + spacer.push_back(foligo); + } + else{ m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are primer, barcode, linker and spacer. 
Ignoring " + foligo + "."); m->mothurOutEndLine(); } + } + m->gobble(in); + } + in.close(); + + if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; } + + //add in potential combos + if(barcodeNameVector.size() == 0){ + oligosPair temp("", ""); + barcodes[0] = temp; + barcodeNameVector.push_back(""); + } + + if(primerNameVector.size() == 0){ + oligosPair temp("", ""); + primers[0] = temp; + primerNameVector.push_back(""); + } + + fastaFileNames.resize(barcodeNameVector.size()); + for(int i=0;i uniqueNames; //used to cleanup outputFileNames + for(map::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){ + for(map::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){ + + string primerName = primerNameVector[itPrimer->first]; + string barcodeName = barcodeNameVector[itBar->first]; + + string comboGroupName = ""; + string fastaFileName = ""; + string qualFileName = ""; + string nameFileName = ""; + string countFileName = ""; + + if(primerName == ""){ + comboGroupName = barcodeNameVector[itBar->first]; + } + else{ + if(barcodeName == ""){ + comboGroupName = primerNameVector[itPrimer->first]; + } + else{ + comboGroupName = barcodeNameVector[itBar->first] + "." 
+ primerNameVector[itPrimer->first]; + } + } + + + ofstream temp; + fastaFileName = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + comboGroupName + ".fasta"; + if (uniqueNames.count(fastaFileName) == 0) { + outputNames.push_back(fastaFileName); + outputTypes["fasta"].push_back(fastaFileName); + uniqueNames.insert(fastaFileName); + } + + fastaFileNames[itBar->first][itPrimer->first] = fastaFileName; + m->openOutputFile(fastaFileName, temp); temp.close(); + + + qualFileName = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + comboGroupName + ".qual"; + if (uniqueNames.count(qualFileName) == 0) { + outputNames.push_back(qualFileName); + outputTypes["qfile"].push_back(qualFileName); + } + + qualFileNames[itBar->first][itPrimer->first] = qualFileName; + m->openOutputFile(qualFileName, temp); temp.close(); + } + } + } + + bool allBlank = true; + for (int i = 0; i < barcodeNameVector.size(); i++) { + if (barcodeNameVector[i] != "") { + allBlank = false; + break; + } + } + for (int i = 0; i < primerNameVector.size(); i++) { + if (primerNameVector[i] != "") { + allBlank = false; + break; + } + } + + if (allBlank) { + m->mothurOut("[WARNING]: your oligos file does not contain any group names. 
mothur will not create a groupfile."); m->mothurOutEndLine(); + allFiles = false; + return false; + } + + return true; + + } + catch(exception& e) { + m->errorOut(e, "MakeContigsCommand", "getOligos"); + exit(1); + } +} +//********************************************************************/ +string MakeContigsCommand::reverseOligo(string oligo){ + try { + string reverse = ""; + + for(int i=oligo.length()-1;i>=0;i--){ + + if(oligo[i] == 'A') { reverse += 'T'; } + else if(oligo[i] == 'T'){ reverse += 'A'; } + else if(oligo[i] == 'U'){ reverse += 'A'; } + + else if(oligo[i] == 'G'){ reverse += 'C'; } + else if(oligo[i] == 'C'){ reverse += 'G'; } + + else if(oligo[i] == 'R'){ reverse += 'Y'; } + else if(oligo[i] == 'Y'){ reverse += 'R'; } + + else if(oligo[i] == 'M'){ reverse += 'K'; } + else if(oligo[i] == 'K'){ reverse += 'M'; } + + else if(oligo[i] == 'W'){ reverse += 'W'; } + else if(oligo[i] == 'S'){ reverse += 'S'; } + + else if(oligo[i] == 'B'){ reverse += 'V'; } + else if(oligo[i] == 'V'){ reverse += 'B'; } + + else if(oligo[i] == 'D'){ reverse += 'H'; } + else if(oligo[i] == 'H'){ reverse += 'D'; } + + else { reverse += 'N'; } + } + + + return reverse; + } + catch(exception& e) { + m->errorOut(e, "MakeContigsCommand", "reverseOligo"); + exit(1); + } +} //********************************************************************************************************************** diff --git a/makecontigscommand.h b/makecontigscommand.h index 2308b65..84e43c0 100644 --- a/makecontigscommand.h +++ b/makecontigscommand.h @@ -17,7 +17,7 @@ #include "needlemanoverlap.hpp" #include "blastalign.hpp" #include "noalign.hpp" - +#include "trimoligos.h" struct fastqRead { vector scores; @@ -50,17 +50,31 @@ public: void help() { m->mothurOut(getHelpString()); } private: - bool abort; - string outputDir, ffastqfile, rfastqfile, align; + bool abort, allFiles; + string outputDir, ffastqfile, rfastqfile, align, oligosfile; float match, misMatch, gapOpen, gapExtend; - int processors, 
longestBase, threshold; + int processors, longestBase, threshold, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs; vector outputNames; + map barcodes; + map primers; + vector linker; + vector spacer; + vector primerNameVector; + vector barcodeNameVector; + + map groupCounts; + //map combos; + //map groupToIndex; + //vector groupVector; + fastqRead readFastq(ifstream&); vector< vector > readFastqFiles(int&); bool checkReads(fastqRead&, fastqRead&); int createProcesses(vector< vector >, string, string, string); int driver(vector, string, string, string); + bool getOligos(vector >&, vector >&); + string reverseOligo(string); }; /**************************************************************************************************/ diff --git a/makefile b/makefile index 32ede6e..bc5a569 100644 --- a/makefile +++ b/makefile @@ -17,7 +17,7 @@ USECOMPRESSION ?= no MOTHUR_FILES="\"Enter_your_default_path_here\"" RELEASE_DATE = "\"7/9/2012\"" VERSION = "\"1.26.0\"" -FORTAN_COMPILER = gfortran +FORTAN_COMPILER = /usr/local/gfortran/bin/gfortran FORTRAN_FLAGS = # Optimize to level 3: diff --git a/mgclustercommand.cpp b/mgclustercommand.cpp index 4774504..1861aa5 100644 --- a/mgclustercommand.cpp +++ b/mgclustercommand.cpp @@ -13,8 +13,8 @@ vector MGClusterCommand::setParameters(){ try { CommandParameter pblast("blast", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pblast); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pcount("count", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pcount); CommandParameter plength("length", "Number", "", "5", "", "", "",false,false); 
parameters.push_back(plength); CommandParameter ppenalty("penalty", "Number", "", "0.10", "", "", "",false,false); parameters.push_back(ppenalty); CommandParameter pcutoff("cutoff", "Number", "", "0.70", "", "", "",false,false); parameters.push_back(pcutoff); @@ -147,6 +147,14 @@ MGClusterCommand::MGClusterCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } diff --git a/mgclustercommand.h b/mgclustercommand.h index c9c23c8..b5b295f 100644 --- a/mgclustercommand.h +++ b/mgclustercommand.h @@ -12,7 +12,6 @@ #include "command.hpp" #include "readblast.h" -#include "sparsematrix.hpp" #include "nameassignment.hpp" #include "cluster.hpp" #include "hcluster.h" diff --git a/mothur.h b/mothur.h index 25b803f..cd14056 100644 --- a/mothur.h +++ b/mothur.h @@ -177,7 +177,13 @@ inline bool compareSpearman(spearmanRank left, spearmanRank right){ //******************************************************************************************************************** //sorts highest to lowest inline bool compareSeqPriorityNodes(seqPriorityNode left, seqPriorityNode right){ - return (left.numIdentical > right.numIdentical); + if (left.numIdentical > right.numIdentical) { + return true; + }else if (left.numIdentical == right.numIdentical) { + if (left.seq > right.seq) { return true; } + else { return false; } + } + return false; } //******************************************************************************************************************** //sorts lowest to highest diff --git a/mothurout.cpp b/mothurout.cpp index 9704464..7d40e80 100644 --- a/mothurout.cpp +++ b/mothurout.cpp @@ 
-939,7 +939,7 @@ string MothurOut::getFullPathName(string fileName){ } for (int i = index; i >= 0; i--) { - newFileName = dirs[i] + "\\" + newFileName; + newFileName = dirs[i] + "\\\\" + newFileName; } return newFileName; @@ -1544,7 +1544,6 @@ vector MothurOut::splitWhiteSpace(string input){ //********************************************************************************************************************** int MothurOut::readTax(string namefile, map& taxMap) { try { - //open input file ifstream in; openInputFile(namefile, in); @@ -1575,6 +1574,23 @@ int MothurOut::readTax(string namefile, map& taxMap) { } } in.close(); + + if (rest != "") { + vector pieces = splitWhiteSpace(rest); + + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { firstCol = pieces[i]; columnOne=false; } + else { secondCol = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { + //are there confidence scores, if so remove them + if (secondCol.find_first_of('(') != -1) { removeConfidences(secondCol); } + taxMap[firstCol] = secondCol; + if (debug) { mothurOut("[DEBUG]: name = '" + firstCol + "' tax = '" + secondCol + "'\n"); } + pairDone = false; + } + } + } return taxMap.size(); @@ -1587,7 +1603,6 @@ int MothurOut::readTax(string namefile, map& taxMap) { /**********************************************************************************************************************/ int MothurOut::readNames(string namefile, map& nameMap, bool redund) { try { - //open input file ifstream in; openInputFile(namefile, in); @@ -1618,6 +1633,23 @@ int MothurOut::readNames(string namefile, map& nameMap, bool red } } in.close(); + + if (rest != "") { + vector pieces = splitWhiteSpace(rest); + + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { firstCol = pieces[i]; columnOne=false; } + else { secondCol = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { + //parse names into vector + vector theseNames; + splitAtComma(secondCol, theseNames); + for (int i = 0; i 
< theseNames.size(); i++) { nameMap[theseNames[i]] = firstCol; } + pairDone = false; + } + } + } return nameMap.size(); @@ -1630,7 +1662,6 @@ int MothurOut::readNames(string namefile, map& nameMap, bool red /**********************************************************************************************************************/ int MothurOut::readNames(string namefile, map& nameMap, int flip) { try { - //open input file ifstream in; openInputFile(namefile, in); @@ -1658,6 +1689,20 @@ int MothurOut::readNames(string namefile, map& nameMap, int flip } } in.close(); + + if (rest != "") { + vector pieces = splitWhiteSpace(rest); + + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { firstCol = pieces[i]; columnOne=false; } + else { secondCol = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { + nameMap[secondCol] = firstCol; + pairDone = false; + } + } + } return nameMap.size(); @@ -1670,7 +1715,7 @@ int MothurOut::readNames(string namefile, map& nameMap, int flip /**********************************************************************************************************************/ int MothurOut::readNames(string namefile, map& nameMap, map& nameCount) { try { - nameMap.clear(); nameCount.clear(); + nameMap.clear(); nameCount.clear(); //open input file ifstream in; openInputFile(namefile, in); @@ -1703,6 +1748,24 @@ int MothurOut::readNames(string namefile, map& nameMap, map pieces = splitWhiteSpace(rest); + + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { firstCol = pieces[i]; columnOne=false; } + else { secondCol = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { + //parse names into vector + vector theseNames; + splitAtComma(secondCol, theseNames); + for (int i = 0; i < theseNames.size(); i++) { nameMap[theseNames[i]] = firstCol; } + nameCount[firstCol] = theseNames.size(); + pairDone = false; + } + } + + } return nameMap.size(); } @@ -1714,7 +1777,6 @@ int MothurOut::readNames(string namefile, map& nameMap, 
map& nameMap) { try { - //open input file ifstream in; openInputFile(namefile, in); @@ -1739,6 +1801,17 @@ int MothurOut::readNames(string namefile, map& nameMap) { } } in.close(); + + if (rest != "") { + vector pieces = splitWhiteSpace(rest); + + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { firstCol = pieces[i]; columnOne=false; } + else { secondCol = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { nameMap[firstCol] = secondCol; pairDone = false; } + } + } return nameMap.size(); @@ -1750,8 +1823,7 @@ int MothurOut::readNames(string namefile, map& nameMap) { } /**********************************************************************************************************************/ int MothurOut::readNames(string namefile, map >& nameMap) { - try { - + try { //open input file ifstream in; openInputFile(namefile, in); @@ -1782,6 +1854,22 @@ int MothurOut::readNames(string namefile, map >& nameMap) } in.close(); + if (rest != "") { + vector pieces = splitWhiteSpace(rest); + + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { firstCol = pieces[i]; columnOne=false; } + else { secondCol = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { + vector temp; + splitAtComma(secondCol, temp); + nameMap[firstCol] = temp; + pairDone = false; + } + } + } + return nameMap.size(); } catch(exception& e) { @@ -1792,7 +1880,6 @@ int MothurOut::readNames(string namefile, map >& nameMap) /**********************************************************************************************************************/ map MothurOut::readNames(string namefile) { try { - map nameMap; //open input file @@ -1823,6 +1910,20 @@ map MothurOut::readNames(string namefile) { } } in.close(); + + if (rest != "") { + vector pieces = splitWhiteSpace(rest); + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { firstCol = pieces[i]; columnOne=false; } + else { secondCol = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { + int num = 
getNumNames(secondCol); + nameMap[firstCol] = num; + pairDone = false; + } + } + } return nameMap; @@ -1875,6 +1976,29 @@ int MothurOut::readNames(string namefile, vector& nameVector, m } in.close(); + if (rest != "") { + vector pieces = splitWhiteSpace(rest); + + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { firstCol = pieces[i]; columnOne=false; } + else { secondCol = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { + int num = getNumNames(secondCol); + + map::iterator it = fastamap.find(firstCol); + if (it == fastamap.end()) { + error = 1; + mothurOut("[ERROR]: " + firstCol + " is not in your fastafile, but is in your namesfile, please correct."); mothurOutEndLine(); + }else { + seqPriorityNode temp(num, it->second, firstCol); + nameVector.push_back(temp); + } + + pairDone = false; + } + } + } return error; } catch(exception& e) { @@ -1885,7 +2009,7 @@ int MothurOut::readNames(string namefile, vector& nameVector, m //********************************************************************************************************************** set MothurOut::readAccnos(string accnosfile){ try { - set names; + set names; ifstream in; openInputFile(accnosfile, in); string name; @@ -1903,6 +2027,10 @@ set MothurOut::readAccnos(string accnosfile){ } in.close(); + if (rest != "") { + vector pieces = splitWhiteSpace(rest); + for (int i = 0; i < pieces.size(); i++) { names.insert(pieces[i]); } + } return names; } catch(exception& e) { @@ -1930,6 +2058,11 @@ int MothurOut::readAccnos(string accnosfile, vector& names){ for (int i = 0; i < pieces.size(); i++) { names.push_back(pieces[i]); } } in.close(); + + if (rest != "") { + vector pieces = splitWhiteSpace(rest); + for (int i = 0; i < pieces.size(); i++) { names.push_back(pieces[i]); } + } return 0; } @@ -1981,6 +2114,32 @@ int MothurOut::getNumChar(string line, char c){ exit(1); } } 
+//********************************************************************************************************************** +bool MothurOut::isSubset(vector bigset, vector subset) { + try { + + + if (subset.size() > bigset.size()) { return false; } + + //check if each guy in suset is also in bigset + for (int i = 0; i < subset.size(); i++) { + bool match = false; + for (int j = 0; j < bigset.size(); j++) { + if (subset[i] == bigset[j]) { match = true; break; } + } + + //you have a guy in subset that had no match in bigset + if (match == false) { return false; } + } + + return true; + + } + catch(exception& e) { + errorOut(e, "MothurOut", "isSubset"); + exit(1); + } +} /***********************************************************************/ int MothurOut::mothurRemove(string filename){ try { @@ -2298,30 +2457,29 @@ void MothurOut::splitAtDash(string& estim, vector& container) { try { string individual = ""; int estimLength = estim.size(); + bool prevEscape = false; for(int i=0;i& container) { try { string individual = ""; int estimLength = estim.size(); + bool prevEscape = false; for(int i=0;i& container) { //This function parses the line options and puts them in a set void MothurOut::splitAtDash(string& estim, set& container) { try { - string individual; + string individual = ""; int lineNum; - - while (estim.find_first_of('-') != -1) { - individual = estim.substr(0,estim.find_first_of('-')); - if ((estim.find_first_of('-')+1) <= estim.length()) { //checks to make sure you don't have dash at end of string - estim = estim.substr(estim.find_first_of('-')+1, estim.length()); - convert(individual, lineNum); //convert the string to int - container.insert(lineNum); + int estimLength = estim.size(); + bool prevEscape = false; + for(int i=0;i& featureVector){ + try { + //finds sum + double average = 0; + for (int i = 0; i < featureVector.size(); i++) { average += featureVector[i]; } + average /= (double) featureVector.size(); + + //find standard deviation + double stdDev = 
0; + for (int i = 0; i < featureVector.size(); i++) { //compute the difference of each dist from the mean, and square the result of each + stdDev += ((featureVector[i] - average) * (featureVector[i] - average)); + } + + stdDev /= (double) featureVector.size(); + stdDev = sqrt(stdDev); + + return stdDev; + } + catch(exception& e) { + errorOut(e, "MothurOut", "getStandardDeviation"); + exit(1); + } +} +/**************************************************************************************************/ diff --git a/mothurout.h b/mothurout.h index 77c5a80..53d4250 100644 --- a/mothurout.h +++ b/mothurout.h @@ -140,7 +140,9 @@ class MothurOut { void splitAtChar(string&, vector&, char); void splitAtChar(string&, string&, char); int removeConfidences(string&); + string removeQuotes(string); string makeList(vector&); + bool isSubset(vector, vector); //bigSet, subset //math operation int factorial(int num); @@ -149,6 +151,7 @@ class MothurOut { float roundDist(float, int); unsigned int fromBase36(string); int getRandomIndex(int); //highest + double getStandardDeviation(vector&); int control_pressed; bool executing, runParse, jumble, gui, mothurCalling, debug; diff --git a/parsefastaqcommand.cpp b/parsefastaqcommand.cpp index 1331b7f..816bdb5 100644 --- a/parsefastaqcommand.cpp +++ b/parsefastaqcommand.cpp @@ -16,7 +16,8 @@ vector ParseFastaQCommand::setParameters(){ CommandParameter pfastq("fastq", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfastq); CommandParameter pfasta("fasta", "Bool", "", "T", "", "", "",false,false); parameters.push_back(pfasta); CommandParameter pqual("qfile", "Bool", "", "T", "", "", "",false,false); parameters.push_back(pqual); - CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); + CommandParameter pformat("format", "Multiple", "sanger-illumina-solexa", "sanger", "", "", "",false,false); parameters.push_back(pformat); + CommandParameter 
pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); vector myArray; @@ -33,8 +34,9 @@ string ParseFastaQCommand::getHelpString(){ try { string helpString = ""; helpString += "The fastq.info command reads a fastq file and creates a fasta and quality file.\n"; - helpString += "The fastq.info command parameters are fastq, fasta and qfile; fastq is required.\n"; - helpString += "The fastq.info command should be in the following format: fastq.info(fastaq=yourFastaQFile).\n"; + helpString += "The fastq.info command parameters are fastq, fasta, qfile and format; fastq is required.\n"; + helpString += "The fastq.info command should be in the following format: fastq.info(fastaq=yourFastaQFile).\n"; + helpString += "The format parameter is used to indicate whether your sequences are sanger, solexa or illumina, default=sanger.\n"; helpString += "The fasta parameter allows you to indicate whether you want a fasta file generated. Default=T.\n"; helpString += "The qfile parameter allows you to indicate whether you want a quality file generated. Default=T.\n"; helpString += "Example fastq.info(fastaq=test.fastaq).\n"; @@ -138,6 +140,13 @@ ParseFastaQCommand::ParseFastaQCommand(string option){ temp = validParameter.validFile(parameters, "qfile", false); if(temp == "not found"){ temp = "T"; } qual = m->isTrue(temp); + format = validParameter.validFile(parameters, "format", false); if (format == "not found"){ format = "sanger"; } + + if ((format != "sanger") && (format != "illumina") && (format != "solexa")) { + m->mothurOut(format + " is not a valid format. Your format choices are sanger, solexa and illumina, aborting." ); m->mothurOutEndLine(); + abort=true; + } + if ((!fasta) && (!qual)) { m->mothurOut("[ERROR]: no outputs selected. 
Aborting."); m->mothurOutEndLine(); abort=true; } } @@ -163,6 +172,12 @@ int ParseFastaQCommand::execute(){ ifstream in; m->openInputFile(fastaQFile, in); + + //fill convert table - goes from solexa to sanger. Used fq_all2std.pl as a reference. + for (int i = -64; i < 65; i++) { + char temp = (char) ((int)(33 + 10*log(1+pow(10,(i/10.0)))/log(10)+0.499)); + convertTable.push_back(temp); + } while (!in.eof()) { @@ -238,12 +253,18 @@ vector ParseFastaQCommand::convertQual(string qual) { try { vector qualScores; - int controlChar = int('@'); - for (int i = 0; i < qual.length(); i++) { - int temp = int(qual[i]); - temp -= controlChar; - + + int temp = 0; + temp = int(qual[i]); + if (format == "illumina") { + temp -= 64; //char '@' + }else if (format == "solexa") { + temp = int(convertTable[temp]); //convert to sanger + temp -= 33; //char '!' + }else { + temp -= 33; //char '!' + } qualScores.push_back(temp); } diff --git a/parsefastaqcommand.h b/parsefastaqcommand.h index 4481b98..96fcb7d 100644 --- a/parsefastaqcommand.h +++ b/parsefastaqcommand.h @@ -34,10 +34,11 @@ public: private: vector outputNames; - string outputDir, fastaQFile; + string outputDir, fastaQFile, format; bool abort, fasta, qual; vector convertQual(string); + vector convertTable; }; #endif diff --git a/parsimony.cpp b/parsimony.cpp index 3b0f317..6a0485c 100644 --- a/parsimony.cpp +++ b/parsimony.cpp @@ -15,7 +15,7 @@ EstOutput Parsimony::getValues(Tree* t, int p, string o) { try { processors = p; outputDir = o; - TreeMap* tmap = t->getTreeMap(); + CountTable* ct = t->getCountTable(); //if the users enters no groups then give them the score of all groups vector mGroups = m->getGroups(); @@ -38,7 +38,7 @@ EstOutput Parsimony::getValues(Tree* t, int p, string o) { vector groups; if (numGroups == 0) { //get score for all users groups - vector tGroups = tmap->getNamesOfGroups(); + vector tGroups = ct->getNamesOfGroups(); for (int i = 0; i < tGroups.size(); i++) { if (tGroups[i] != "xxx") { 
groups.push_back(tGroups[i]); @@ -57,7 +57,7 @@ EstOutput Parsimony::getValues(Tree* t, int p, string o) { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) if(processors == 1){ - data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap); + data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), ct); }else{ lines.clear(); int numPairs = namesOfGroupCombos.size(); @@ -74,10 +74,10 @@ EstOutput Parsimony::getValues(Tree* t, int p, string o) { lines.push_back(linePair(startPos, numPairsPerProcessor)); } - data = createProcesses(t, namesOfGroupCombos, tmap); + data = createProcesses(t, namesOfGroupCombos, ct); } #else - data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap); + data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), ct); #endif return data; @@ -90,7 +90,7 @@ EstOutput Parsimony::getValues(Tree* t, int p, string o) { } /**************************************************************************************************/ -EstOutput Parsimony::createProcesses(Tree* t, vector< vector > namesOfGroupCombos, TreeMap* tmap) { +EstOutput Parsimony::createProcesses(Tree* t, vector< vector > namesOfGroupCombos, CountTable* ct) { try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) int process = 1; @@ -107,7 +107,7 @@ EstOutput Parsimony::createProcesses(Tree* t, vector< vector > namesOfGr process++; }else if (pid == 0){ EstOutput myresults; - myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, tmap); + myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, ct); if (m->control_pressed) { exit(0); } @@ -127,7 +127,7 @@ EstOutput Parsimony::createProcesses(Tree* t, vector< vector > namesOfGr } } - results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, tmap); + results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, ct); 
//force parent to wait until all the processes are done for (int i=0;i > namesOfGr } } /**************************************************************************************************/ -EstOutput Parsimony::driver(Tree* t, vector< vector > namesOfGroupCombos, int start, int num, TreeMap* tmap) { +EstOutput Parsimony::driver(Tree* t, vector< vector > namesOfGroupCombos, int start, int num, CountTable* ct) { try { EstOutput results; results.resize(num); - Tree* copyTree = new Tree(tmap); + Tree* copyTree = new Tree(ct); int count = 0; for (int h = start; h < (start+num); h++) { diff --git a/parsimony.h b/parsimony.h index 7316d50..bf0e0d4 100644 --- a/parsimony.h +++ b/parsimony.h @@ -12,7 +12,7 @@ */ #include "treecalculator.h" -#include "treemap.h" +#include "counttable.h" /***********************************************************************/ @@ -35,8 +35,8 @@ class Parsimony : public TreeCalculator { int processors; string outputDir; - EstOutput driver(Tree*, vector< vector >, int, int, TreeMap*); - EstOutput createProcesses(Tree*, vector< vector >, TreeMap*); + EstOutput driver(Tree*, vector< vector >, int, int, CountTable*); + EstOutput createProcesses(Tree*, vector< vector >, CountTable*); }; /***********************************************************************/ diff --git a/parsimonycommand.cpp b/parsimonycommand.cpp index f124b60..eabbb59 100644 --- a/parsimonycommand.cpp +++ b/parsimonycommand.cpp @@ -14,8 +14,9 @@ vector ParsimonyCommand::setParameters(){ try { CommandParameter ptree("tree", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptree); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + 
CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); CommandParameter prandom("random", "String", "", "", "", "", "",false,false); parameters.push_back(prandom); CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters); @@ -36,7 +37,7 @@ vector ParsimonyCommand::setParameters(){ string ParsimonyCommand::getHelpString(){ try { string helpString = ""; - helpString += "The parsimony command parameters are tree, group, name, random, groups, processors and iters. tree parameter is required unless you have valid current tree file or are using random.\n"; + helpString += "The parsimony command parameters are tree, group, name, count, random, groups, processors and iters. tree parameter is required unless you have valid current tree file or are using random.\n"; helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 1 valid group.\n"; helpString += "The group names are separated by dashes. The iters parameter allows you to specify how many random trees you would like compared to your tree.\n"; helpString += "The parsimony command should be in the following format: parsimony(random=yourOutputFilename, groups=yourGroups, iters=yourIters).\n"; @@ -145,6 +146,14 @@ ParsimonyCommand::ParsimonyCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. 
if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -172,6 +181,20 @@ ParsimonyCommand::ParsimonyCommand(string option) { if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + } //if the user changes the output directory command factory will send this info to us in the output parameter @@ -193,10 +216,12 @@ ParsimonyCommand::ParsimonyCommand(string option) { m->setProcessors(temp); m->mothurConvert(temp, processors); - if (namefile == "") { - vector files; files.push_back(treefile); - parser.getNameFile(files); - } + if (countfile=="") { + if (namefile == "") { + vector files; files.push_back(treefile); + parser.getNameFile(files); + } + } } @@ -219,9 +244,11 @@ int ParsimonyCommand::execute() { m->setTreeFile(treefile); - TreeReader* reader = new TreeReader(treefile, groupfile, namefile); + TreeReader* reader; + if (countfile == "") { reader = new TreeReader(treefile, groupfile, 
namefile); } + else { reader = new TreeReader(treefile, countfile); } T = reader->getTrees(); - tmap = T[0]->getTreeMap(); + ct = T[0]->getCountTable(); delete reader; if(outputDir == "") { outputDir += m->hasPath(treefile); } @@ -245,7 +272,7 @@ int ParsimonyCommand::execute() { //set users groups to analyze SharedUtil util; vector mGroups = m->getGroups(); - vector tGroups = tmap->getNamesOfGroups(); + vector tGroups = ct->getNamesOfGroups(); util.setGroups(mGroups, tGroups, allGroups, numGroups, "parsimony"); //sets the groups the user wants to analyze util.getCombos(groupComb, mGroups, numComp); m->setGroups(mGroups); @@ -260,7 +287,7 @@ int ParsimonyCommand::execute() { if (m->control_pressed) { delete reading; delete output; - delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } + delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (randomtree == "") { outSum.close(); } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear(); m->clearGroups(); @@ -285,7 +312,7 @@ int ParsimonyCommand::execute() { if (m->control_pressed) { delete reading; delete output; - delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } + delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (randomtree == "") { outSum.close(); } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear(); m->clearGroups(); @@ -314,7 +341,7 @@ int ParsimonyCommand::execute() { for (int j = 0; j < iters; j++) { //create new tree with same num nodes and leaves as users - randT = new Tree(tmap); + randT = new Tree(ct); //create random relationships between nodes randT->assembleRandomTree(); @@ -326,7 +353,7 @@ int ParsimonyCommand::execute() { delete reading; delete output; delete randT; if (randomtree == "") { outSum.close(); } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear(); - delete tmap; for (int i = 0; i < T.size(); 
i++) { delete T[i]; } + delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } m->clearGroups(); return 0; } @@ -355,13 +382,13 @@ int ParsimonyCommand::execute() { for (int j = 0; j < iters; j++) { //create new tree with same num nodes and leaves as users - randT = new Tree(tmap); + randT = new Tree(ct); //create random relationships between nodes randT->assembleRandomTree(); if (m->control_pressed) { - delete reading; delete output; delete randT; delete tmap; + delete reading; delete output; delete randT; delete ct; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear(); return 0; } @@ -370,7 +397,7 @@ int ParsimonyCommand::execute() { randomData = pars.getValues(randT, processors, outputDir); if (m->control_pressed) { - delete reading; delete output; delete randT; delete tmap; + delete reading; delete output; delete randT; delete ct; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear(); return 0; } @@ -424,7 +451,7 @@ int ParsimonyCommand::execute() { if (m->control_pressed) { delete reading; delete output; - delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } + delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (randomtree == "") { outSum.close(); } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear(); return 0; @@ -437,7 +464,7 @@ int ParsimonyCommand::execute() { printParsimonyFile(); if (randomtree == "") { printUSummaryFile(); } - delete output; delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } + delete output; delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear(); return 0;} @@ -529,7 +556,7 @@ void ParsimonyCommand::getUserInput() { try { //create treemap - tmap = new TreeMap(); + ct = new CountTable(); m->mothurOut("Please 
enter the number of groups you would like to analyze: "); cin >> numGroups; @@ -539,30 +566,31 @@ void ParsimonyCommand::getUserInput() { count = 1; numEachGroup.resize(numGroups, 0); - + set nameMap; + map groupMap; + set gps; + for (int i = 1; i <= numGroups; i++) { m->mothurOut("Please enter the number of sequences in group " + toString(i) + ": "); cin >> num; m->mothurOutJustToLog(toString(num)); m->mothurOutEndLine(); - - //set tmaps seqsPerGroup - tmap->seqsPerGroup[toString(i)] = num; - tmap->addGroup(toString(i)); + gps.insert(toString(i)); + //set tmaps namesOfSeqs for (int j = 0; j < num; j++) { - tmap->namesOfSeqs.push_back(toString(count)); - tmap->treemap[toString(count)].groupname = toString(i); + groupMap[toString(count)] = i; + nameMap.insert(toString(count)); count++; } } - + ct->createTable(nameMap, groupMap, gps); + //clears buffer so next command doesn't have error string s; getline(cin, s); - m->Treenames = tmap->namesOfSeqs; - + m->Treenames = ct->getNamesOfSeqs(); } catch(exception& e) { m->errorOut(e, "ParsimonyCommand", "getUserInput"); diff --git a/parsimonycommand.h b/parsimonycommand.h index 79613f5..38a7505 100644 --- a/parsimonycommand.h +++ b/parsimonycommand.h @@ -11,7 +11,7 @@ #include "command.hpp" #include "parsimony.h" -#include "treemap.h" +#include "counttable.h" #include "progress.hpp" #include "sharedutilities.h" #include "fileoutput.h" @@ -41,10 +41,10 @@ private: vector T; //user trees Tree* randT; //random tree Tree* copyUserTree; - TreeMap* tmap; - TreeMap* savetmap; + CountTable* ct; + CountTable* savect; vector groupComb; // AB. AC, BC... - string sumFile, randomtree, allGroups, outputDir, treefile, groupfile, namefile; + string sumFile, randomtree, allGroups, outputDir, treefile, groupfile, namefile, countfile; int iters, numGroups, numComp, counter, processors, numUniquesInName; vector numEachGroup; //vector containing the number of sequences in each group the users wants for random distrib. 
vector< vector > userTreeScores; //scores for users trees for each comb. diff --git a/pcrseqscommand.h b/pcrseqscommand.h index baeca4e..d35850c 100644 --- a/pcrseqscommand.h +++ b/pcrseqscommand.h @@ -15,6 +15,7 @@ #include "trimoligos.h" #include "alignment.hpp" #include "needlemanoverlap.hpp" +#include "counttable.h" class PcrSeqsCommand : public Command { public: @@ -45,7 +46,7 @@ private: vector lines; bool getOligos(vector >&, vector >&, vector >&); bool abort, keepprimer, keepdots; - string fastafile, oligosfile, taxfile, groupfile, namefile, ecolifile, outputDir, nomatch; + string fastafile, oligosfile, taxfile, groupfile, namefile, countfile, ecolifile, outputDir, nomatch; int start, end, processors, length; vector revPrimer, outputNames; @@ -55,6 +56,7 @@ private: int readName(set&); int readGroup(set); int readTax(set); + int readCount(set); bool readOligos(); bool readEcoli(); int driverPcr(string, string, string, set&, linePair); diff --git a/phylodiversitycommand.cpp b/phylodiversitycommand.cpp index ddd2b31..b0c11f6 100644 --- a/phylodiversitycommand.cpp +++ b/phylodiversitycommand.cpp @@ -15,8 +15,9 @@ vector PhyloDiversityCommand::setParameters(){ try { CommandParameter ptree("tree", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptree); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pgroup); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter pgroups("groups", "String", "", "", 
"", "", "",false,false); parameters.push_back(pgroups); CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters); CommandParameter pfreq("freq", "Number", "", "100", "", "", "",false,false); parameters.push_back(pfreq); @@ -41,7 +42,7 @@ vector PhyloDiversityCommand::setParameters(){ string PhyloDiversityCommand::getHelpString(){ try { string helpString = ""; - helpString += "The phylo.diversity command parameters are tree, group, name, groups, iters, freq, processors, scale, rarefy, collect and summary. tree and group are required, unless you have valid current files.\n"; + helpString += "The phylo.diversity command parameters are tree, group, name, count, groups, iters, freq, processors, scale, rarefy, collect and summary. tree and group are required, unless you have valid current files.\n"; helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. The group names are separated by dashes. By default all groups are used.\n"; helpString += "The iters parameter allows you to specify the number of randomizations to preform, by default iters=1000, if you set rarefy to true.\n"; helpString += "The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n"; @@ -156,6 +157,14 @@ PhyloDiversityCommand::PhyloDiversityCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { parameters["count"] = inputDir + it->second; } + } } //check for required parameters @@ -179,6 +188,19 @@ PhyloDiversityCommand::PhyloDiversityCommand(string option) { else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(treefile); } string temp; @@ -214,10 +236,12 @@ PhyloDiversityCommand::PhyloDiversityCommand(string option) { if ((!collect) && (!rarefy) && (!summary)) { m->mothurOut("No outputs selected. 
You must set either collect, rarefy or summary to true, summary=T by default."); m->mothurOutEndLine(); abort=true; } - if (namefile == "") { - vector files; files.push_back(treefile); - parser.getNameFile(files); - } + if (countfile=="") { + if (namefile == "") { + vector files; files.push_back(treefile); + parser.getNameFile(files); + } + } } } @@ -236,14 +260,16 @@ int PhyloDiversityCommand::execute(){ int start = time(NULL); m->setTreeFile(treefile); - TreeReader* reader = new TreeReader(treefile, groupfile, namefile); + TreeReader* reader; + if (countfile == "") { reader = new TreeReader(treefile, groupfile, namefile); } + else { reader = new TreeReader(treefile, countfile); } vector trees = reader->getTrees(); - tmap = trees[0]->getTreeMap(); + ct = trees[0]->getCountTable(); delete reader; SharedUtil util; vector mGroups = m->getGroups(); - vector tGroups = tmap->getNamesOfGroups(); + vector tGroups = ct->getNamesOfGroups(); util.setGroups(mGroups, tGroups, "phylo.diversity"); //sets the groups the user wants to analyze //incase the user had some mismatches between the tree and group files we don't want group xxx to be analyzed @@ -255,7 +281,7 @@ int PhyloDiversityCommand::execute(){ //for each of the users trees for(int i = 0; i < trees.size(); i++) { - if (m->control_pressed) { delete tmap; for (int j = 0; j < trees.size(); j++) { delete trees[j]; } for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + if (m->control_pressed) { delete ct; for (int j = 0; j < trees.size(); j++) { delete trees[j]; } for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } ofstream outSum, outRare, outCollect; string outSumFile = outputDir + m->getRootName(m->getSimpleName(treefile)) + toString(i+1) + "." 
+ getOutputFileNameTag("summary"); @@ -286,15 +312,16 @@ int PhyloDiversityCommand::execute(){ //find largest group total int largestGroup = 0; - for (int j = 0; j < mGroups.size(); j++) { - if (tmap->seqsPerGroup[mGroups[j]] > largestGroup) { largestGroup = tmap->seqsPerGroup[mGroups[j]]; } + for (int j = 0; j < mGroups.size(); j++) { + int numSeqsThisGroup = ct->getGroupCount(mGroups[j]); + if (numSeqsThisGroup > largestGroup) { largestGroup = numSeqsThisGroup; } //initialize diversity - diversity[mGroups[j]].resize(tmap->seqsPerGroup[mGroups[j]]+1, 0.0); //numSampled + diversity[mGroups[j]].resize(numSeqsThisGroup+1, 0.0); //numSampled //groupA 0.0 0.0 //initialize sumDiversity - sumDiversity[mGroups[j]].resize(tmap->seqsPerGroup[mGroups[j]]+1, 0.0); + sumDiversity[mGroups[j]].resize(numSeqsThisGroup+1, 0.0); } //convert freq percentage to number @@ -649,7 +676,7 @@ map PhyloDiversityCommand::getRootForGroups(Tree* t){ map done; //initialize root for all groups to -1 - for (int k = 0; k < (t->getTreeMap())->getNamesOfGroups().size(); k++) { done[(t->getTreeMap())->getNamesOfGroups()[k]] = false; } + for (int k = 0; k < (t->getCountTable())->getNamesOfGroups().size(); k++) { done[(t->getCountTable())->getNamesOfGroups()[k]] = false; } for (int i = 0; i < t->getNumLeaves(); i++) { diff --git a/phylodiversitycommand.h b/phylodiversitycommand.h index 9527692..ee76f05 100644 --- a/phylodiversitycommand.h +++ b/phylodiversitycommand.h @@ -11,7 +11,7 @@ */ #include "command.hpp" -#include "treemap.h" +#include "counttable.h" #include "sharedutilities.h" #include "tree.h" @@ -33,11 +33,11 @@ class PhyloDiversityCommand : public Command { int execute(); void help() { m->mothurOut(getHelpString()); } private: - TreeMap* tmap; + CountTable* ct; float freq; int iters, processors, numUniquesInName; bool abort, rarefy, summary, collect, scale; - string groups, outputDir, treefile, groupfile, namefile; + string groups, outputDir, treefile, groupfile, namefile, countfile; 
vector Groups, outputNames; //holds groups to be used, and outputFile names map getRootForGroups(Tree* t); diff --git a/phylosummary.cpp b/phylosummary.cpp index 5f7bbc3..ab6bb83 100644 --- a/phylosummary.cpp +++ b/phylosummary.cpp @@ -8,21 +8,68 @@ */ #include "phylosummary.h" - /**************************************************************************************************/ -PhyloSummary::PhyloSummary(string refTfile, string groupFile){ +PhyloSummary::PhyloSummary(string refTfile, CountTable* c){ try { m = MothurOut::getInstance(); maxLevel = 0; ignore = false; + numSeqs = 0; + + ct = c; + groupmap = NULL; + + //check for necessary files + string taxFileNameTest = m->getFullPathName((refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum")); + ifstream FileTest(taxFileNameTest.c_str()); - if (groupFile != "") { - groupmap = new GroupMap(groupFile); - groupmap->readMap(); + if (!FileTest) { + m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1); }else{ - groupmap = NULL; + readTreeStruct(FileTest); } + + tree[0].rank = "0"; + assignRank(0); + + } + catch(exception& e) { + m->errorOut(e, "PhyloSummary", "PhyloSummary"); + exit(1); + } +} + +/**************************************************************************************************/ + +PhyloSummary::PhyloSummary(CountTable* c){ + try { + m = MothurOut::getInstance(); + maxLevel = 0; + ignore = true; + numSeqs = 0; + + ct = c; + groupmap = NULL; + + tree.push_back(rawTaxNode("Root")); + tree[0].rank = "0"; + } + catch(exception& e) { + m->errorOut(e, "PhyloSummary", "PhyloSummary"); + exit(1); + } +} +/**************************************************************************************************/ +PhyloSummary::PhyloSummary(string refTfile, GroupMap* g){ + try { + m = MothurOut::getInstance(); + maxLevel = 0; + ignore = false; + numSeqs = 0; + + groupmap = g; + ct = NULL; //check for necessary files string taxFileNameTest = 
m->getFullPathName((refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum")); @@ -46,23 +93,18 @@ PhyloSummary::PhyloSummary(string refTfile, string groupFile){ /**************************************************************************************************/ -PhyloSummary::PhyloSummary(string groupFile){ +PhyloSummary::PhyloSummary(GroupMap* g){ try { m = MothurOut::getInstance(); maxLevel = 0; ignore = true; + numSeqs = 0; - if (groupFile != "") { - groupmap = new GroupMap(groupFile); - groupmap->readMap(); - }else{ - groupmap = NULL; - } + groupmap = g; + ct = NULL; tree.push_back(rawTaxNode("Root")); tree[0].rank = "0"; - - } catch(exception& e) { m->errorOut(e, "PhyloSummary", "PhyloSummary"); @@ -78,7 +120,6 @@ int PhyloSummary::summarize(string userTfile){ for (map::iterator itTemp = temp.begin(); itTemp != temp.end();) { addSeqToTree(itTemp->first, itTemp->second); - numSeqs++; temp.erase(itTemp++); } @@ -137,7 +178,9 @@ int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){ childPointer = tree[currentNode].children.find(taxon); if(childPointer != tree[currentNode].children.end()){ //if the node already exists, update count and move on - if (groupmap != NULL) { + int thisCount = 1; + + if (groupmap != NULL) { //find out the sequences group string group = groupmap->getGroup(seqName); @@ -150,9 +193,27 @@ int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){ if (itGroup != tree[childPointer->second].groupCount.end()) { tree[childPointer->second].groupCount[group]++; } - } + }else if (ct != NULL) { + if (ct->hasGroupInfo()) { + vector groupCounts = ct->getGroupCounts(seqName); + vector groups = ct->getNamesOfGroups(); + for (int i = 0; i < groups.size(); i++) { + + if (groupCounts[i] != 0) { + //do you have a count for this group? 
+ map::iterator itGroup = tree[childPointer->second].groupCount.find(groups[i]); + + //if yes, increment it - there should not be a case where we can't find it since we load group in read + if (itGroup != tree[childPointer->second].groupCount.end()) { + tree[childPointer->second].groupCount[groups[i]] += groupCounts[i]; + } + } + } + } + thisCount = ct->getNumSeqs(seqName); + } - tree[childPointer->second].total++; + tree[childPointer->second].total += thisCount; currentNode = childPointer->second; }else{ @@ -163,8 +224,8 @@ int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){ tree[index].parent = currentNode; tree[index].level = (level+1); - tree[index].total = 1; tree[currentNode].children[taxon] = index; + int thisCount = 1; //initialize groupcounts if (groupmap != NULL) { @@ -184,9 +245,33 @@ int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){ //if yes, increment it - there should not be a case where we can't find it since we load group in read if (itGroup != tree[index].groupCount.end()) { tree[index].groupCount[group]++; - } - } + } + }else if (ct != NULL) { + if (ct->hasGroupInfo()) { + vector mGroups = ct->getNamesOfGroups(); + for (int j = 0; j < mGroups.size(); j++) { + tree[index].groupCount[mGroups[j]] = 0; + } + vector groupCounts = ct->getGroupCounts(seqName); + vector groups = ct->getNamesOfGroups(); + + for (int i = 0; i < groups.size(); i++) { + if (groupCounts[i] != 0) { + + //do you have a count for this group? 
+ map::iterator itGroup = tree[index].groupCount.find(groups[i]); + + //if yes, increment it - there should not be a case where we can't find it since we load group in read + if (itGroup != tree[index].groupCount.end()) { + tree[index].groupCount[groups[i]]+=groupCounts[i]; + } + } + } + } + thisCount = ct->getNumSeqs(seqName); + } + tree[index].total = thisCount; currentNode = index; }else{ //otherwise, error @@ -210,7 +295,7 @@ int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){ } /**************************************************************************************************/ -int PhyloSummary::addSeqToTree(string seqTaxonomy, vector names){ +int PhyloSummary::addSeqToTree(string seqTaxonomy, map containsGroup){ try { numSeqs++; @@ -235,32 +320,12 @@ int PhyloSummary::addSeqToTree(string seqTaxonomy, vector names){ childPointer = tree[currentNode].children.find(taxon); if(childPointer != tree[currentNode].children.end()){ //if the node already exists, update count and move on - if (groupmap != NULL) { - - map containsGroup; - vector mGroups = groupmap->getNamesOfGroups(); - for (int j = 0; j < mGroups.size(); j++) { - containsGroup[mGroups[j]] = false; - } - - for (int k = 0; k < names.size(); k++) { - //find out the sequences group - string group = groupmap->getGroup(names[k]); - - if (group == "not found") { m->mothurOut("[WARNING]: " + names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); } - else { - containsGroup[group] = true; - } - } + for (map::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) { + if (itGroup->second == true) { + tree[childPointer->second].groupCount[itGroup->first]++; + } + } - for (map::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) { - if (itGroup->second == true) { - tree[childPointer->second].groupCount[itGroup->first]++; - } - } - - } - 
tree[childPointer->second].total++; currentNode = childPointer->second; @@ -274,33 +339,12 @@ int PhyloSummary::addSeqToTree(string seqTaxonomy, vector names){ tree[index].level = (level+1); tree[index].total = 1; tree[currentNode].children[taxon] = index; - - //initialize groupcounts - if (groupmap != NULL) { - map containsGroup; - vector mGroups = groupmap->getNamesOfGroups(); - for (int j = 0; j < mGroups.size(); j++) { - tree[index].groupCount[mGroups[j]] = 0; - containsGroup[mGroups[j]] = false; - } - - for (int k = 0; k < names.size(); k++) { - //find out the sequences group - string group = groupmap->getGroup(names[k]); - - if (group == "not found") { m->mothurOut("[WARNING]: " + names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); } - else { - containsGroup[group] = true; - } - } - - for (map::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) { - if (itGroup->second == true) { - tree[index].groupCount[itGroup->first]++; - } - } - } + for (map::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) { + if (itGroup->second == true) { + tree[index].groupCount[itGroup->first]++; + } + } currentNode = index; @@ -349,17 +393,24 @@ void PhyloSummary::print(ofstream& out){ try { if (ignore) { assignRank(0); } - + vector mGroups; //print labels out << "taxlevel\t rankID\t taxon\t daughterlevels\t total\t"; if (groupmap != NULL) { //so the labels match the counts below, since the map sorts them automatically... 
//sort(groupmap->namesOfGroups.begin(), groupmap->namesOfGroups.end()); - vector mGroups = groupmap->getNamesOfGroups(); + mGroups = groupmap->getNamesOfGroups(); for (int i = 0; i < mGroups.size(); i++) { out << mGroups[i] << '\t'; } - } + }else if (ct != NULL) { + if (ct->hasGroupInfo()) { + mGroups = ct->getNamesOfGroups(); + for (int i = 0; i < mGroups.size(); i++) { + out << mGroups[i] << '\t'; + } + } + } out << endl; @@ -373,9 +424,10 @@ void PhyloSummary::print(ofstream& out){ tree[0].total += tree[it->second].total; if (groupmap != NULL) { - vector mGroups = groupmap->getNamesOfGroups(); for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; } - } + }else if ( ct != NULL) { + if (ct->hasGroupInfo()) { for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; } } + } } } @@ -384,12 +436,10 @@ void PhyloSummary::print(ofstream& out){ if (groupmap != NULL) { - //for (itGroup = tree[0].groupCount.begin(); itGroup != tree[0].groupCount.end(); itGroup++) { - // out << itGroup->second << '\t'; - //} - vector mGroups = groupmap->getNamesOfGroups(); - for (int i = 0; i < mGroups.size(); i++) { out << tree[0].groupCount[mGroups[i]] << '\t'; } - } + for (int i = 0; i < mGroups.size(); i++) { out << tree[0].groupCount[mGroups[i]] << '\t'; } + }else if ( ct != NULL) { + if (ct->hasGroupInfo()) { for (int i = 0; i < mGroups.size(); i++) { out << tree[0].groupCount[mGroups[i]] << '\t'; } } + } out << endl; //print rest @@ -427,7 +477,12 @@ void PhyloSummary::print(int i, ofstream& out){ //} vector mGroups = groupmap->getNamesOfGroups(); for (int i = 0; i < mGroups.size(); i++) { out << tree[it->second].groupCount[mGroups[i]] << '\t'; } - } + }else if (ct != NULL) { + if (ct->hasGroupInfo()) { + vector mGroups = ct->getNamesOfGroups(); + for (int i = 0; i < mGroups.size(); i++) { out << tree[it->second].groupCount[mGroups[i]] << '\t'; } + } + } out << 
endl; } @@ -473,7 +528,13 @@ void PhyloSummary::readTreeStruct(ifstream& in){ for (int j = 0; j < (groupmap->getNamesOfGroups()).size(); j++) { tree[i].groupCount[(groupmap->getNamesOfGroups())[j]] = 0; } - } + }else if (ct != NULL) { + if (ct->hasGroupInfo()) { + for (int j = 0; j < (ct->getNamesOfGroups()).size(); j++) { + tree[i].groupCount[(ct->getNamesOfGroups())[j]] = 0; + } + } + } tree[i].total = 0; diff --git a/phylosummary.h b/phylosummary.h index cdec0d0..65a4674 100644 --- a/phylosummary.h +++ b/phylosummary.h @@ -13,6 +13,7 @@ #include "mothur.h" #include "mothurout.h" #include "groupmap.h" +#include "counttable.h" /**************************************************************************************************/ @@ -32,13 +33,15 @@ struct rawTaxNode { class PhyloSummary { public: - PhyloSummary(string); - PhyloSummary(string, string); - ~PhyloSummary() { if (groupmap != NULL) { delete groupmap; } } + PhyloSummary(GroupMap*); + PhyloSummary(string, GroupMap*); + PhyloSummary(CountTable*); + PhyloSummary(string, CountTable*); + ~PhyloSummary() {} int summarize(string); //pass it a taxonomy file and a group file and it makes the tree int addSeqToTree(string, string); - int addSeqToTree(string, vector); + int addSeqToTree(string, map); void print(ofstream&); int getMaxLevel() { return maxLevel; } @@ -49,6 +52,7 @@ private: void assignRank(int); void readTreeStruct(ifstream&); GroupMap* groupmap; + CountTable* ct; bool ignore; int numNodes; diff --git a/phylotree.cpp b/phylotree.cpp index 3dde186..8a7c712 100644 --- a/phylotree.cpp +++ b/phylotree.cpp @@ -75,7 +75,7 @@ PhyloTree::PhyloTree(ifstream& in, string filename){ for (int i = 0; i < numGenus; i++) { iss >> gnode >> gsize; m->gobble(iss); - uniqueTaxonomies[gnode] = gnode; + uniqueTaxonomies.insert(gnode); totals.push_back(gsize); } @@ -102,7 +102,7 @@ PhyloTree::PhyloTree(ifstream& in, string filename){ for (int i = 0; i < numGenus; i++) { in >> gnode >> gsize; m->gobble(in); - 
uniqueTaxonomies[gnode] = gnode; + uniqueTaxonomies.insert(gnode); totals.push_back(gsize); } @@ -260,7 +260,7 @@ int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){ //use print to reassign the taxa id taxon = getNextTaxon(seqTaxonomy, seqName); - if (taxon == "") { m->mothurOut(seqName + " has an error in the taxonomy. This may be due to a ;;"); m->mothurOutEndLine(); if (currentNode != 0) { uniqueTaxonomies[currentNode] = currentNode; } break; } + if (taxon == "") { m->mothurOut(seqName + " has an error in the taxonomy. This may be due to a ;;"); m->mothurOutEndLine(); if (currentNode != 0) { uniqueTaxonomies.insert(currentNode); } break; } childPointer = tree[currentNode].children.find(taxon); @@ -280,7 +280,7 @@ int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){ name2Taxonomy[seqName] = currentNode; } - if (seqTaxonomy == "") { uniqueTaxonomies[currentNode] = currentNode; } + if (seqTaxonomy == "") { uniqueTaxonomies.insert(currentNode); } } return 0; @@ -295,9 +295,16 @@ vector PhyloTree::getGenusNodes() { try { genusIndex.clear(); //generate genusIndexes - map::iterator it2; - for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { genusIndex.push_back(it2->first); } - + set::iterator it2; + map temp; + for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { genusIndex.push_back(*it2); temp[*it2] = genusIndex.size()-1; } + + for (map::iterator itName = name2Taxonomy.begin(); itName != name2Taxonomy.end(); itName++) { + map::iterator itTemp = temp.find(itName->second); + if (itTemp != temp.end()) { name2GenusNodeIndex[itName->first] = itTemp->second; } + else { m->mothurOut("[ERROR]: trouble making name2GenusNodeIndex, aborting.\n"); m->control_pressed = true; } + } + return genusIndex; } catch(exception& e) { @@ -541,8 +548,8 @@ void PhyloTree::printTreeNodes(string treefilename) { //print genus nodes outTree << endl << uniqueTaxonomies.size() << endl; - map::iterator it2; - for 
(it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl; } + set::iterator it2; + for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { outTree << *it2 << '\t' << tree[*it2].accessions.size() << endl; } outTree << endl; outTree.close(); @@ -594,12 +601,12 @@ string PhyloTree::getName(int i ){ } } /**************************************************************************************************/ -int PhyloTree::getIndex(string seqName){ +int PhyloTree::getGenusIndex(string seqName){ try { - map::iterator itFind = name2Taxonomy.find(seqName); + map::iterator itFind = name2GenusNodeIndex.find(seqName); - if (itFind != name2Taxonomy.end()) { return name2Taxonomy[seqName]; } - else { m->mothurOut("Cannot find " + seqName + ". Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);} + if (itFind != name2GenusNodeIndex.end()) { return itFind->second; } + else { m->mothurOut("Cannot find " + seqName + ". Could be a mismatch with taxonomy and template files. 
Cannot continue."); m->mothurOutEndLine(); exit(1);} } catch(exception& e) { m->errorOut(e, "PhyloTree", "get"); diff --git a/phylotree.h b/phylotree.h index 7aae8f1..e000220 100644 --- a/phylotree.h +++ b/phylotree.h @@ -44,7 +44,7 @@ public: TaxNode get(int i); TaxNode get(string seqName); string getName(int i); - int getIndex(string seqName); + int getGenusIndex(string seqName); string getFullTaxonomy(string); //pass a sequence name return taxonomy int getMaxLevel() { return maxLevel; } @@ -63,7 +63,8 @@ private: vector genusIndex; //holds the indexes in tree where the genus level taxonomies are stored vector totals; //holds the numSeqs at each genus level taxonomy map name2Taxonomy; //maps name to index in tree - map uniqueTaxonomies; //map of unique taxonomies + map name2GenusNodeIndex; + set uniqueTaxonomies; //map of unique taxonomies map leafNodes; //used to create static reference taxonomy file //void print(int, ofstream&); int numNodes; diff --git a/prcseqscommand.cpp b/prcseqscommand.cpp index 6b73d44..de2cb20 100644 --- a/prcseqscommand.cpp +++ b/prcseqscommand.cpp @@ -13,8 +13,9 @@ vector PcrSeqsCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); CommandParameter poligos("oligos", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(poligos); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", 
"none", "none",false,false); parameters.push_back(pgroup); CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(ptax); CommandParameter pecoli("ecoli", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(pecoli); CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart); @@ -40,7 +41,7 @@ string PcrSeqsCommand::getHelpString(){ try { string helpString = ""; helpString += "The pcr.seqs command reads a fasta file.\n"; - helpString += "The pcr.seqs command parameters are fasta, oligos, name, group, taxonomy, ecoli, start, end, nomatch, processors, keepprimer and keepdots.\n"; + helpString += "The pcr.seqs command parameters are fasta, oligos, name, group, count, taxonomy, ecoli, start, end, nomatch, processors, keepprimer and keepdots.\n"; helpString += "The ecoli parameter is used to provide a fasta file containing a single reference sequence (e.g. for e. coli) this must be aligned. 
Mothur will trim to the start and end positions of the reference sequence.\n"; helpString += "The start parameter allows you to provide a starting position to trim to.\n"; helpString += "The end parameter allows you to provide a ending position to trim from.\n"; @@ -72,6 +73,7 @@ string PcrSeqsCommand::getOutputFileNameTag(string type, string inputName=""){ else if (type == "taxonomy") { outputFileName = "pcr" + m->getExtension(inputName); } else if (type == "group") { outputFileName = "pcr" + m->getExtension(inputName); } else if (type == "name") { outputFileName = "pcr" + m->getExtension(inputName); } + else if (type == "count") { outputFileName = "pcr" + m->getExtension(inputName); } else if (type == "accnos") { outputFileName = "bad.accnos"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } @@ -93,6 +95,7 @@ PcrSeqsCommand::PcrSeqsCommand(){ outputTypes["taxonomy"] = tempOutNames; outputTypes["group"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; outputTypes["accnos"] = tempOutNames; } catch(exception& e) { @@ -132,6 +135,7 @@ PcrSeqsCommand::PcrSeqsCommand(string option) { outputTypes["group"] = tempOutNames; outputTypes["name"] = tempOutNames; outputTypes["accnos"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -185,6 +189,14 @@ PcrSeqsCommand::PcrSeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["group"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -229,6 +241,19 @@ PcrSeqsCommand::PcrSeqsCommand(string option) { else if(groupfile == "not open"){ groupfile = ""; abort = true; } else { m->setGroupFile(groupfile); } + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + taxfile = validParameter.validFile(parameters, "taxonomy", true); if (taxfile == "not found"){ taxfile = ""; } else if(taxfile == "not open"){ taxfile = ""; abort = true; } @@ -265,10 +290,12 @@ PcrSeqsCommand::PcrSeqsCommand(string option) { } //check to make sure you didn't forget the name file by mistake - if (namefile == "") { - vector files; files.push_back(fastafile); - parser.getNameFile(files); - } + if (countfile == "") { + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } } @@ -339,7 +366,9 @@ int PcrSeqsCommand::execute(){ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } if (taxfile != "") { readTax(badNames); } if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } - + if (countfile != "") { readCount(badNames); } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); for (int i = 0; i < outputNames.size(); 
i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } @@ -373,6 +402,11 @@ int PcrSeqsCommand::execute(){ if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); } } + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } + m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); @@ -1087,6 +1121,63 @@ int PcrSeqsCommand::readTax(set names){ exit(1); } } +//*************************************************************************************************************** +int PcrSeqsCommand::readCount(set badSeqNames){ + try { + ifstream in; + m->openInputFile(countfile, in); + set::iterator it; + + string goodCountFile = outputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile); + outputNames.push_back(goodCountFile); outputTypes["count"].push_back(goodCountFile); + ofstream goodCountOut; m->openOutputFile(goodCountFile, goodCountOut); + + string headers = m->getline(in); m->gobble(in); + goodCountOut << headers << endl; + + string name, rest; int thisTotal, removedCount; removedCount = 0; + bool wroteSomething = false; + while (!in.eof()) { + + if (m->control_pressed) { goodCountOut.close(); in.close(); m->mothurRemove(goodCountFile); return 0; } + + in >> name; m->gobble(in); + in >> thisTotal; m->gobble(in); + rest = m->getline(in); m->gobble(in); + + if (badSeqNames.count(name) != 0) { removedCount+=thisTotal; } + else{ + wroteSomething = true; + goodCountOut << name << '\t' << thisTotal << '\t' << rest << endl; + } + } + in.close(); + goodCountOut.close(); + + if (m->control_pressed) { m->mothurRemove(goodCountFile); } + + if (wroteSomething == false) { m->mothurOut("Your count file contains only sequences from the .accnos file."); m->mothurOutEndLine(); } + + 
//check for groups that have been eliminated + CountTable ct; + if (ct.testGroups(goodCountFile)) { + ct.readTable(goodCountFile); + ct.printTable(goodCountFile); + } + + if (m->control_pressed) { m->mothurRemove(goodCountFile); } + + m->mothurOut("Removed " + toString(removedCount) + " sequences from your count file."); m->mothurOutEndLine(); + + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "PcrSeqsCommand", "readCOunt"); + exit(1); + } +} /**************************************************************************************/ diff --git a/preclustercommand.cpp b/preclustercommand.cpp index 951b200..dadc918 100644 --- a/preclustercommand.cpp +++ b/preclustercommand.cpp @@ -14,8 +14,9 @@ vector PreClusterCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter pdiffs("diffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pdiffs); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); @@ -36,9 +37,10 @@ string PreClusterCommand::getHelpString(){ string helpString = ""; helpString += "The pre.cluster command groups 
sequences that are within a given number of base mismatches.\n"; helpString += "The pre.cluster command outputs a new fasta and name file.\n"; - helpString += "The pre.cluster command parameters are fasta, names and diffs. The fasta parameter is required. \n"; - helpString += "The names parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n"; + helpString += "The pre.cluster command parameters are fasta, name, group, count, processors and diffs. The fasta parameter is required. \n"; + helpString += "The name parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n"; helpString += "The group parameter allows you to provide a group file so you can cluster by group. \n"; + helpString += "The count parameter allows you to provide a count file so you can cluster by group. \n"; helpString += "The diffs parameter allows you to specify maximum number of mismatched bases allowed between sequences in a grouping. 
The default is 1.\n"; helpString += "The pre.cluster command should be in the following format: \n"; helpString += "pre.cluster(fasta=yourFastaFile, names=yourNamesFile, diffs=yourMaxDiffs) \n"; @@ -63,6 +65,7 @@ string PreClusterCommand::getOutputFileNameTag(string type, string inputName="") else { if (type == "fasta") { outputFileName = "precluster" + m->getExtension(inputName); } else if (type == "name") { outputFileName = "precluster.names"; } + else if (type == "count") { outputFileName = "precluster.count_table"; } else if (type == "map") { outputFileName = "precluster.map"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } @@ -81,6 +84,7 @@ PreClusterCommand::PreClusterCommand(){ vector tempOutNames; outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; outputTypes["map"] = tempOutNames; } catch(exception& e) { @@ -117,6 +121,7 @@ PreClusterCommand::PreClusterCommand(string option) { outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; outputTypes["map"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -146,6 +151,14 @@ PreClusterCommand::PreClusterCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["group"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { parameters["count"] = inputDir + it->second; } + } } //check for required parameters @@ -175,6 +188,25 @@ PreClusterCommand::PreClusterCommand(string option) { if (groupfile == "not found") { groupfile = ""; bygroup = false; } else if (groupfile == "not open") { abort = true; groupfile = ""; } else { m->setGroupFile(groupfile); bygroup = true; } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not found") { countfile = ""; } + else if (countfile == "not open") { abort = true; countfile = ""; } + else { + m->setCountTableFile(countfile); + ct.readTable(countfile); + if (ct.hasGroupInfo()) { bygroup = true; } + else { bygroup = false; } + } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + string temp = validParameter.validFile(parameters, "diffs", false); if(temp == "not found"){ temp = "1"; } m->mothurConvert(temp, diffs); @@ -183,10 +215,12 @@ PreClusterCommand::PreClusterCommand(string option) { m->setProcessors(temp); m->mothurConvert(temp, processors); - if (namefile == "") { - vector files; files.push_back(fastafile); - parser.getNameFile(files); - } + if (countfile == "") { + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } } @@ -207,10 +241,11 @@ int PreClusterCommand::execute(){ string fileroot = outputDir + m->getRootName(m->getSimpleName(fastafile)); string newFastaFile = fileroot + getOutputFileNameTag("fasta", fastafile); string newNamesFile = fileroot + getOutputFileNameTag("name"); + string newCountFile = fileroot + getOutputFileNameTag("count"); string newMapFile = fileroot + getOutputFileNameTag("map"); //add group name if by group 
outputNames.push_back(newFastaFile); outputTypes["fasta"].push_back(newFastaFile); - outputNames.push_back(newNamesFile); outputTypes["name"].push_back(newNamesFile); - + if (countfile == "") { outputNames.push_back(newNamesFile); outputTypes["name"].push_back(newNamesFile); } + else { outputNames.push_back(newCountFile); outputTypes["count"].push_back(newCountFile); } if (bygroup) { //clear out old files @@ -219,39 +254,45 @@ int PreClusterCommand::execute(){ newMapFile = fileroot + "precluster."; //parse fasta and name file by group - SequenceParser* parser; - if (namefile != "") { parser = new SequenceParser(groupfile, fastafile, namefile); } - else { parser = new SequenceParser(groupfile, fastafile); } - - vector groups = parser->getNamesOfGroups(); - - if(processors == 1) { driverGroups(parser, newFastaFile, newNamesFile, newMapFile, 0, groups.size(), groups); } - else { createProcessesGroups(parser, newFastaFile, newNamesFile, newMapFile, groups); } - - delete parser; - - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } - - //run unique.seqs for deconvolute results - string inputString = "fasta=" + newFastaFile; - if (namefile != "") { inputString += ", name=" + newNamesFile; } - m->mothurOutEndLine(); - m->mothurOut("/******************************************/"); m->mothurOutEndLine(); - m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); - m->mothurCalling = true; + vector groups; + if (countfile != "") { + cparser = new SequenceCountParser(countfile, fastafile); + groups = cparser->getNamesOfGroups(); + }else { + if (namefile != "") { parser = new SequenceParser(groupfile, fastafile, namefile); } + else { parser = new SequenceParser(groupfile, fastafile); } + groups = parser->getNamesOfGroups(); + } - Command* uniqueCommand = new DeconvoluteCommand(inputString); - uniqueCommand->execute(); - - map > filenames = uniqueCommand->getOutputFiles(); - - 
delete uniqueCommand; - m->mothurCalling = false; - m->mothurOut("/******************************************/"); m->mothurOutEndLine(); - - m->renameFile(filenames["fasta"][0], newFastaFile); - m->renameFile(filenames["name"][0], newNamesFile); - + if(processors == 1) { driverGroups(newFastaFile, newNamesFile, newMapFile, 0, groups.size(), groups); } + else { createProcessesGroups(newFastaFile, newNamesFile, newMapFile, groups); } + + if (countfile != "") { + mergeGroupCounts(newCountFile, newNamesFile, newFastaFile); + delete cparser; + }else { + delete parser; + //run unique.seqs for deconvolute results + string inputString = "fasta=" + newFastaFile; + if (namefile != "") { inputString += ", name=" + newNamesFile; } + m->mothurOutEndLine(); + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); + m->mothurCalling = true; + + Command* uniqueCommand = new DeconvoluteCommand(inputString); + uniqueCommand->execute(); + + map > filenames = uniqueCommand->getOutputFiles(); + + delete uniqueCommand; + m->mothurCalling = false; + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + + m->renameFile(filenames["fasta"][0], newFastaFile); + m->renameFile(filenames["name"][0], newNamesFile); + } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run pre.cluster."); m->mothurOutEndLine(); }else { @@ -272,8 +313,9 @@ int PreClusterCommand::execute(){ m->mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine(); m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); - printData(newFastaFile, newNamesFile); - + if (countfile != "") { newNamesFile = newCountFile; } + 
printData(newFastaFile, newNamesFile, ""); + m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); } @@ -295,6 +337,11 @@ int PreClusterCommand::execute(){ if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); } } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } return 0; @@ -305,7 +352,7 @@ int PreClusterCommand::execute(){ } } /**************************************************************************************************/ -int PreClusterCommand::createProcessesGroups(SequenceParser* parser, string newFName, string newNName, string newMFile, vector groups) { +int PreClusterCommand::createProcessesGroups(string newFName, string newNName, string newMFile, vector groups) { try { vector processIDS; @@ -336,7 +383,7 @@ int PreClusterCommand::createProcessesGroups(SequenceParser* parser, string newF process++; }else if (pid == 0){ outputNames.clear(); - num = driverGroups(parser, newFName + toString(getpid()) + ".temp", newNName + toString(getpid()) + ".temp", newMFile, lines[process].start, lines[process].end, groups); + num = driverGroups(newFName + toString(getpid()) + ".temp", newNName + toString(getpid()) + ".temp", newMFile, lines[process].start, lines[process].end, groups); string tempFile = toString(getpid()) + ".outputNames.temp"; ofstream outTemp; @@ -355,7 +402,7 @@ int PreClusterCommand::createProcessesGroups(SequenceParser* parser, string newF } //do my part - num = driverGroups(parser, newFName, newNName, newMFile, lines[0].start, lines[0].end, groups); + num = driverGroups(newFName, newNName, newMFile, lines[0].start, lines[0].end, groups); //force parent to wait until all the processes are done for (int i=0;i groups){ +int 
PreClusterCommand::driverGroups(string newFFile, string newNFile, string newMFile, int start, int end, vector groups){ try { int numSeqs = 0; @@ -458,24 +505,29 @@ int PreClusterCommand::driverGroups(SequenceParser* parser, string newFFile, str m->mothurOutEndLine(); m->mothurOut("Processing group " + groups[i] + ":"); m->mothurOutEndLine(); map thisNameMap; - if (namefile != "") { thisNameMap = parser->getNameMap(groups[i]); } - vector thisSeqs = parser->getSeqs(groups[i]); - + vector thisSeqs; + if (groupfile != "") { + thisSeqs = parser->getSeqs(groups[i]); + }else if (countfile != "") { + thisSeqs = cparser->getSeqs(groups[i]); + } + if (namefile != "") { thisNameMap = parser->getNameMap(groups[i]); } + //fill alignSeqs with this groups info. - numSeqs = loadSeqs(thisNameMap, thisSeqs); + numSeqs = loadSeqs(thisNameMap, thisSeqs, groups[i]); if (m->control_pressed) { return 0; } if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); m->control_pressed = true; return 0; } - int count = process(newMFile+groups[i]+".map"); + int count= process(newMFile+groups[i]+".map"); outputNames.push_back(newMFile+groups[i]+".map"); outputTypes["map"].push_back(newMFile+groups[i]+".map"); if (m->control_pressed) { return 0; } m->mothurOut("Total number of sequences before pre.cluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine(); m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); - printData(newFFile, newNFile); + printData(newFFile, newNFile, groups[i]); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); @@ -559,26 +611,13 @@ int PreClusterCommand::readFASTA(){ //ifstream inNames; ifstream inFasta; - //m->openInputFile(namefile, inNames); m->openInputFile(fastafile, inFasta); - - //string firstCol, secondCol, nameString; set lengths; while 
(!inFasta.eof()) { if (m->control_pressed) { inFasta.close(); return 0; } - - //inNames >> firstCol >> secondCol; - //nameString = secondCol; - - //m->gobble(inNames); - //int size = 1; - //while (secondCol.find_first_of(',') != -1) { - // size++; - // secondCol = secondCol.substr(secondCol.find_first_of(',')+1, secondCol.length()); - //} - + Sequence seq(inFasta); m->gobble(inFasta); if (seq.getName() != "") { //can get "" if commented line is at end of fasta file @@ -592,14 +631,15 @@ int PreClusterCommand::readFASTA(){ lengths.insert(seq.getAligned().length()); } }else { //no names file, you are identical to yourself - seqPNode tempNode(1, seq, seq.getName()); + int numRep = 1; + if (countfile != "") { numRep = ct.getNumSeqs(seq.getName()); } + seqPNode tempNode(numRep, seq, seq.getName()); alignSeqs.push_back(tempNode); lengths.insert(seq.getAligned().length()); } } } inFasta.close(); - //inNames.close(); if (lengths.size() > 1) { m->control_pressed = true; m->mothurOut("[ERROR]: your sequences are not all the same length. 
pre.cluster requires sequences to be aligned."); m->mothurOutEndLine(); } else if (lengths.size() == 1) { length = *(lengths.begin()); } @@ -613,13 +653,15 @@ int PreClusterCommand::readFASTA(){ } } /**************************************************************************************************/ -int PreClusterCommand::loadSeqs(map& thisName, vector& thisSeqs){ +int PreClusterCommand::loadSeqs(map& thisName, vector& thisSeqs, string group){ try { set lengths; alignSeqs.clear(); map::iterator it; bool error = false; - + map thisCount; + if (countfile != "") { thisCount = cparser->getCountTable(group); } + for (int i = 0; i < thisSeqs.size(); i++) { if (m->control_pressed) { return 0; } @@ -641,12 +683,20 @@ int PreClusterCommand::loadSeqs(map& thisName, vector& lengths.insert(thisSeqs[i].getAligned().length()); } }else { //no names file, you are identical to yourself - seqPNode tempNode(1, thisSeqs[i], thisSeqs[i].getName()); + int numRep = 1; + if (countfile != "") { + map::iterator it2 = thisCount.find(thisSeqs[i].getName()); + + //should never be true since parser checks for this + if (it2 == thisCount.end()) { m->mothurOut(thisSeqs[i].getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); error = true; } + else { numRep = it2->second; } + } + seqPNode tempNode(numRep, thisSeqs[i], thisSeqs[i].getName()); alignSeqs.push_back(tempNode); lengths.insert(thisSeqs[i].getAligned().length()); } } - + if (lengths.size() > 1) { error = true; m->mothurOut("[ERROR]: your sequences are not all the same length. 
pre.cluster requires sequences to be aligned."); m->mothurOutEndLine(); } else if (lengths.size() == 1) { length = *(lengths.begin()); } @@ -683,10 +733,84 @@ int PreClusterCommand::calcMisMatches(string seq1, string seq2){ exit(1); } } +/**************************************************************************************************/ + +int PreClusterCommand::mergeGroupCounts(string newcount, string newname, string newfasta){ + try { + ifstream inNames; + m->openInputFile(newname, inNames); + + string group, first, second; + set uniqueNames; + while (!inNames.eof()) { + if (m->control_pressed) { break; } + inNames >> group; m->gobble(inNames); + inNames >> first; m->gobble(inNames); + inNames >> second; m->gobble(inNames); + + vector names; + m->splitAtComma(second, names); + + uniqueNames.insert(first); + + int total = ct.getGroupCount(first, group); + for (int i = 1; i < names.size(); i++) { + total += ct.getGroupCount(names[i], group); + ct.setAbund(names[i], group, 0); + } + ct.setAbund(first, group, total); + } + inNames.close(); + + vector namesOfSeqs = ct.getNamesOfSeqs(); + for (int i = 0; i < namesOfSeqs.size(); i++) { + if (ct.getNumSeqs(namesOfSeqs[i]) == 0) { + ct.remove(namesOfSeqs[i]); + } + } + + ct.printTable(newcount); + m->mothurRemove(newname); + + if (bygroup) { //if by group, must remove the duplicate seqs that are named the same + ifstream in; + m->openInputFile(newfasta, in); + + ofstream out; + m->openOutputFile(newfasta+"temp", out); + + int count = 0; + set already; + while(!in.eof()) { + if (m->control_pressed) { break; } + + Sequence seq(in); m->gobble(in); + + if (seq.getName() != "") { + count++; + if (already.count(seq.getName()) == 0) { + seq.printSequence(out); + already.insert(seq.getName()); + } + } + } + in.close(); + out.close(); + m->mothurRemove(newfasta); + m->renameFile(newfasta+"temp", newfasta); + } + return 0; + + } + catch(exception& e) { + m->errorOut(e, "PreClusterCommand", "mergeGroupCounts"); + exit(1); + } +} 
/**************************************************************************************************/ -void PreClusterCommand::printData(string newfasta, string newname){ +void PreClusterCommand::printData(string newfasta, string newname, string group){ try { ofstream outFasta; ofstream outNames; @@ -699,10 +823,14 @@ void PreClusterCommand::printData(string newfasta, string newname){ m->openOutputFile(newname, outNames); } + if ((countfile != "") && (group == "")) { outNames << "Representative_Sequence\ttotal\n"; } for (int i = 0; i < alignSeqs.size(); i++) { if (alignSeqs[i].numIdentical != 0) { alignSeqs[i].seq.printSequence(outFasta); - outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; + if (countfile != "") { + if (group != "") { outNames << group << '\t' << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; } + else { outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].numIdentical << endl; } + }else { outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; } } } diff --git a/preclustercommand.h b/preclustercommand.h index 082bff2..e63afa6 100644 --- a/preclustercommand.h +++ b/preclustercommand.h @@ -15,6 +15,7 @@ #include "command.hpp" #include "sequence.hpp" #include "sequenceparser.h" +#include "sequencecountparser.h" /************************************************************/ struct seqPNode { @@ -28,7 +29,13 @@ struct seqPNode { ~seqPNode() {} }; /************************************************************/ -inline bool comparePriority(seqPNode first, seqPNode second) { return (first.numIdentical > second.numIdentical); } +inline bool comparePriority(seqPNode first, seqPNode second) { + if (first.numIdentical > second.numIdentical) { return true; } + else if (first.numIdentical == second.numIdentical) { + if (first.seq.getName() > second.seq.getName()) { return true; } + } + return false; +} //************************************************************/ class 
PreClusterCommand : public Command { @@ -58,9 +65,13 @@ private: linePair(int i, int j) : start(i), end(j) {} }; + SequenceParser* parser; + SequenceCountParser* cparser; + CountTable ct; + int diffs, length, processors; bool abort, bygroup; - string fastafile, namefile, outputDir, groupfile; + string fastafile, namefile, outputDir, groupfile, countfile; vector alignSeqs; //maps the number of identical seqs to a sequence map names; //represents the names file first column maps to second column map sizes; //this map a seq name to the number of identical seqs in the names file @@ -73,11 +84,12 @@ private: void readNameFile(); //int readNamesFASTA(); int calcMisMatches(string, string); - void printData(string, string); //fasta filename, names file name + void printData(string, string, string); //fasta filename, names file name int process(string); - int loadSeqs(map&, vector&); - int driverGroups(SequenceParser*, string, string, string, int, int, vector groups); - int createProcessesGroups(SequenceParser*, string, string, string, vector); + int loadSeqs(map&, vector&, string); + int driverGroups(string, string, string, int, int, vector groups); + int createProcessesGroups(string, string, string, vector); + int mergeGroupCounts(string, string, string); }; /**************************************************************************************************/ @@ -87,7 +99,7 @@ private: struct preClusterData { string fastafile; string namefile; - string groupfile; + string groupfile, countfile; string newFName, newNName, newMName; MothurOut* m; int start; @@ -97,7 +109,7 @@ struct preClusterData { vector mapFileNames; preClusterData(){} - preClusterData(string f, string n, string g, string nff, string nnf, string nmf, vector gr, MothurOut* mout, int st, int en, int d, int tid) { + preClusterData(string f, string n, string g, string c, string nff, string nnf, string nmf, vector gr, MothurOut* mout, int st, int en, int d, int tid) { fastafile = f; namefile = n; groupfile = g; 
@@ -110,6 +122,7 @@ struct preClusterData { diffs = d; threadID = tid; groups = gr; + countfile = c; } }; @@ -124,10 +137,15 @@ static DWORD WINAPI MyPreclusterThreadFunction(LPVOID lpParam){ //parse fasta and name file by group SequenceParser* parser; - if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile); } - else { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile); } - - int numSeqs = 0; + SequenceCountParser* cparser; + if (pDataArray->countfile != "") { + cparser = new SequenceCountParser(pDataArray->countfile, pDataArray->fastafile); + }else { + if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile); } + else { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile); } + } + + int numSeqs = 0; vector alignSeqs; //clear out old files ofstream outF; pDataArray->m->openOutputFile(pDataArray->newFName, outF); outF.close(); @@ -143,8 +161,13 @@ static DWORD WINAPI MyPreclusterThreadFunction(LPVOID lpParam){ pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Processing group " + pDataArray->groups[k] + ":"); pDataArray->m->mothurOutEndLine(); map thisNameMap; - if (pDataArray->namefile != "") { thisNameMap = parser->getNameMap(pDataArray->groups[k]); } - vector thisSeqs = parser->getSeqs(pDataArray->groups[k]); + vector thisSeqs; + if (pDataArray->groupfile != "") { + thisSeqs = parser->getSeqs(pDataArray->groups[k]); + }else if (pDataArray->countfile != "") { + thisSeqs = cparser->getSeqs(pDataArray->groups[k]); + } + if (pDataArray->namefile != "") { thisNameMap = parser->getNameMap(pDataArray->groups[k]); } //fill alignSeqs with this groups info. 
//////////////////////////////////////////////////// @@ -154,6 +177,9 @@ static DWORD WINAPI MyPreclusterThreadFunction(LPVOID lpParam){ alignSeqs.clear(); map::iterator it; bool error = false; + map thisCount; + if (pDataArray->countfile != "") { thisCount = cparser->getCountTable(pDataArray->groups[k]); } + for (int i = 0; i < thisSeqs.size(); i++) { @@ -176,8 +202,16 @@ static DWORD WINAPI MyPreclusterThreadFunction(LPVOID lpParam){ if (thisSeqs[i].getAligned().length() > length) { length = thisSeqs[i].getAligned().length(); } } }else { //no names file, you are identical to yourself - seqPNode tempNode(1, thisSeqs[i], thisSeqs[i].getName()); - alignSeqs.push_back(tempNode); + int numRep = 1; + if (pDataArray->countfile != "") { + map::iterator it2 = thisCount.find(thisSeqs[i].getName()); + + //should never be true since parser checks for this + if (it2 == thisCount.end()) { pDataArray->m->mothurOut(thisSeqs[i].getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); error = true; } + else { numRep = it2->second; } + } + seqPNode tempNode(numRep, thisSeqs[i], thisSeqs[i].getName()); + alignSeqs.push_back(tempNode); if (thisSeqs[i].getAligned().length() > length) { length = thisSeqs[i].getAligned().length(); } } } @@ -274,7 +308,9 @@ static DWORD WINAPI MyPreclusterThreadFunction(LPVOID lpParam){ for (int i = 0; i < alignSeqs.size(); i++) { if (alignSeqs[i].numIdentical != 0) { alignSeqs[i].seq.printSequence(outFasta); - outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; + if (pDataArray->countfile != "") { outNames << pDataArray->groups[k] << '\t' << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; + }else { outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; } + } } diff --git a/randomforest.cpp b/randomforest.cpp new file mode 100644 index 0000000..36a2c1a --- /dev/null +++ b/randomforest.cpp @@ -0,0 +1,156 @@ +// +// randomforest.cpp +// Mothur +// +// 
Created by Sarah Westcott on 10/2/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. +// + +#include "randomforest.hpp" + +/***********************************************************************/ + +RandomForest::RandomForest(const vector > dataSet,const int numDecisionTrees, + const string treeSplitCriterion = "informationGain") : AbstractRandomForest(dataSet, numDecisionTrees, treeSplitCriterion) { + m = MothurOut::getInstance(); +} + +/***********************************************************************/ +// DONE +int RandomForest::calcForrestErrorRate() { + try { + int numCorrect = 0; + for (map >::iterator it = globalOutOfBagEstimates.begin(); it != globalOutOfBagEstimates.end(); it++) { + + if (m->control_pressed) { return 0; } + + int indexOfSample = it->first; + vector predictedOutComes = it->second; + vector::iterator maxPredictedOutComeIterator = max_element(predictedOutComes.begin(), predictedOutComes.end()); + int majorityVotedOutcome = (int)(maxPredictedOutComeIterator - predictedOutComes.begin()); + int realOutcome = dataSet[indexOfSample][numFeatures]; + + if (majorityVotedOutcome == realOutcome) { numCorrect++; } + } + + // TODO: save or return forrestErrorRate for future use; + double forrestErrorRate = 1 - ((double)numCorrect / (double)globalOutOfBagEstimates.size()); + + m->mothurOut("numCorrect = " + toString(numCorrect)+ "\n"); + m->mothurOut("forrestErrorRate = " + toString(forrestErrorRate)+ "\n"); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "RandomForest", "calcForrestErrorRate"); + exit(1); + } +} + +/***********************************************************************/ +// DONE +int RandomForest::calcForrestVariableImportance(string filename) { + try { + + // TODO: need to add try/catch operators to fix this + // follow the link: http://en.wikipedia.org/wiki/Dynamic_cast + //if you are going to dynamically cast, aren't you undoing the advantage of abstraction. Why abstract at all? 
+ //could cause maintenance issues later if other types of Abstract decison trees are created that cannot be cast as a decision tree. + for (int i = 0; i < decisionTrees.size(); i++) { + if (m->control_pressed) { return 0; } + DecisionTree* decisionTree = dynamic_cast(decisionTrees[i]); + + for (int j = 0; j < numFeatures; j++) { + globalVariableImportanceList[j] += (double)decisionTree->variableImportanceList[j]; + } + } + + for (int i = 0; i < numFeatures; i++) { + cout << "[" << i << ',' << globalVariableImportanceList[i] << "], "; + globalVariableImportanceList[i] /= (double)numDecisionTrees; + } + + vector< vector > globalVariableRanks; + for (int i = 0; i < globalVariableImportanceList.size(); i++) { + if (globalVariableImportanceList[i] > 0) { + vector globalVariableRank(2, 0); + globalVariableRank[0] = i; globalVariableRank[1] = globalVariableImportanceList[i]; + globalVariableRanks.push_back(globalVariableRank); + } + } + + VariableRankDescendingSorterDouble variableRankDescendingSorter; + sort(globalVariableRanks.begin(), globalVariableRanks.end(), variableRankDescendingSorter); + ofstream out; + m->openOutputFile(filename, out); + out <<"OTU\tRank\n"; + for (int i = 0; i < globalVariableRanks.size(); i++) { + out << m->currentBinLabels[(int)globalVariableRanks[i][0]] << '\t' << globalVariableImportanceList[globalVariableRanks[i][0]] << endl; + } + out.close(); + return 0; + } + catch(exception& e) { + m->errorOut(e, "RandomForest", "calcForrestVariableImportance"); + exit(1); + } +} +/***********************************************************************/ +// DONE +int RandomForest::populateDecisionTrees() { + try { + + for (int i = 0; i < numDecisionTrees; i++) { + if (m->control_pressed) { return 0; } + if (((i+1) % 10) == 0) { m->mothurOut("Creating " + toString(i+1) + " (th) Decision tree\n"); } + // TODO: need to first fix if we are going to use pointer based system or anything else + DecisionTree* decisionTree = new DecisionTree(dataSet, 
globalDiscardedFeatureIndices, OptimumFeatureSubsetSelector("log2"), treeSplitCriterion); + decisionTree->calcTreeVariableImportanceAndError(); + if (m->control_pressed) { return 0; } + updateGlobalOutOfBagEstimates(decisionTree); + if (m->control_pressed) { return 0; } + decisionTree->purgeDataSetsFromTree(); + if (m->control_pressed) { return 0; } + decisionTrees.push_back(decisionTree); + } + + if (m->debug) { + // m->mothurOut("globalOutOfBagEstimates = " + toStringVectorMap(globalOutOfBagEstimates)+ "\n"); + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "RandomForest", "populateDecisionTrees"); + exit(1); + } +} +/***********************************************************************/ +// TODO: need to finalize bettween reference and pointer for DecisionTree [partially solved] +// TODO: make this pure virtual in superclass +// DONE +int RandomForest::updateGlobalOutOfBagEstimates(DecisionTree* decisionTree) { + try { + for (map::iterator it = decisionTree->outOfBagEstimates.begin(); it != decisionTree->outOfBagEstimates.end(); it++) { + + if (m->control_pressed) { return 0; } + + int indexOfSample = it->first; + int predictedOutcomeOfSample = it->second; + + if (globalOutOfBagEstimates.count(indexOfSample) == 0) { + globalOutOfBagEstimates[indexOfSample] = vector(decisionTree->numOutputClasses, 0); + }; + + globalOutOfBagEstimates[indexOfSample][predictedOutcomeOfSample] += 1; + } + return 0; + } + catch(exception& e) { + m->errorOut(e, "RandomForest", "updateGlobalOutOfBagEstimates"); + exit(1); + } +} +/***********************************************************************/ + + diff --git a/randomforest.hpp b/randomforest.hpp new file mode 100755 index 0000000..716d1a1 --- /dev/null +++ b/randomforest.hpp @@ -0,0 +1,45 @@ +// +// randomforest.hpp +// rrf-fs-prototype +// +// Created by Abu Zaher Faridee on 7/20/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. 
+// + +#ifndef rrf_fs_prototype_randomforest_hpp +#define rrf_fs_prototype_randomforest_hpp + +#include "macros.h" +#include "abstractrandomforest.hpp" +#include "decisiontree.hpp" + +class RandomForest: public AbstractRandomForest { + +public: + + // DONE + RandomForest(const vector > dataSet,const int numDecisionTrees, const string); + + + //NOTE:: if you are going to dynamically cast, aren't you undoing the advantage of abstraction. Why abstract at all? + //could cause maintenance issues later if other types of Abstract decison trees are created that cannot be cast as a decision tree. + virtual ~RandomForest() { + for (vector::iterator it = decisionTrees.begin(); it != decisionTrees.end(); it++) { + // we know that this is decision tree, so we can do a dynamic_case here + DecisionTree* decisionTree = dynamic_cast(*it); + // calling the destructor by deleting + delete decisionTree; + } + } + + int calcForrestErrorRate(); + int calcForrestVariableImportance(string); + int populateDecisionTrees(); + int updateGlobalOutOfBagEstimates(DecisionTree* decisionTree); + +private: + MothurOut* m; + +}; + +#endif diff --git a/readcluster.cpp b/readcluster.cpp index b6cb71d..a6adabb 100644 --- a/readcluster.cpp +++ b/readcluster.cpp @@ -42,6 +42,26 @@ int ReadCluster::read(NameAssignment*& nameMap){ } } /***********************************************************************/ +int ReadCluster::read(CountTable*& ct){ + try { + + if (format == "phylip") { convertPhylip2Column(ct); } + else { list = new ListVector(ct->getListVector()); } + + if (m->control_pressed) { return 0; } + + if (sortWanted) { OutPutFile = m->sortFile(distFile, outputDir); } + else { OutPutFile = distFile; } //for use by clusters splitMatrix to convert a phylip matrix to column + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ReadCluster", "read"); + exit(1); + } +} +/***********************************************************************/ int 
ReadCluster::convertPhylip2Column(NameAssignment*& nameMap){ try { @@ -224,6 +244,181 @@ int ReadCluster::convertPhylip2Column(NameAssignment*& nameMap){ } /***********************************************************************/ +int ReadCluster::convertPhylip2Column(CountTable*& ct){ + try { + //convert phylip file to column file + map rowToName; + map::iterator it; + + ifstream in; + ofstream out; + string tempFile = distFile + ".column.temp"; + + m->openInputFile(distFile, in); m->gobble(in); + m->openOutputFile(tempFile, out); + + float distance; + int square, nseqs; + string name; + vector matrixNames; + + string numTest; + in >> numTest >> name; + + if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting."); m->mothurOutEndLine(); exit(1); } + else { convert(numTest, nseqs); } + + rowToName[0] = name; + matrixNames.push_back(name); + + if(ct == NULL){ + list = new ListVector(nseqs); + list->set(0, name); + } + else{ list = new ListVector(ct->getListVector()); } + + char d; + while((d=in.get()) != EOF){ + + if(isalnum(d)){ + square = 1; + in.putback(d); + for(int i=0;i> distance; + } + break; + } + if(d == '\n'){ + square = 0; + break; + } + } + + if(square == 0){ + + for(int i=1;i> name; + rowToName[i] = name; + matrixNames.push_back(name); + + //there's A LOT of repeated code throughout this method... 
+ if(ct == NULL){ + list->set(i, name); + + for(int j=0;jcontrol_pressed) { in.close(); out.close(); m->mothurRemove(tempFile); return 0; } + + in >> distance; + + if (distance == -1) { distance = 1000000; } + + if(distance < cutoff){ + out << i << '\t' << j << '\t' << distance << endl; + } + } + + } + else{ + + for(int j=0;jcontrol_pressed) { in.close(); out.close(); m->mothurRemove(tempFile); return 0; } + + in >> distance; + + if (distance == -1) { distance = 1000000; } + + if(distance < cutoff){ + out << i << '\t' << j << '\t' << distance << endl; + } + } + } + } + } + else{ + for(int i=1;i> name; + rowToName[i] = name; + matrixNames.push_back(name); + + if(ct == NULL){ + list->set(i, name); + for(int j=0;jcontrol_pressed) { in.close(); out.close(); m->mothurRemove(tempFile); return 0; } + + in >> distance; + + if (distance == -1) { distance = 1000000; } + + if(distance < cutoff && j < i){ + out << i << '\t' << j << '\t' << distance << endl; + } + } + } + else{ + for(int j=0;jcontrol_pressed) { in.close(); out.close(); m->mothurRemove(tempFile); return 0; } + + in >> distance; + + if (distance == -1) { distance = 1000000; } + + if(distance < cutoff && j < i){ + out << i << '\t' << j << '\t' << distance << endl; + } + + } + } + } + } + + list->setLabel("0"); + in.close(); + out.close(); + + if(ct == NULL){ + ct = new CountTable(); + for(int i=0;ipush_back(matrixNames[i]); + } + } + + + ifstream in2; + ofstream out2; + + string outputFile = m->getRootName(distFile) + "column.dist"; + m->openInputFile(tempFile, in2); + m->openOutputFile(outputFile, out2); + + int first, second; + float dist; + + while (in2) { + if (m->control_pressed) { in2.close(); out2.close(); m->mothurRemove(tempFile); m->mothurRemove(outputFile); return 0; } + + in2 >> first >> second >> dist; + out2 << rowToName[first] << '\t' << rowToName[second] << '\t' << dist << endl; + m->gobble(in2); + } + in2.close(); + out2.close(); + + m->mothurRemove(tempFile); + distFile = outputFile; + + if 
(m->control_pressed) { m->mothurRemove(outputFile); } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ReadCluster", "convertPhylip2Column"); + exit(1); + } +} +/***********************************************************************/ + ReadCluster::~ReadCluster(){} /***********************************************************************/ diff --git a/readcluster.h b/readcluster.h index a838dac..7ea579c 100644 --- a/readcluster.h +++ b/readcluster.h @@ -13,6 +13,7 @@ #include "mothur.h" #include "nameassignment.hpp" #include "listvector.hpp" +#include "counttable.h" /******************************************************/ @@ -23,6 +24,7 @@ public: ReadCluster(string, float, string, bool); ~ReadCluster(); int read(NameAssignment*&); + int read(CountTable*&); string getOutputFile() { return OutPutFile; } void setFormat(string f) { format = f; } ListVector* getListVector() { return list; } @@ -36,6 +38,7 @@ private: bool sortWanted; int convertPhylip2Column(NameAssignment*&); + int convertPhylip2Column(CountTable*&); }; /******************************************************/ diff --git a/readmatrix.hpp b/readmatrix.hpp index 90d5b43..bc3874e 100644 --- a/readmatrix.hpp +++ b/readmatrix.hpp @@ -16,7 +16,6 @@ #include "counttable.h" #include "sparsedistancematrix.h" -class SparseMatrix; class ReadMatrix { diff --git a/readtree.cpp b/readtree.cpp index 6fa4c3d..71c4bd5 100644 --- a/readtree.cpp +++ b/readtree.cpp @@ -20,12 +20,12 @@ ReadTree::ReadTree() { } } /***********************************************************************/ -int ReadTree::AssembleTrees(map nameMap) { +int ReadTree::AssembleTrees() { try { //assemble users trees for (int i = 0; i < Trees.size(); i++) { if (m->control_pressed) { return 0; } - Trees[i]->assembleTree(nameMap); + Trees[i]->assembleTree(); } return 0; } @@ -107,7 +107,7 @@ float ReadTree::readBranchLength(istream& f) { /***********************************************************************/ //This class reads a file in 
Newick form and stores it in a tree. -int ReadNewickTree::read(TreeMap* tmap) { +int ReadNewickTree::read(CountTable* ct) { try { holder = ""; int c, error; @@ -129,12 +129,12 @@ int ReadNewickTree::read(TreeMap* tmap) { } //make new tree - T = new Tree(tmap); + T = new Tree(ct); numNodes = T->getNumNodes(); numLeaves = T->getNumLeaves(); - error = readTreeString(tmap); + error = readTreeString(ct); //save trees for later commands Trees.push_back(T); @@ -143,9 +143,9 @@ int ReadNewickTree::read(TreeMap* tmap) { //if you are a nexus file }else if ((c = filehandle.peek()) == '#') { //get right number of seqs from nexus file. - Tree* temp = new Tree(tmap); delete temp; + Tree* temp = new Tree(ct); delete temp; - nexusTranslation(tmap); //reads file through the translation and updates treemap + nexusTranslation(ct); //reads file through the translation and updates treemap while((c = filehandle.peek()) != EOF) { // get past comments while ((c = filehandle.peek()) != EOF) { @@ -166,12 +166,12 @@ int ReadNewickTree::read(TreeMap* tmap) { filehandle.putback(c); //put back first ( of tree. //make new tree - T = new Tree(tmap); + T = new Tree(ct); numNodes = T->getNumNodes(); numLeaves = T->getNumLeaves(); //read tree info - error = readTreeString(tmap); + error = readTreeString(ct); //save trees for later commands Trees.push_back(T); @@ -191,7 +191,7 @@ int ReadNewickTree::read(TreeMap* tmap) { } /**************************************************************************************************/ //This function read the file through the translation of the sequences names and updates treemap. 
-string ReadNewickTree::nexusTranslation(TreeMap* tmap) { +string ReadNewickTree::nexusTranslation(CountTable* ct) { try { holder = ""; @@ -209,42 +209,14 @@ string ReadNewickTree::nexusTranslation(TreeMap* tmap) { filehandle >> holder; if(holder == "tree" && comment != 1){return holder;} } - - //update treemap - tmap->namesOfSeqs.clear(); - - /*char c; - string number, name; - while ((c = filehandle.peek()) != EOF) { - - filehandle >> number; - - if ((number == "tree") || (number == ";") ) { name = number; break; } - - filehandle >> name; - - char lastChar; - if (name.length() != 0) { lastChar = name[name.length()-1]; } - - if ((name == "tree") || (name == ";") ) { break; } - - if (lastChar == ',') { name.erase(name.end()-1); } //erase the comma - */ - + string number, name; for(int i=0;i> number; filehandle >> name; name.erase(name.end()-1); //erase the comma - - //insert new one with new name - string group = tmap->getGroup(name); - tmap->treemap[toString(number)].groupname = group; - tmap->treemap[toString(number)].vectorIndex = tmap->getIndex(name); - //erase old one. so treemap[sarah].groupnumber is now treemap[1].groupnumber. if number is 1 and name is sarah. 
- tmap->treemap.erase(name); - tmap->namesOfSeqs.push_back(number); + ct->renameSeq(name, toString(number)); } return name; @@ -256,7 +228,7 @@ string ReadNewickTree::nexusTranslation(TreeMap* tmap) { } /**************************************************************************************************/ -int ReadNewickTree::readTreeString(TreeMap* tmap) { +int ReadNewickTree::readTreeString(CountTable* ct) { try { int n = 0; @@ -269,7 +241,7 @@ int ReadNewickTree::readTreeString(TreeMap* tmap) { if(ch == '('){ n = numLeaves; //number of leaves / sequences, we want node 1 to start where the leaves left off - lc = readNewickInt(filehandle, n, T, tmap); + lc = readNewickInt(filehandle, n, T, ct); if (lc == -1) { m->mothurOut("error with lc"); m->mothurOutEndLine(); return -1; } //reports an error in reading if(filehandle.peek()==','){ @@ -281,7 +253,7 @@ int ReadNewickTree::readTreeString(TreeMap* tmap) { } if(rooted != 1){ - rc = readNewickInt(filehandle, n, T, tmap); + rc = readNewickInt(filehandle, n, T, ct); if (rc == -1) { m->mothurOut("error with rc"); m->mothurOutEndLine(); return -1; } //reports an error in reading if(filehandle.peek() == ')'){ readSpecialChar(filehandle,')',"right parenthesis"); @@ -326,7 +298,7 @@ int ReadNewickTree::readTreeString(TreeMap* tmap) { } /**************************************************************************************************/ -int ReadNewickTree::readNewickInt(istream& f, int& n, Tree* T, TreeMap* tmap) { +int ReadNewickTree::readNewickInt(istream& f, int& n, Tree* T, CountTable* ct) { try { if (m->control_pressed) { return -1; } @@ -339,7 +311,7 @@ int ReadNewickTree::readNewickInt(istream& f, int& n, Tree* T, TreeMap* tmap) { //read all children vector childrenNodes; while(f.peek() != ')'){ - int child = readNewickInt(f, n, T, tmap); + int child = readNewickInt(f, n, T, ct); if (child == -1) { return -1; } //reports an error in reading //cout << "child = " << child << endl; childrenNodes.push_back(child); @@ -387,12 
+359,7 @@ int ReadNewickTree::readNewickInt(istream& f, int& n, Tree* T, TreeMap* tmap) { }else{ T->tree[n].setBranchLength(0.0); } - - //T->tree[n].setChildren(lc,rc); - //T->tree[lc].setParent(n); - //T->tree[rc].setParent(n); - //T->printTree(); cout << endl; - + return n++; }else{ @@ -410,33 +377,27 @@ int ReadNewickTree::readNewickInt(istream& f, int& n, Tree* T, TreeMap* tmap) { f.putback(d); //set group info - string group = tmap->getGroup(name); + vector group = ct->getGroups(name); //find index in tree of name int n1 = T->getIndex(name); //adds sequence names that are not in group file to the "xxx" group - if(group == "not found") { + if(group.size() == 0) { m->mothurOut("Name: " + name + " is not in your groupfile, and will be disregarded. \n"); //readOk = -1; return n1; - tmap->namesOfSeqs.push_back(name); - tmap->treemap[name].groupname = "xxx"; - - map::iterator it; - it = tmap->seqsPerGroup.find("xxx"); - if (it == tmap->seqsPerGroup.end()) { //its a new group - tmap->addGroup("xxx"); - tmap->seqsPerGroup["xxx"] = 1; - }else { - tmap->seqsPerGroup["xxx"]++; - } - - group = "xxx"; - } - - vector tempGroup; tempGroup.push_back(group); - - T->tree[n1].setGroup(tempGroup); + vector currentGroups = ct->getNamesOfGroups(); + if (!m->inUsersGroups("xxx", currentGroups)) { ct->addGroup("xxx"); } + currentGroups = ct->getNamesOfGroups(); + vector thisCounts; thisCounts.resize(currentGroups.size(), 0); + for (int h = 0; h < currentGroups.size(); h++) { + if (currentGroups[h] == "xxx") { thisCounts[h] = 1; break; } + } + ct->push_back(name, thisCounts); + + group.push_back("xxx"); + } + T->tree[n1].setGroup(group); T->tree[n1].setChildren(-1,-1); if(blen == 1){ diff --git a/readtree.h b/readtree.h index 6b074de..8a69243 100644 --- a/readtree.h +++ b/readtree.h @@ -11,6 +11,7 @@ #include "mothur.h" #include "tree.h" +#include "counttable.h" #define MAX_LINE 513 #define SKIPLINE(f,c) {while((c=f.get())!=EOF && ((c) != '\n')){}} @@ -24,17 +25,17 @@ class ReadTree { 
ReadTree(); virtual ~ReadTree() {}; - virtual int read(TreeMap*) = 0; + virtual int read(CountTable*) = 0; int readSpecialChar(istream&, char, string); int readNodeChar(istream& f); float readBranchLength(istream& f); vector getTrees() { return Trees; } - int AssembleTrees(map); + int AssembleTrees(); protected: vector Trees; - TreeMap* treeMap; + CountTable* ct; int numNodes, numLeaves; MothurOut* m; @@ -48,13 +49,13 @@ class ReadNewickTree : public ReadTree { public: ReadNewickTree(string file) : treeFile(file) { m->openInputFile(file, filehandle); readOk = 0; } ~ReadNewickTree() {}; - int read(TreeMap*); + int read(CountTable*); private: Tree* T; - int readNewickInt(istream&, int&, Tree*, TreeMap*); - int readTreeString(TreeMap*); - string nexusTranslation(TreeMap*); + int readNewickInt(istream&, int&, Tree*, CountTable*); + int readTreeString(CountTable*); + string nexusTranslation(CountTable*); ifstream filehandle; string treeFile; string holder; diff --git a/removegroupscommand.cpp b/removegroupscommand.cpp index 05b1170..a29906c 100644 --- a/removegroupscommand.cpp +++ b/removegroupscommand.cpp @@ -18,9 +18,9 @@ vector RemoveGroupsCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "FNGLT",false,false); parameters.push_back(pfasta); CommandParameter pshared("shared", "InputTypes", "", "", "none", "sharedGroup", "none",false,false); parameters.push_back(pshared); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "sharedGroup", "FNGLT",false,false); parameters.push_back(pgroup); - CommandParameter pdesign("design", "InputTypes", "", "", "none", "sharedGroup", "FNGLT",false,false); parameters.push_back(pdesign); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", 
"InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "sharedGroup", "FNGLT",false,false); parameters.push_back(pgroup); CommandParameter pdesign("design", "InputTypes", "", "", "none", "sharedGroup", "FNGLT",false,false); parameters.push_back(pdesign); CommandParameter plist("list", "InputTypes", "", "", "none", "none", "FNGLT",false,false); parameters.push_back(plist); CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FNGLT",false,false); parameters.push_back(ptaxonomy); CommandParameter paccnos("accnos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(paccnos); @@ -41,9 +41,9 @@ vector RemoveGroupsCommand::setParameters(){ string RemoveGroupsCommand::getHelpString(){ try { string helpString = ""; - helpString += "The remove.groups command removes sequences from a specfic group or set of groups from the following file types: fasta, name, group, list, taxonomy, design or sharedfile.\n"; + helpString += "The remove.groups command removes sequences from a specfic group or set of groups from the following file types: fasta, name, group, count, list, taxonomy, design or sharedfile.\n"; helpString += "It outputs a file containing the sequences NOT in the those specified groups, or with a sharedfile eliminates the groups you selected.\n"; - helpString += "The remove.groups command parameters are accnos, fasta, name, group, list, taxonomy, shared, design and groups. The group parameter is required, unless you have a current group file or are using a sharedfile.\n"; + helpString += "The remove.groups command parameters are accnos, fasta, name, group, list, taxonomy, shared, design and groups. 
The group or count parameter is required, unless you have a current group or count file or are using a sharedfile.\n"; helpString += "You must also provide an accnos containing the list of groups to remove or set the groups parameter to the groups you wish to remove.\n"; helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like removed. You can separate group names with dashes.\n"; helpString += "The remove.groups command should be in the following format: remove.groups(accnos=yourAccnos, fasta=yourFasta, group=yourGroupFile).\n"; @@ -71,6 +71,7 @@ string RemoveGroupsCommand::getOutputFileNameTag(string type, string inputName=" else if (type == "taxonomy") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "name") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "group") { outputFileName = "pick" + m->getExtension(inputName); } + else if (type == "count") { outputFileName = "pick.count_table"; } else if (type == "list") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "shared") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "design") { outputFileName = "pick" + m->getExtension(inputName); } @@ -96,6 +97,7 @@ RemoveGroupsCommand::RemoveGroupsCommand(){ outputTypes["list"] = tempOutNames; outputTypes["shared"] = tempOutNames; outputTypes["design"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "RemoveGroupsCommand", "RemoveGroupsCommand"); @@ -134,6 +136,7 @@ RemoveGroupsCommand::RemoveGroupsCommand(string option) { outputTypes["list"] = tempOutNames; outputTypes["shared"] = tempOutNames; outputTypes["design"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the output directory command factory will send this info to us in the output parameter @@ -207,6 +210,14 @@ RemoveGroupsCommand::RemoveGroupsCommand(string option) { //if the user has 
not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["design"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -258,12 +269,22 @@ RemoveGroupsCommand::RemoveGroupsCommand(string option) { else if (sharedfile == "not found") { sharedfile = ""; } else { m->setSharedFile(sharedfile); } - groupfile = validParameter.validFile(parameters, "group", true); - if (groupfile == "not open") { groupfile = ""; abort = true; } - else if (groupfile == "not found") { groupfile = ""; } - else { m->setGroupFile(groupfile); } - if ((sharedfile == "") && (groupfile == "") && (designfile == "")) { + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + + + if ((sharedfile == "") && (groupfile == "") && (designfile == "") && (countfile == "")) { //is there are current file available for any of these? 
if ((namefile != "") || (fastafile != "") || (listfile != "") || (taxfile != "")) { //give priority to group, then shared @@ -273,7 +294,11 @@ RemoveGroupsCommand::RemoveGroupsCommand(string option) { sharedfile = m->getSharedFile(); if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); } else { - m->mothurOut("You have no current groupfile or sharedfile and one is required."); m->mothurOutEndLine(); abort = true; + countfile = m->getCountTableFile(); + if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current groupfile, countfile or sharedfile and one is required."); m->mothurOutEndLine(); abort = true; + } } } }else { @@ -287,7 +312,12 @@ RemoveGroupsCommand::RemoveGroupsCommand(string option) { designfile = m->getDesignFile(); if (designfile != "") { m->mothurOut("Using " + designfile + " as input file for the design parameter."); m->mothurOutEndLine(); } else { - m->mothurOut("You have no current groupfile or sharedfile or designfile and one is required."); m->mothurOutEndLine(); abort = true; + countfile = m->getCountTableFile(); + if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current groupfile, designfile, countfile or sharedfile and one is required."); m->mothurOutEndLine(); abort = true; + } + } } } @@ -296,14 +326,15 @@ RemoveGroupsCommand::RemoveGroupsCommand(string option) { if ((accnosfile == "") && (Groups.size() == 0)) { m->mothurOut("You must provide an accnos file containing group names or specify groups using the groups parameter."); m->mothurOutEndLine(); abort = true; } - if ((fastafile == "") && (namefile == "") && (groupfile == "") && (sharedfile == "") && (designfile == "") && (listfile == "") && (taxfile == "")) { m->mothurOut("You must 
provide at least one of the following: fasta, name, taxonomy, group, shared, design or list."); m->mothurOutEndLine(); abort = true; } - if ((groupfile == "") && ((namefile != "") || (fastafile != "") || (listfile != "") || (taxfile != ""))) { m->mothurOut("If using a fasta, name, taxonomy, group or list, then you must provide a group file."); m->mothurOutEndLine(); abort = true; } - - if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){ - vector files; files.push_back(fastafile); files.push_back(taxfile); - parser.getNameFile(files); - } - + if ((fastafile == "") && (namefile == "") && (countfile == "") && (groupfile == "") && (designfile == "") && (sharedfile == "") && (listfile == "") && (taxfile == "")) { m->mothurOut("You must provide at least one of the following: fasta, name, taxonomy, group, shared, design, count or list."); m->mothurOutEndLine(); abort = true; } + if (((groupfile == "") && (countfile == "")) && ((namefile != "") || (fastafile != "") || (listfile != "") || (taxfile != ""))) { m->mothurOut("If using a fasta, name, taxonomy, group or list, then you must provide a group or count file."); m->mothurOutEndLine(); abort = true; } + + if (countfile == "") { + if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){ + vector files; files.push_back(fastafile); files.push_back(taxfile); + parser.getNameFile(files); + } + } } } @@ -337,7 +368,28 @@ int RemoveGroupsCommand::execute(){ fillNames(); delete groupMap; - } + }else if (countfile != ""){ + if ((fastafile != "") || (listfile != "") || (taxfile != "")) { + m->mothurOut("\n[NOTE]: The count file should contain only unique names, so mothur assumes your fasta, list and taxonomy files also contain only uniques.\n\n"); + } + CountTable ct; + ct.readTable(countfile); + if (!ct.hasGroupInfo()) { m->mothurOut("[ERROR]: your count file does not contain group info, aborting.\n"); return 0; } + + vector gNamesOfGroups = ct.getNamesOfGroups(); + SharedUtil util; + util.setGroups(Groups, 
gNamesOfGroups); + vector namesOfSeqs = ct.getNamesOfSeqs(); + sort(Groups.begin(), Groups.end()); + + for (int i = 0; i < namesOfSeqs.size(); i++) { + vector thisSeqsGroups = ct.getGroups(namesOfSeqs[i]); + if (m->isSubset(Groups, thisSeqsGroups)) { //you only have seqs from these groups so remove you + names.insert(namesOfSeqs[i]); + } + } + } + if (m->control_pressed) { return 0; } @@ -345,6 +397,7 @@ int RemoveGroupsCommand::execute(){ if (namefile != "") { readName(); } if (fastafile != "") { readFasta(); } if (groupfile != "") { readGroup(); } + if (countfile != "") { readCount(); } if (listfile != "") { readList(); } if (taxfile != "") { readTax(); } if (sharedfile != "") { readShared(); } @@ -394,6 +447,11 @@ int RemoveGroupsCommand::execute(){ if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setDesignFile(current); } } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } } return 0; @@ -762,6 +820,87 @@ int RemoveGroupsCommand::readGroup(){ } } //********************************************************************************************************************** +int RemoveGroupsCommand::readCount(){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(countfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile); + + ofstream out; + m->openOutputFile(outputFileName, out); + + ifstream in; + m->openInputFile(countfile, in); + + bool wroteSomething = false; + int removedCount = 0; + + string headers = m->getline(in); m->gobble(in); + vector columnHeaders = m->splitWhiteSpace(headers); + + vector groups; + map originalGroupIndexes; + map GroupIndexes; + set indexOfGroupsChosen; + for (int i = 2; i < columnHeaders.size(); i++) { 
groups.push_back(columnHeaders[i]); originalGroupIndexes[i-2] = columnHeaders[i]; } + //sort groups to keep consistent with how we store the groups in groupmap + sort(groups.begin(), groups.end()); + for (int i = 0; i < groups.size(); i++) { GroupIndexes[groups[i]] = i; } + + vector groupsToKeep; + for (int i = 0; i < groups.size(); i++) { + if (!m->inUsersGroups(groups[i], Groups)) { groupsToKeep.push_back(groups[i]); } + } + sort(groupsToKeep.begin(), groupsToKeep.end()); + out << "Representative_Sequence\ttotal\t"; + for (int i = 0; i < groupsToKeep.size(); i++) { out << groupsToKeep[i] << '\t'; indexOfGroupsChosen.insert(GroupIndexes[groupsToKeep[i]]); } + out << endl; + + string name; int oldTotal; + while (!in.eof()) { + + if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; } + + in >> name; m->gobble(in); in >> oldTotal; m->gobble(in); + if (m->debug) { m->mothurOut("[DEBUG]: " + name + '\t' + toString(oldTotal) + "\n"); } + + if (names.count(name) == 0) { + //if group info, then read it + vector selectedCounts; int thisTotal = 0; int temp; + for (int i = 0; i < groups.size(); i++) { + int thisIndex = GroupIndexes[originalGroupIndexes[i]]; + in >> temp; m->gobble(in); + if (indexOfGroupsChosen.count(thisIndex) != 0) { //we want this group + selectedCounts.push_back(temp); thisTotal += temp; + } + } + + out << name << '\t' << thisTotal << '\t'; + for (int i = 0; i < selectedCounts.size(); i++) { out << selectedCounts[i] << '\t'; } + out << endl; + + wroteSomething = true; + removedCount+= (oldTotal - thisTotal); + }else { m->getline(in); removedCount += oldTotal; } + + m->gobble(in); + } + in.close(); + out.close(); + + if (wroteSomething == false) { m->mothurOut("Your file does NOT contain sequences from the groups you wish to get."); m->mothurOutEndLine(); } + outputTypes["count"].push_back(outputFileName); outputNames.push_back(outputFileName); + + m->mothurOut("Removed " + toString(removedCount) + " sequences from 
your count file."); m->mothurOutEndLine(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "RemoveGroupsCommand", "readCount"); + exit(1); + } +} +//********************************************************************************************************************** int RemoveGroupsCommand::readDesign(){ try { string thisOutputDir = outputDir; diff --git a/removegroupscommand.h b/removegroupscommand.h index c6db380..c36998a 100644 --- a/removegroupscommand.h +++ b/removegroupscommand.h @@ -36,7 +36,7 @@ public: private: set names; - string accnosfile, fastafile, namefile, groupfile, designfile, listfile, taxfile, outputDir, groups, sharedfile; + string accnosfile, fastafile, namefile, groupfile, countfile, designfile, listfile, taxfile, outputDir, groups, sharedfile; bool abort; vector outputNames, Groups; GroupMap* groupMap; @@ -49,6 +49,7 @@ private: int readShared(); int readName(); int readGroup(); + int readCount(); int readList(); int readTax(); int fillNames(); diff --git a/removelineagecommand.cpp b/removelineagecommand.cpp index 4cec90f..2b930b5 100644 --- a/removelineagecommand.cpp +++ b/removelineagecommand.cpp @@ -10,13 +10,15 @@ #include "removelineagecommand.h" #include "sequence.hpp" #include "listvector.hpp" +#include "counttable.h" //********************************************************************************************************************** vector RemoveLineageCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "FNGLT", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", 
"", "", "NameCount-CountGroup", "FNGLT", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "FNGLT", "none",false,false); parameters.push_back(pgroup); CommandParameter plist("list", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(plist); CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "FNGLT", "none",false,true); parameters.push_back(ptaxonomy); CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(palignreport); @@ -38,9 +40,9 @@ vector RemoveLineageCommand::setParameters(){ string RemoveLineageCommand::getHelpString(){ try { string helpString = ""; - helpString += "The remove.lineage command reads a taxonomy file and any of the following file types: fasta, name, group, list or alignreport file.\n"; + helpString += "The remove.lineage command reads a taxonomy file and any of the following file types: fasta, name, group, count, list or alignreport file.\n"; helpString += "It outputs a file containing only the sequences from the taxonomy file that are not from the taxon you requested to be removed.\n"; - helpString += "The remove.lineage command parameters are taxon, fasta, name, group, list, taxonomy, alignreport and dups. You must provide taxonomy unless you have a valid current taxonomy file.\n"; + helpString += "The remove.lineage command parameters are taxon, fasta, name, group, list, taxonomy, count, alignreport and dups. You must provide taxonomy unless you have a valid current taxonomy file.\n"; helpString += "The dups parameter allows you to add the entire line from a name file if you add any name from the line. default=false. 
\n"; helpString += "The taxon parameter allows you to select the taxons you would like to remove, and is required.\n"; helpString += "You may enter your taxons with confidence scores, doing so will remove only those sequences that belong to the taxonomy and whose cofidence scores fall below the scores you give.\n"; @@ -72,6 +74,7 @@ string RemoveLineageCommand::getOutputFileNameTag(string type, string inputName= else if (type == "name") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "group") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "list") { outputFileName = "pick" + m->getExtension(inputName); } + else if (type == "count") { outputFileName = "pick.count_table"; } else if (type == "alignreport") { outputFileName = "pick.align.report"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } @@ -94,6 +97,7 @@ RemoveLineageCommand::RemoveLineageCommand(){ outputTypes["group"] = tempOutNames; outputTypes["alignreport"] = tempOutNames; outputTypes["list"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "RemoveLineageCommand", "RemoveLineageCommand"); @@ -131,6 +135,7 @@ RemoveLineageCommand::RemoveLineageCommand(string option) { outputTypes["group"] = tempOutNames; outputTypes["alignreport"] = tempOutNames; outputTypes["list"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -187,6 +192,14 @@ RemoveLineageCommand::RemoveLineageCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. 
if (path == "") { parameters["taxonomy"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -223,6 +236,19 @@ RemoveLineageCommand::RemoveLineageCommand(string option) { else { m->mothurOut("You have no current taxonomy file and the taxonomy parameter is required."); m->mothurOutEndLine(); abort = true; } }else { m->setTaxonomyFile(taxfile); } + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + string usedDups = "true"; string temp = validParameter.validFile(parameters, "dups", false); if (temp == "not found") { @@ -240,14 +266,16 @@ RemoveLineageCommand::RemoveLineageCommand(string option) { } m->splitAtChar(taxons, listOfTaxons, '-'); - if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy or listfile."); m->mothurOutEndLine(); abort = true; } + if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (countfile == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, count, alignreport, taxonomy or listfile."); 
m->mothurOutEndLine(); abort = true; } if ((usedDups != "") && (namefile == "")) { m->mothurOut("You may only use dups with the name option."); m->mothurOutEndLine(); abort = true; } - if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){ - vector files; files.push_back(fastafile); files.push_back(taxfile); - parser.getNameFile(files); - } + if (countfile == "") { + if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){ + vector files; files.push_back(fastafile); files.push_back(taxfile); + parser.getNameFile(files); + } + } } @@ -265,6 +293,12 @@ int RemoveLineageCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } if (m->control_pressed) { return 0; } + + if (countfile != "") { + if ((fastafile != "") || (listfile != "") || (taxfile != "")) { + m->mothurOut("\n[NOTE]: The count file should contain only unique names, so mothur assumes your fasta, list and taxonomy files also contain only uniques.\n\n"); + } + } //read through the correct file and output lines you want to keep if (taxfile != "") { readTax(); } //fills the set of names to remove @@ -273,6 +307,7 @@ int RemoveLineageCommand::execute(){ if (groupfile != "") { readGroup(); } if (alignfile != "") { readAlign(); } if (listfile != "") { readList(); } + if (countfile != "") { readCount(); } if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } @@ -309,6 +344,11 @@ int RemoveLineageCommand::execute(){ if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); } } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } } return 0; @@ -511,7 +551,59 @@ int RemoveLineageCommand::readName(){ exit(1); } } - 
+//********************************************************************************************************************** +int RemoveLineageCommand::readCount(){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(countfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile); + + ofstream out; + m->openOutputFile(outputFileName, out); + + ifstream in; + m->openInputFile(countfile, in); + + bool wroteSomething = false; + + string headers = m->getline(in); m->gobble(in); + out << headers << endl; + + string name, rest; int thisTotal; + while (!in.eof()) { + + if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; } + + in >> name; m->gobble(in); + in >> thisTotal; m->gobble(in); + rest = m->getline(in); m->gobble(in); + if (m->debug) { m->mothurOut("[DEBUG]: " + name + '\t' + rest + "\n"); } + + if (names.count(name) == 0) { + out << name << '\t' << thisTotal << '\t' << rest << endl; + wroteSomething = true; + } + } + in.close(); + out.close(); + + //check for groups that have been eliminated + CountTable ct; + if (ct.testGroups(outputFileName)) { + ct.readTable(outputFileName); + ct.printTable(outputFileName); + } + + if (wroteSomething == false) { m->mothurOut("Your group file contains only sequences from " + taxons + "."); m->mothurOutEndLine(); } + outputTypes["count"].push_back(outputFileName); outputNames.push_back(outputFileName); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "RemoveLineageCommand", "readCount"); + exit(1); + } +} //********************************************************************************************************************** int RemoveLineageCommand::readGroup(){ try { @@ -594,15 +686,17 @@ int RemoveLineageCommand::readTax(){ bool remove = false; + string noQuotesTax = m->removeQuotes(tax); + for (int j = 0; j < listOfTaxons.size(); j++) { - string newtax = 
tax; + string newtax = noQuotesTax; //if the users file contains confidence scores we want to ignore them when searching for the taxons, unless the taxon has them if (!taxonsHasConfidence[j]) { - int hasConfidences = tax.find_first_of('('); + int hasConfidences = noQuotesTax.find_first_of('('); if (hasConfidences != string::npos) { - newtax = tax; + newtax = noQuotesTax; m->removeConfidences(newtax); } @@ -617,7 +711,7 @@ int RemoveLineageCommand::readTax(){ } }else{//if taxons has them and you don't them remove taxons - int hasConfidences = tax.find_first_of('('); + int hasConfidences = noQuotesTax.find_first_of('('); if (hasConfidences == string::npos) { int pos = newtax.find(noConfidenceTaxons[j]); @@ -632,10 +726,10 @@ int RemoveLineageCommand::readTax(){ }else { //both have confidences so we want to make sure the users confidences are greater then or equal to the taxons //first remove confidences from both and see if the taxonomy exists - string noNewTax = tax; - int hasConfidences = tax.find_first_of('('); + string noNewTax = noQuotesTax; + int hasConfidences = noQuotesTax.find_first_of('('); if (hasConfidences != string::npos) { - noNewTax = tax; + noNewTax = noQuotesTax; m->removeConfidences(noNewTax); } diff --git a/removelineagecommand.h b/removelineagecommand.h index a5caec8..a756d24 100644 --- a/removelineagecommand.h +++ b/removelineagecommand.h @@ -34,12 +34,13 @@ class RemoveLineageCommand : public Command { private: set names; vector outputNames, listOfTaxons; - string fastafile, namefile, groupfile, alignfile, listfile, taxfile, outputDir, taxons; + string fastafile, namefile, groupfile, alignfile, listfile, countfile, taxfile, outputDir, taxons; bool abort, dups; int readFasta(); int readName(); int readGroup(); + int readCount(); int readAlign(); int readList(); int readTax(); diff --git a/removerarecommand.cpp b/removerarecommand.cpp index 923ca72..ded26bb 100644 --- a/removerarecommand.cpp +++ b/removerarecommand.cpp @@ -20,7 +20,8 @@ vector 
RemoveRareCommand::setParameters(){ CommandParameter prabund("rabund", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(prabund); CommandParameter psabund("sabund", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(psabund); CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pshared); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pgroup); + CommandParameter pcount("count", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); CommandParameter pnseqs("nseqs", "Number", "", "0", "", "", "",false,true); parameters.push_back(pnseqs); @@ -41,7 +42,7 @@ vector RemoveRareCommand::setParameters(){ string RemoveRareCommand::getHelpString(){ try { string helpString = ""; - helpString += "The remove.rare command parameters are list, rabund, sabund, shared, group, label, groups, bygroup and nseqs.\n"; + helpString += "The remove.rare command parameters are list, rabund, sabund, shared, group, count, label, groups, bygroup and nseqs.\n"; helpString += "The remove.rare command reads one of the following file types: list, rabund, sabund or shared file. It outputs a new file after removing the rare otus.\n"; helpString += "The groups parameter allows you to specify which of the groups you would like analyzed. Default=all. You may separate group names with dashes.\n"; helpString += "The label parameter is used to analyze specific labels in your input. default=all. 
You may separate label names with dashes.\n"; @@ -72,6 +73,7 @@ string RemoveRareCommand::getOutputFileNameTag(string type, string inputName="") else if (type == "sabund") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "shared") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "group") { outputFileName = "pick" + m->getExtension(inputName); } + else if (type == "count") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "list") { outputFileName = "pick" + m->getExtension(inputName); } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } @@ -93,6 +95,7 @@ RemoveRareCommand::RemoveRareCommand(){ outputTypes["sabund"] = tempOutNames; outputTypes["list"] = tempOutNames; outputTypes["group"] = tempOutNames; + outputTypes["count"] = tempOutNames; outputTypes["shared"] = tempOutNames; } catch(exception& e) { @@ -131,6 +134,7 @@ RemoveRareCommand::RemoveRareCommand(string option) { outputTypes["list"] = tempOutNames; outputTypes["group"] = tempOutNames; outputTypes["shared"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -179,6 +183,14 @@ RemoveRareCommand::RemoveRareCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["shared"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -207,6 +219,15 @@ RemoveRareCommand::RemoveRareCommand(string option) { if (sharedfile == "not open") { sharedfile = ""; abort = true; } else if (sharedfile == "not found") { sharedfile = ""; } else { m->setSharedFile(sharedfile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } if ((sharedfile == "") && (listfile == "") && (rabundfile == "") && (sabundfile == "")) { //is there are current file available for any of these? @@ -252,7 +273,7 @@ RemoveRareCommand::RemoveRareCommand(string option) { if (byGroup && (sharedfile == "")) { m->mothurOut("The byGroup parameter is only valid with a shared file."); m->mothurOutEndLine(); } - if ((groupfile != "") && (listfile == "")) { m->mothurOut("A groupfile is only valid with a list file."); m->mothurOutEndLine(); groupfile = ""; } + if (((groupfile != "") || (countfile != "")) && (listfile == "")) { m->mothurOut("A group or count file is only valid with a list file."); m->mothurOutEndLine(); groupfile = ""; countfile = ""; } } } @@ -310,6 +331,11 @@ int RemoveRareCommand::execute(){ if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSharedFile(current); } } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } } return 0; @@ -327,7 +353,9 @@ int RemoveRareCommand::processList(){ string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(listfile); } string outputFileName = 
thisOutputDir + m->getRootName(m->getSimpleName(listfile)) + getOutputFileNameTag("list", listfile); - string outputGroupFileName = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)) + getOutputFileNameTag("group", groupfile); + string outputGroupFileName = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)) + getOutputFileNameTag("group", groupfile); + string outputCountFileName = thisOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile); + ofstream out, outGroup; m->openOutputFile(outputFileName, out); @@ -374,13 +402,21 @@ int RemoveRareCommand::processList(){ //if groupfile is given then use it GroupMap* groupMap; + CountTable ct; if (groupfile != "") { groupMap = new GroupMap(groupfile); groupMap->readMap(); SharedUtil util; vector namesGroups = groupMap->getNamesOfGroups(); util.setGroups(Groups, namesGroups); m->openOutputFile(outputGroupFileName, outGroup); - } + }else if (countfile != "") { + ct.readTable(countfile); + if (ct.hasGroupInfo()) { + vector namesGroups = ct.getNamesOfGroups(); + SharedUtil util; + util.setGroups(Groups, namesGroups); + } + } if (list != NULL) { @@ -397,6 +433,7 @@ int RemoveRareCommand::processList(){ vector names; string saveBinNames = binnames; m->splitAtComma(binnames, names); + int binsize = names.size(); vector newGroupFile; if (groupfile != "") { @@ -412,14 +449,38 @@ int RemoveRareCommand::processList(){ saveBinNames += names[k] + ","; } } - names = newNames; + names = newNames; binsize = names.size(); saveBinNames = saveBinNames.substr(0, saveBinNames.length()-1); - } + }else if (countfile != "") { + saveBinNames = ""; + binsize = 0; + for(int k = 0; k < names.size(); k++) { + if (ct.hasGroupInfo()) { + vector thisSeqsGroups = ct.getGroups(names[k]); + + int thisSeqsCount = 0; + for (int n = 0; n < thisSeqsGroups.size(); n++) { + if (m->inUsersGroups(thisSeqsGroups[n], Groups)) { + thisSeqsCount += ct.getGroupCount(names[k], thisSeqsGroups[n]); + } + } + 
binsize += thisSeqsCount; + //if you don't have any seqs from the groups the user wants, then remove you. + if (thisSeqsCount == 0) { newGroupFile.push_back(names[k]); } + else { saveBinNames += names[k] + ","; } + }else { + binsize += ct.getNumSeqs(names[k]); + saveBinNames += names[k] + ","; + } + } + saveBinNames = saveBinNames.substr(0, saveBinNames.length()-1); + } - if (names.size() > nseqs) { //keep bin + if (binsize > nseqs) { //keep bin newList.push_back(saveBinNames); - for(int k = 0; k < newGroupFile.size(); k++) { outGroup << newGroupFile[k] << endl; } - } + if (groupfile != "") { for(int k = 0; k < newGroupFile.size(); k++) { outGroup << newGroupFile[k] << endl; } } + else if (countfile != "") { for(int k = 0; k < newGroupFile.size(); k++) { ct.remove(newGroupFile[k]); } } + }else { if (countfile != "") { for(int k = 0; k < names.size(); k++) { ct.remove(names[k]); } } } } //print new listvector @@ -431,6 +492,17 @@ int RemoveRareCommand::processList(){ out.close(); if (groupfile != "") { outGroup.close(); outputTypes["group"].push_back(outputGroupFileName); outputNames.push_back(outputGroupFileName); } + if (countfile != "") { + if (ct.hasGroupInfo()) { + vector allGroups = ct.getNamesOfGroups(); + for (int i = 0; i < allGroups.size(); i++) { + if (!m->inUsersGroups(allGroups[i], Groups)) { ct.removeGroup(allGroups[i]); } + } + + } + ct.printTable(outputCountFileName); + outputTypes["count"].push_back(outputCountFileName); outputNames.push_back(outputCountFileName); + } if (wroteSomething == false) { m->mothurOut("Your file contains only rare sequences."); m->mothurOutEndLine(); } outputTypes["list"].push_back(outputFileName); outputNames.push_back(outputFileName); diff --git a/removerarecommand.h b/removerarecommand.h index 2d70ba7..7b4c6fb 100644 --- a/removerarecommand.h +++ b/removerarecommand.h @@ -36,7 +36,7 @@ public: void help() { m->mothurOut(getHelpString()); } private: - string sabundfile, rabundfile, sharedfile, groupfile, listfile, 
outputDir, groups, label; + string sabundfile, rabundfile, sharedfile, groupfile, countfile, listfile, outputDir, groups, label; int nseqs, allLines; bool abort, byGroup; vector outputNames, Groups; diff --git a/removeseqscommand.cpp b/removeseqscommand.cpp index 0d53c1a..00b94a9 100644 --- a/removeseqscommand.cpp +++ b/removeseqscommand.cpp @@ -10,13 +10,15 @@ #include "removeseqscommand.h" #include "sequence.hpp" #include "listvector.hpp" +#include "counttable.h" //********************************************************************************************************************** vector RemoveSeqsCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "FNGLT", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "FNGLT", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "FNGLT", "none",false,false); parameters.push_back(pgroup); CommandParameter plist("list", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(plist); CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(ptaxonomy); CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(palignreport); @@ -39,9 +41,9 @@ vector RemoveSeqsCommand::setParameters(){ string RemoveSeqsCommand::getHelpString(){ try { string helpString = ""; - helpString += "The remove.seqs command reads an .accnos file 
and at least one of the following file types: fasta, name, group, list, taxonomy, quality or alignreport file.\n"; + helpString += "The remove.seqs command reads an .accnos file and at least one of the following file types: fasta, name, group, count, list, taxonomy, quality or alignreport file.\n"; helpString += "It outputs a file containing the sequences NOT in the .accnos file.\n"; - helpString += "The remove.seqs command parameters are accnos, fasta, name, group, list, taxonomy, qfile, alignreport and dups. You must provide accnos and at least one of the file parameters.\n"; + helpString += "The remove.seqs command parameters are accnos, fasta, name, group, count, list, taxonomy, qfile, alignreport and dups. You must provide accnos and at least one of the file parameters.\n"; helpString += "The dups parameter allows you to remove the entire line from a name file if you remove any name from the line. default=true. \n"; helpString += "The remove.seqs command should be in the following format: remove.seqs(accnos=yourAccnos, fasta=yourFasta).\n"; helpString += "Example remove.seqs(accnos=amazon.accnos, fasta=amazon.fasta).\n"; @@ -70,6 +72,7 @@ string RemoveSeqsCommand::getOutputFileNameTag(string type, string inputName="") else if (type == "list") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "qfile") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "alignreport") { outputFileName = "pick.align.report"; } + else if (type == "count") { outputFileName = "pick.count_table"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } return outputFileName; @@ -93,6 +96,7 @@ RemoveSeqsCommand::RemoveSeqsCommand(){ outputTypes["alignreport"] = tempOutNames; outputTypes["list"] = tempOutNames; outputTypes["qfile"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "RemoveSeqsCommand", "RemoveSeqsCommand"); @@ -131,6 
+135,7 @@ RemoveSeqsCommand::RemoveSeqsCommand(string option) { outputTypes["alignreport"] = tempOutNames; outputTypes["list"] = tempOutNames; outputTypes["qfile"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -203,6 +208,14 @@ RemoveSeqsCommand::RemoveSeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["qfile"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -259,13 +272,28 @@ RemoveSeqsCommand::RemoveSeqsCommand(string option) { else { temp = "false"; usedDups = ""; } } dups = m->isTrue(temp); - - if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (qualfile == "")) { m->mothurOut("You must provide at least one of the following: fasta, name, group, taxonomy, quality, alignreport or list."); m->mothurOutEndLine(); abort = true; } - - if ((fastafile != "") && (namefile == "")) { - vector files; files.push_back(fastafile); - parser.getNameFile(files); - } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + 
m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + + if ((countfile == "") && (fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (qualfile == "")) { m->mothurOut("You must provide at least one of the following: fasta, name, group, taxonomy, quality, alignreport or list."); m->mothurOutEndLine(); abort = true; } + + if (countfile == "") { + if ((fastafile != "") && (namefile == "")) { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } } @@ -285,6 +313,12 @@ int RemoveSeqsCommand::execute(){ names = m->readAccnos(accnosfile); if (m->control_pressed) { return 0; } + + if (countfile != "") { + if ((fastafile != "") || (listfile != "") || (taxfile != "")) { + m->mothurOut("\n[NOTE]: The count file should contain only unique names, so mothur assumes your fasta, list and taxonomy files also contain only uniques.\n\n"); + } + } //read through the correct file and output lines you want to keep if (namefile != "") { readName(); } @@ -294,6 +328,7 @@ int RemoveSeqsCommand::execute(){ if (listfile != "") { readList(); } if (taxfile != "") { readTax(); } if (qualfile != "") { readQual(); } + if (countfile != "") { readCount(); } if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } @@ -333,7 +368,12 @@ int RemoveSeqsCommand::execute(){ itTypes = outputTypes.find("qfile"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); } - } + } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } } return 0; @@ -366,6 +406,12 @@ int RemoveSeqsCommand::readFasta(){ if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); 
return 0; } Sequence currSeq(in); + + if (!dups) {//adjust name if needed + map::iterator it = uniqueMap.find(currSeq.getName()); + if (it != uniqueMap.end()) { currSeq.setName(it->second); } + } + name = currSeq.getName(); if (name != "") { @@ -373,7 +419,7 @@ int RemoveSeqsCommand::readFasta(){ if (names.count(name) == 0) { wroteSomething = true; - currSeq.printSequence(out); + currSeq.printSequence(out); }else { removedCount++; } } m->gobble(in); @@ -437,6 +483,11 @@ int RemoveSeqsCommand::readQual(){ m->gobble(in); + if (!dups) {//adjust name if needed + map::iterator it = uniqueMap.find(saveName); + if (it != uniqueMap.end()) { name = ">" + it->second; saveName = it->second; } + } + if (names.count(saveName) == 0) { wroteSomething = true; @@ -463,6 +514,64 @@ int RemoveSeqsCommand::readQual(){ } } //********************************************************************************************************************** +int RemoveSeqsCommand::readCount(){ + try { + + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(countfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile); + + ofstream out; + m->openOutputFile(outputFileName, out); + + ifstream in; + m->openInputFile(countfile, in); + + bool wroteSomething = false; + int removedCount = 0; + + string headers = m->getline(in); m->gobble(in); + out << headers << endl; + + string name, rest; int thisTotal; + while (!in.eof()) { + + if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; } + + in >> name; m->gobble(in); + in >> thisTotal; m->gobble(in); + rest = m->getline(in); m->gobble(in); + if (m->debug) { m->mothurOut("[DEBUG]: " + name + '\t' + rest + "\n"); } + + if (names.count(name) == 0) { + out << name << '\t' << thisTotal << '\t' << rest << endl; + wroteSomething = true; + }else { removedCount += thisTotal; } + } + in.close(); + out.close(); + + 
//check for groups that have been eliminated + CountTable ct; + if (ct.testGroups(outputFileName)) { + ct.readTable(outputFileName); + ct.printTable(outputFileName); + } + + + if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); } + outputTypes["count"].push_back(outputFileName); outputNames.push_back(outputFileName); + + m->mothurOut("Removed " + toString(removedCount) + " sequences from your count file."); m->mothurOutEndLine(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "RemoveSeqsCommand", "readCount"); + exit(1); + } +} +//********************************************************************************************************************** int RemoveSeqsCommand::readList(){ try { string thisOutputDir = outputDir; @@ -597,6 +706,8 @@ int RemoveSeqsCommand::readName(){ wroteSomething = true; out << validSecond[0] << '\t'; + //we are changing the unique name in the fasta file + uniqueMap[firstCol] = validSecond[0]; //you know you have at least one valid second since first column is valid for (int i = 0; i < validSecond.size()-1; i++) { out << validSecond[i] << ','; } @@ -690,9 +801,15 @@ int RemoveSeqsCommand::readTax(){ in >> name; //read from first column in >> tax; //read from second column + if (!dups) {//adjust name if needed + map::iterator it = uniqueMap.find(name); + if (it != uniqueMap.end()) { name = it->second; } + } + //if this name is in the accnos file if (names.count(name) == 0) { wroteSomething = true; + out << name << '\t' << tax << endl; }else { removedCount++; } @@ -742,6 +859,11 @@ int RemoveSeqsCommand::readAlign(){ if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; } in >> name; //read from first column + + if (!dups) {//adjust name if needed + map::iterator it = uniqueMap.find(name); + if (it != uniqueMap.end()) { name = it->second; } + } //if this name is in the accnos file if (names.count(name) == 0) { diff 
--git a/removeseqscommand.h b/removeseqscommand.h index 474951a..e26e751 100644 --- a/removeseqscommand.h +++ b/removeseqscommand.h @@ -34,13 +34,15 @@ class RemoveSeqsCommand : public Command { private: set names; - string accnosfile, fastafile, namefile, groupfile, alignfile, listfile, taxfile, qualfile, outputDir; + string accnosfile, fastafile, namefile, groupfile, countfile, alignfile, listfile, taxfile, qualfile, outputDir; bool abort, dups; vector outputNames; + map uniqueMap; int readFasta(); int readName(); int readGroup(); + int readCount(); int readAlign(); int readList(); int readTax(); diff --git a/rftreenode.cpp b/rftreenode.cpp new file mode 100644 index 0000000..170cfb1 --- /dev/null +++ b/rftreenode.cpp @@ -0,0 +1,92 @@ +// +// rftreenode.cpp +// Mothur +// +// Created by Sarah Westcott on 10/2/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. +// + +#include "rftreenode.hpp" + +/***********************************************************************/ +RFTreeNode::RFTreeNode(vector< vector > bootstrappedTrainingSamples, + vector globalDiscardedFeatureIndices, + int numFeatures, + int numSamples, + int numOutputClasses, + int generation) + +: bootstrappedTrainingSamples(bootstrappedTrainingSamples), +globalDiscardedFeatureIndices(globalDiscardedFeatureIndices), +numFeatures(numFeatures), +numSamples(numSamples), +numOutputClasses(numOutputClasses), +generation(generation), +isLeaf(false), +outputClass(-1), +splitFeatureIndex(-1), +splitFeatureValue(-1), +splitFeatureEntropy(-1.0), +ownEntropy(-1.0), +bootstrappedFeatureVectors(numFeatures, vector(numSamples, 0)), +bootstrappedOutputVector(numSamples, 0), +leftChildNode(NULL), +rightChildNode(NULL), +parentNode(NULL) { + m = MothurOut::getInstance(); + + for (int i = 0; i < numSamples; i++) { // just doing a simple transpose of the matrix + if (m->control_pressed) { break; } + for (int j = 0; j < numFeatures; j++) { bootstrappedFeatureVectors[j][i] = bootstrappedTrainingSamples[i][j]; } + 
} + + for (int i = 0; i < numSamples; i++) { if (m->control_pressed) { break; } bootstrappedOutputVector[i] = bootstrappedTrainingSamples[i][numFeatures]; } + + createLocalDiscardedFeatureList(); + updateNodeEntropy(); +} +/***********************************************************************/ +int RFTreeNode::createLocalDiscardedFeatureList(){ + try { + + for (int i = 0; i < numFeatures; i++) { + if (m->control_pressed) { return 0; } + vector::iterator it = find(globalDiscardedFeatureIndices.begin(), globalDiscardedFeatureIndices.end(), i); + if (it == globalDiscardedFeatureIndices.end()){ // NOT FOUND + double standardDeviation = m->getStandardDeviation(bootstrappedFeatureVectors[i]); + if (standardDeviation <= 0){ localDiscardedFeatureIndices.push_back(i); } + } + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "RFTreeNode", "createLocalDiscardedFeatureList"); + exit(1); + } +} +/***********************************************************************/ +int RFTreeNode::updateNodeEntropy() { + try { + + vector classCounts(numOutputClasses, 0); + for (int i = 0; i < bootstrappedOutputVector.size(); i++) { classCounts[bootstrappedOutputVector[i]]++; } + int totalClassCounts = accumulate(classCounts.begin(), classCounts.end(), 0); + double nodeEntropy = 0.0; + for (int i = 0; i < classCounts.size(); i++) { + if (m->control_pressed) { return 0; } + if (classCounts[i] == 0) continue; + double probability = (double)classCounts[i] / (double)totalClassCounts; + nodeEntropy += -(probability * log2(probability)); + } + ownEntropy = nodeEntropy; + + return 0; + } + catch(exception& e) { + m->errorOut(e, "RFTreeNode", "updateNodeEntropy"); + exit(1); + } +} + +/***********************************************************************/ diff --git a/rftreenode.hpp b/rftreenode.hpp new file mode 100755 index 0000000..8987ebc --- /dev/null +++ b/rftreenode.hpp @@ -0,0 +1,91 @@ +// +// rftreenode.hpp +// rrf-fs-prototype +// +// Created by Abu Zaher Faridee on 
5/29/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. +// + +#ifndef rrf_fs_prototype_treenode_hpp +#define rrf_fs_prototype_treenode_hpp + +#include "mothurout.h" +#include "macros.h" + +class RFTreeNode{ + +public: + + RFTreeNode(vector< vector > bootstrappedTrainingSamples, vector globalDiscardedFeatureIndices, int numFeatures, int numSamples, int numOutputClasses, int generation); + + virtual ~RFTreeNode(){} + + // getters + // we need to return const reference so that we have the actual value and not a copy, + // plus we do not modify the value as well + const int getSplitFeatureIndex() { return splitFeatureIndex; } + // TODO: check if this works properly or returs a shallow copy of the data + const vector< vector >& getBootstrappedTrainingSamples() { return bootstrappedTrainingSamples; } + const int getSplitFeatureValue() { return splitFeatureValue; } + const int getGeneration() { return generation; } + const bool checkIsLeaf() { return isLeaf; } + // TODO: fix this const pointer dillema + // we do not want to modify the data pointer by getLeftChildNode + RFTreeNode* getLeftChildNode() { return leftChildNode; } + RFTreeNode* getRightChildNode() { return rightChildNode; } + const int getOutputClass() { return outputClass; } + const int getNumSamples() { return numSamples; } + const int getNumFeatures() { return numFeatures; } + const vector& getLocalDiscardedFeatureIndices() { return localDiscardedFeatureIndices; } + const vector< vector >& getBootstrappedFeatureVectors() { return bootstrappedFeatureVectors; } + const vector& getBootstrappedOutputVector() { return bootstrappedOutputVector; } + const vector& getFeatureSubsetIndices() { return featureSubsetIndices; } + const double getOwnEntropy() { return ownEntropy; } + + // setters + void setIsLeaf(bool isLeaf) { this->isLeaf = isLeaf; } + void setOutputClass(int outputClass) { this->outputClass = outputClass; } + void setFeatureSubsetIndices(vector featureSubsetIndices) { 
this->featureSubsetIndices = featureSubsetIndices; } + void setLeftChildNode(RFTreeNode* leftChildNode) { this->leftChildNode = leftChildNode; } + void setRightChildNode(RFTreeNode* rightChildNode) { this->rightChildNode = rightChildNode; } + void setParentNode(RFTreeNode* parentNode) { this->parentNode = parentNode; } + void setSplitFeatureIndex(int splitFeatureIndex) { this->splitFeatureIndex = splitFeatureIndex; } + void setSplitFeatureValue(int splitFeatureValue) { this->splitFeatureValue = splitFeatureValue; } + void setSplitFeatureEntropy(double splitFeatureEntropy) { this->splitFeatureEntropy = splitFeatureEntropy; } + + // TODO: need to remove this mechanism of friend class + //NOTE: friend classes can be useful for testing purposes, but I would avoid using them otherwise. + friend class DecisionTree; + friend class AbstractDecisionTree; + +private: + vector > bootstrappedTrainingSamples; + vector globalDiscardedFeatureIndices; + vector localDiscardedFeatureIndices; + vector > bootstrappedFeatureVectors; + vector bootstrappedOutputVector; + vector featureSubsetIndices; + + int numFeatures; + int numSamples; + int numOutputClasses; + int generation; + bool isLeaf; + int outputClass; + int splitFeatureIndex; + int splitFeatureValue; + double splitFeatureEntropy; + double ownEntropy; + + RFTreeNode* leftChildNode; + RFTreeNode* rightChildNode; + RFTreeNode* parentNode; + + MothurOut* m; + + int createLocalDiscardedFeatureList(); + int updateNodeEntropy(); + +}; + +#endif diff --git a/screenseqscommand.cpp b/screenseqscommand.cpp index 6a9a613..2b5ebc1 100644 --- a/screenseqscommand.cpp +++ b/screenseqscommand.cpp @@ -8,14 +8,15 @@ */ #include "screenseqscommand.h" - +#include "counttable.h" //********************************************************************************************************************** vector ScreenSeqsCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); 
parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile); CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(palignreport); CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(ptax); @@ -44,8 +45,8 @@ vector ScreenSeqsCommand::setParameters(){ string ScreenSeqsCommand::getHelpString(){ try { string helpString = ""; - helpString += "The screen.seqs command reads a fastafile and creates .....\n"; - helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, qfile, alignreport, taxonomy, optimize, criteria and processors.\n"; + helpString += "The screen.seqs command reads a fastafile and screens sequences.\n"; + helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, count, qfile, alignreport, taxonomy, optimize, criteria and processors.\n"; helpString += "The fasta parameter is required.\n"; helpString += "The alignreport and taxonomy parameters allow you to remove bad seqs from taxonomy and alignreport files.\n"; helpString += "The start parameter is used to set a position 
the \"good\" sequences must start by. The default is -1.\n"; @@ -83,6 +84,7 @@ string ScreenSeqsCommand::getOutputFileNameTag(string type, string inputName="") if (type == "fasta") { outputFileName = "good" + m->getExtension(inputName); } else if (type == "taxonomy") { outputFileName = "good" + m->getExtension(inputName); } else if (type == "name") { outputFileName = "good" + m->getExtension(inputName); } + else if (type == "count") { outputFileName = "good" + m->getExtension(inputName); } else if (type == "group") { outputFileName = "good" + m->getExtension(inputName); } else if (type == "accnos") { outputFileName = "bad.accnos"; } else if (type == "qfile") { outputFileName = "good" + m->getExtension(inputName); } @@ -110,6 +112,7 @@ ScreenSeqsCommand::ScreenSeqsCommand(){ outputTypes["accnos"] = tempOutNames; outputTypes["qfile"] = tempOutNames; outputTypes["taxonomy"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand"); @@ -149,6 +152,7 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { outputTypes["accnos"] = tempOutNames; outputTypes["qfile"] = tempOutNames; outputTypes["taxonomy"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -202,6 +206,14 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["taxonomy"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { parameters["count"] = inputDir + it->second; } + } } //check for required parameters @@ -229,6 +241,19 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + alignreport = validParameter.validFile(parameters, "alignreport", true); if (alignreport == "not open") { abort = true; } else if (alignreport == "not found") { alignreport = ""; } @@ -288,10 +313,12 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { temp = validParameter.validFile(parameters, "criteria", false); if (temp == "not found"){ temp = "90"; } m->mothurConvert(temp, criteria); - if (namefile == "") { - vector files; files.push_back(fastafile); - parser.getNameFile(files); - } + if (countfile == "") { + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } } @@ -312,6 +339,11 @@ int ScreenSeqsCommand::execute(){ if (optimize.size() != 0) { //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here //use the namefile to optimize correctly if (namefile != "") { nameMap = m->readNames(namefile); } + else if (countfile != "") { + CountTable ct; + ct.readTable(countfile); + nameMap = ct.getNameMap(); + } getSummary(positions); } else { @@ -472,7 +504,9 @@ int ScreenSeqsCommand::execute(){ screenNameGroupFile(badSeqNames); if 
(m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; } }else if(groupfile != "") { screenGroupFile(badSeqNames); } // this screens just the group - + else if (countfile != "") { screenCountFile(badSeqNames); } + + if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; } if(alignreport != "") { screenAlignReport(badSeqNames); } @@ -519,6 +553,11 @@ int ScreenSeqsCommand::execute(){ if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); } } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); @@ -962,7 +1001,69 @@ int ScreenSeqsCommand::screenGroupFile(set badSeqNames){ exit(1); } } +//*************************************************************************************************************** +int ScreenSeqsCommand::screenCountFile(set badSeqNames){ + try { + ifstream in; + m->openInputFile(countfile, in); + set::iterator it; + + string goodCountFile = outputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile); + outputNames.push_back(goodCountFile); outputTypes["count"].push_back(goodCountFile); + ofstream goodCountOut; m->openOutputFile(goodCountFile, goodCountOut); + + string headers = m->getline(in); m->gobble(in); + goodCountOut << headers << endl; + + string name, rest; int thisTotal; + while (!in.eof()) { + if (m->control_pressed) { goodCountOut.close(); in.close(); m->mothurRemove(goodCountFile); return 0; } + + in >> name; m->gobble(in); + in >> thisTotal; m->gobble(in); + rest = m->getline(in); m->gobble(in); + + it = badSeqNames.find(name); + + if(it != badSeqNames.end()){ + badSeqNames.erase(it); + } + else{ + goodCountOut << name << '\t' << 
thisTotal << '\t' << rest << endl; + } + } + + if (m->control_pressed) { goodCountOut.close(); in.close(); m->mothurRemove(goodCountFile); return 0; } + + //we were unable to remove some of the bad sequences + if (badSeqNames.size() != 0) { + for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) { + m->mothurOut("Your count file does not include the sequence " + *it + " please correct."); + m->mothurOutEndLine(); + } + } + + in.close(); + goodCountOut.close(); + + //check for groups that have been eliminated + CountTable ct; + if (ct.testGroups(goodCountFile)) { + ct.readTable(goodCountFile); + ct.printTable(goodCountFile); + } + + if (m->control_pressed) { m->mothurRemove(goodCountFile); } + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ScreenSeqsCommand", "screenCountFile"); + exit(1); + } +} //*************************************************************************************************************** int ScreenSeqsCommand::screenAlignReport(set badSeqNames){ diff --git a/screenseqscommand.h b/screenseqscommand.h index 771113d..b0d7c7c 100644 --- a/screenseqscommand.h +++ b/screenseqscommand.h @@ -44,6 +44,7 @@ private: int screenNameGroupFile(set); int screenGroupFile(set); + int screenCountFile(set); int screenAlignReport(set); int screenQual(set); int screenTaxonomy(set); @@ -56,7 +57,7 @@ private: #endif bool abort; - string fastafile, namefile, groupfile, alignreport, outputDir, qualfile, taxonomy; + string fastafile, namefile, groupfile, alignreport, outputDir, qualfile, taxonomy, countfile; int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, processors, criteria; vector outputNames; vector optimize; diff --git a/secondarystructurecommand.cpp b/secondarystructurecommand.cpp index ee50ab1..4d04270 100644 --- a/secondarystructurecommand.cpp +++ b/secondarystructurecommand.cpp @@ -9,13 +9,16 @@ #include "secondarystructurecommand.h" #include "sequence.hpp" +#include "counttable.h" 
//********************************************************************************************************************** vector AlignCheckCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); CommandParameter pmap("map", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pmap); - CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); + CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); vector myArray; @@ -31,7 +34,7 @@ vector AlignCheckCommand::setParameters(){ string AlignCheckCommand::getHelpString(){ try { string helpString = ""; - helpString += "The align.check command reads a fasta file and map file.\n"; + helpString += "The align.check command reads a fasta file and map file as well as an optional name or count file.\n"; helpString += "It outputs a file containing the secondary structure matches in the .align.check file.\n"; helpString += "The align.check command parameters are fasta and map, both are required.\n"; helpString += "The align.check command should be in the following format: align.check(fasta=yourFasta, map=yourMap).\n"; @@ -134,6 +137,14 @@ AlignCheckCommand::AlignCheckCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. 
if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } //check for required parameters @@ -154,16 +165,25 @@ AlignCheckCommand::AlignCheckCommand(string option) { else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; countfile = ""; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((countfile != "") && (namefile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } + //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it } - if ((namefile == "") && (fastafile != "")){ - vector files; files.push_back(fastafile); - parser.getNameFile(files); - } + if (countfile == "") { + if ((namefile == "") && (fastafile != "")){ + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } } @@ -183,6 +203,11 @@ int AlignCheckCommand::execute(){ readMap(); if (namefile != "") { nameMap = m->readNames(namefile); } + else if (countfile != "") { + CountTable ct; + ct.readTable(countfile); + nameMap = ct.getNameMap(); + } if (m->control_pressed) { return 0; } @@ -216,7 +241,7 @@ int AlignCheckCommand::execute(){ if (haderror == 1) { m->control_pressed = true; break; } int num = 1; - if (namefile != "") { + if ((namefile != "") || (countfile != 
"")) { //make sure this sequence is in the namefile, else error map::iterator it = nameMap.find(seq.getName()); @@ -273,7 +298,7 @@ int AlignCheckCommand::execute(){ m->mothurOut("75%-tile:\t" + toString(pound[ptile75]) + "\t" + toString(dash[ptile75]) + "\t" + toString(plus[ptile75]) + "\t" + toString(equal[ptile75]) + "\t" + toString(loop[ptile75]) + "\t" + toString(tilde[ptile75]) + "\t" + toString(total[ptile75])); m->mothurOutEndLine(); m->mothurOut("97.5%-tile:\t" + toString(pound[ptile97_5]) + "\t" + toString(dash[ptile97_5]) + "\t" + toString(plus[ptile97_5]) + "\t" + toString(equal[ptile97_5]) + "\t" + toString(loop[ptile97_5]) + "\t" + toString(tilde[ptile97_5]) + "\t" + toString(total[ptile97_5])); m->mothurOutEndLine(); m->mothurOut("Maximum:\t" + toString(pound[ptile100]) + "\t" + toString(dash[ptile100]) + "\t" + toString(plus[ptile100]) + "\t" + toString(equal[ptile100]) + "\t" + toString(loop[ptile100]) + "\t" + toString(tilde[ptile100]) + "\t" + toString(total[ptile100])); m->mothurOutEndLine(); - if (namefile == "") { m->mothurOut("# of Seqs:\t" + toString(count)); m->mothurOutEndLine(); } + if ((namefile == "") && (countfile == "")) { m->mothurOut("# of Seqs:\t" + toString(count)); m->mothurOutEndLine(); } else { m->mothurOut("# of unique seqs:\t" + toString(count)); m->mothurOutEndLine(); m->mothurOut("total # of seqs:\t" + toString(size)); m->mothurOutEndLine(); } diff --git a/secondarystructurecommand.h b/secondarystructurecommand.h index 110f019..becafc5 100644 --- a/secondarystructurecommand.h +++ b/secondarystructurecommand.h @@ -50,7 +50,7 @@ class AlignCheckCommand : public Command { private: vector structMap; - string mapfile, fastafile, outputDir, namefile; + string mapfile, fastafile, outputDir, namefile, countfile; bool abort; int seqLength, haderror; vector outputNames; diff --git a/sensspeccommand.cpp b/sensspeccommand.cpp index cfa1f5b..b62bb00 100644 --- a/sensspeccommand.cpp +++ b/sensspeccommand.cpp @@ -14,7 +14,6 @@ vector 
SensSpecCommand::setParameters(){ try { CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist); CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pphylip); - //CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname); CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pcolumn); CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); CommandParameter pcutoff("cutoff", "Number", "", "-1.00", "", "", "",false,false); parameters.push_back(pcutoff); @@ -136,16 +135,7 @@ SensSpecCommand::SensSpecCommand(string option) { path = m->hasPath(it->second); //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["column"] = inputDir + it->second; } - } - - //it = parameters.find("name"); - //user has given a template file - //if(it != parameters.end()){ - //path = m->hasPath(it->second); - //if the user has not given a path then, add inputdir. else leave path alone. 
- //if (path == "") { parameters["name"] = inputDir + it->second; } - //} - + } } //check for required parameters listFile = validParameter.validFile(parameters, "list", true); @@ -196,12 +186,6 @@ SensSpecCommand::SensSpecCommand(string option) { else if(!m->isTrue(temp)) { hard = 0; } else if(m->isTrue(temp)) { hard = 1; } -// temp = validParameter.validFile(parameters, "name", true); -// if (temp == "not found") { nameFile = ""; } -// else if(temp == "not open") { abort = true; } -// else { nameFile = temp; } -// cout << "name:\t" << nameFile << endl; - temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "-1.00"; } m->mothurConvert(temp, cutoff); // cout << cutoff << endl; diff --git a/seqsummarycommand.cpp b/seqsummarycommand.cpp index c328a04..a9bb573 100644 --- a/seqsummarycommand.cpp +++ b/seqsummarycommand.cpp @@ -8,13 +8,14 @@ */ #include "seqsummarycommand.h" - +#include "counttable.h" //********************************************************************************************************************** vector SeqSummaryCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -33,8 +34,9 @@ string 
SeqSummaryCommand::getHelpString(){ try { string helpString = ""; helpString += "The summary.seqs command reads a fastafile and summarizes the sequences.\n"; - helpString += "The summary.seqs command parameters are fasta, name and processors, fasta is required, unless you have a valid current fasta file.\n"; + helpString += "The summary.seqs command parameters are fasta, name, count and processors, fasta is required, unless you have a valid current fasta file.\n"; helpString += "The name parameter allows you to enter a name file associated with your fasta file. \n"; + helpString += "The count parameter allows you to enter a count file associated with your fasta file. \n"; helpString += "The summary.seqs command should be in the following format: \n"; helpString += "summary.seqs(fasta=yourFastaFile, processors=2) \n"; helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n"; @@ -123,6 +125,14 @@ SeqSummaryCommand::SeqSummaryCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { parameters["count"] = inputDir + it->second; } + } } //initialize outputTypes @@ -142,6 +152,13 @@ SeqSummaryCommand::SeqSummaryCommand(string option) { if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; countfile = ""; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((countfile != "") && (namefile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ @@ -153,11 +170,12 @@ SeqSummaryCommand::SeqSummaryCommand(string option) { m->setProcessors(temp); m->mothurConvert(temp, processors); - if (namefile == "") { - vector files; files.push_back(fastafile); - parser.getNameFile(files); - } - + if (countfile == "") { + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } } catch(exception& e) { @@ -186,6 +204,11 @@ int SeqSummaryCommand::execute(){ vector longHomoPolymer; if (namefile != "") { nameMap = m->readNames(namefile); } + else if (countfile != "") { + CountTable ct; + ct.readTable(countfile); + nameMap = ct.getNameMap(); + } if (m->control_pressed) { return 0; } @@ -344,7 +367,7 @@ int SeqSummaryCommand::execute(){ int size = startPosition.size(); //find means - float meanStartPosition, meanEndPosition, meanSeqLength, meanAmbigBases, meanLongHomoPolymer; + double meanStartPosition, meanEndPosition, meanSeqLength, meanAmbigBases, meanLongHomoPolymer; meanStartPosition = 0; meanEndPosition = 0; meanSeqLength = 0; meanAmbigBases = 0; 
meanLongHomoPolymer = 0; for (int i = 0; i < size; i++) { meanStartPosition += startPosition[i]; @@ -353,6 +376,7 @@ int SeqSummaryCommand::execute(){ meanAmbigBases += ambigBases[i]; meanLongHomoPolymer += longHomoPolymer[i]; } + //this is an int divide so the remainder is lost meanStartPosition /= (float) size; meanEndPosition /= (float) size; meanLongHomoPolymer /= (float) size; meanSeqLength /= (float) size; meanAmbigBases /= (float) size; @@ -380,7 +404,7 @@ int SeqSummaryCommand::execute(){ m->mothurOut("Maximum:\t" + toString(startPosition[ptile100]) + "\t" + toString(endPosition[ptile100]) + "\t" + toString(seqLength[ptile100]) + "\t" + toString(ambigBases[ptile100]) + "\t" + toString(longHomoPolymer[ptile100]) + "\t" + toString(ptile100+1)); m->mothurOutEndLine(); m->mothurOut("Mean:\t" + toString(meanStartPosition) + "\t" + toString(meanEndPosition) + "\t" + toString(meanSeqLength) + "\t" + toString(meanAmbigBases) + "\t" + toString(meanLongHomoPolymer)); m->mothurOutEndLine(); - if (namefile == "") { m->mothurOut("# of Seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); } + if ((namefile == "") && (countfile == "")) { m->mothurOut("# of Seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); } else { m->mothurOut("# of unique seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); m->mothurOut("total # of seqs:\t" + toString(startPosition.size())); m->mothurOutEndLine(); } if (m->control_pressed) { m->mothurRemove(summaryFile); return 0; } @@ -420,7 +444,7 @@ int SeqSummaryCommand::driverCreateSummary(vector& startPosition, vectorcontrol_pressed) { in.close(); outSummary.close(); return 1; } @@ -430,11 +454,11 @@ int SeqSummaryCommand::driverCreateSummary(vector& startPosition, vector::iterator it = nameMap.find(current.getName()); - if (it == nameMap.end()) { m->mothurOut("[ERROR]: '" + current.getName() + "' is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; } + if (it == nameMap.end()) { m->mothurOut("[ERROR]: 
'" + current.getName() + "' is not in your name or count file, please correct."); m->mothurOutEndLine(); m->control_pressed = true; } else { num = it->second; } } @@ -505,11 +529,11 @@ int SeqSummaryCommand::MPICreateSummary(int start, int num, vector& startPo if (current.getName() != "") { int num = 1; - if (namefile != "") { + if ((namefile != "") || (countfile != "")) { //make sure this sequence is in the namefile, else error map::iterator it = nameMap.find(current.getName()); - if (it == nameMap.end()) { cout << "[ERROR]: " << current.getName() << " is not in your namefile, please correct." << endl; m->control_pressed = true; } + if (it == nameMap.end()) { cout << "[ERROR]: " << current.getName() << " is not in your name or count file, please correct." << endl; m->control_pressed = true; } else { num = it->second; } } @@ -626,14 +650,17 @@ int SeqSummaryCommand::createProcessesCreateSummary(vector& startPosition, vector pDataArray; DWORD dwThreadIdArray[processors-1]; HANDLE hThreadArray[processors-1]; - + + bool hasNameMap = false; + if ((namefile !="") || (countfile != "")) { hasNameMap = true; } + //Create processor worker threads. for( int i=0; istart, lines[i]->end, namefile, nameMap); + seqSumData* tempSum = new seqSumData(filename, (sumFile+extension), m, lines[i]->start, lines[i]->end, hasNameMap, nameMap); pDataArray.push_back(tempSum); //MySeqSumThreadFunction is in header. It must be global or static to work with the threads. 
diff --git a/seqsummarycommand.h b/seqsummarycommand.h index 79e8be9..3926e25 100644 --- a/seqsummarycommand.h +++ b/seqsummarycommand.h @@ -34,7 +34,7 @@ public: void help() { m->mothurOut(getHelpString()); } private: bool abort; - string fastafile, outputDir, namefile; + string fastafile, outputDir, namefile, countfile; int processors; vector outputNames; map nameMap; @@ -74,18 +74,18 @@ struct seqSumData { unsigned long long end; int count; MothurOut* m; - string namefile; + bool hasNameMap; map nameMap; seqSumData(){} - seqSumData(string f, string sf, MothurOut* mout, unsigned long long st, unsigned long long en, string na, map nam) { + seqSumData(string f, string sf, MothurOut* mout, unsigned long long st, unsigned long long en, bool na, map nam) { filename = f; sumFile = sf; m = mout; start = st; end = en; - namefile = na; + hasNameMap = na; nameMap = nam; count = 0; } @@ -123,11 +123,11 @@ static DWORD WINAPI MySeqSumThreadFunction(LPVOID lpParam){ if (current.getName() != "") { int num = 1; - if (pDataArray->namefile != "") { + if (pDataArray->hasNameMap){ //make sure this sequence is in the namefile, else error map::iterator it = pDataArray->nameMap.find(current.getName()); - if (it == pDataArray->nameMap.end()) { pDataArray->m->mothurOut("[ERROR]: " + current.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); pDataArray->m->control_pressed = true; } + if (it == pDataArray->nameMap.end()) { pDataArray->m->mothurOut("[ERROR]: " + current.getName() + " is not in your name or count file, please correct."); pDataArray->m->mothurOutEndLine(); pDataArray->m->control_pressed = true; } else { num = it->second; } } diff --git a/sequencecountparser.cpp b/sequencecountparser.cpp new file mode 100644 index 0000000..1300c0f --- /dev/null +++ b/sequencecountparser.cpp @@ -0,0 +1,289 @@ +// +// sequencecountparser.cpp +// Mothur +// +// Created by Sarah Westcott on 8/7/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. 
+// + +#include "sequencecountparser.h" + +/************************************************************/ +SequenceCountParser::SequenceCountParser(string countfile, string fastafile) { + try { + + m = MothurOut::getInstance(); + + //read count file + CountTable countTable; + countTable.readTable(countfile); + + //initialize maps + namesOfGroups = countTable.getNamesOfGroups(); + for (int i = 0; i < namesOfGroups.size(); i++) { + vector temp; + map tempMap; + seqs[namesOfGroups[i]] = temp; + countTablePerGroup[namesOfGroups[i]] = tempMap; + } + + //read fasta file making sure each sequence is in the group file + ifstream in; + m->openInputFile(fastafile, in); + + int fastaCount = 0; + while (!in.eof()) { + + if (m->control_pressed) { break; } + + Sequence seq(in); m->gobble(in); + fastaCount++; + if (m->debug) { if((fastaCount) % 1000 == 0){ m->mothurOut("[DEBUG]: reading seq " + toString(fastaCount) + "\n."); } } + + if (seq.getName() != "") { + + allSeqsMap[seq.getName()] = seq.getName(); + vector groupCounts = countTable.getGroupCounts(seq.getName()); + + for (int i = 0; i < namesOfGroups.size(); i++) { + if (groupCounts[i] != 0) { + seqs[namesOfGroups[i]].push_back(seq); + countTablePerGroup[namesOfGroups[i]][seq.getName()] = groupCounts[i]; + } + } + } + } + in.close(); + } + catch(exception& e) { + m->errorOut(e, "SequenceCountParser", "SequenceCountParser"); + exit(1); + } +} +/************************************************************/ +SequenceCountParser::SequenceCountParser(string fastafile, CountTable& countTable) { + try { + + m = MothurOut::getInstance(); + + //initialize maps + if (countTable.hasGroupInfo()) { + namesOfGroups = countTable.getNamesOfGroups(); + for (int i = 0; i < namesOfGroups.size(); i++) { + vector temp; + map tempMap; + seqs[namesOfGroups[i]] = temp; + countTablePerGroup[namesOfGroups[i]] = tempMap; + } + + //read fasta file making sure each sequence is in the group file + ifstream in; + m->openInputFile(fastafile, in); + + int 
fastaCount = 0; + while (!in.eof()) { + + if (m->control_pressed) { break; } + + Sequence seq(in); m->gobble(in); + fastaCount++; + if (m->debug) { if((fastaCount) % 1000 == 0){ m->mothurOut("[DEBUG]: reading seq " + toString(fastaCount) + "\n."); } } + + if (seq.getName() != "") { + + allSeqsMap[seq.getName()] = seq.getName(); + vector groupCounts = countTable.getGroupCounts(seq.getName()); + + for (int i = 0; i < namesOfGroups.size(); i++) { + if (groupCounts[i] != 0) { + seqs[namesOfGroups[i]].push_back(seq); + countTablePerGroup[namesOfGroups[i]][seq.getName()] = groupCounts[i]; + } + } + } + } + in.close(); + }else { m->control_pressed = true; m->mothurOut("[ERROR]: cannot parse fasta file by group with a count table that does not include group data, please correct.\n"); } + + } + catch(exception& e) { + m->errorOut(e, "SequenceCountParser", "SequenceCountParser"); + exit(1); + } +} +/************************************************************/ +SequenceCountParser::~SequenceCountParser(){ } +/************************************************************/ +int SequenceCountParser::getNumGroups(){ return namesOfGroups.size(); } +/************************************************************/ +vector SequenceCountParser::getNamesOfGroups(){ return namesOfGroups; } +/************************************************************/ +int SequenceCountParser::getNumSeqs(string g){ + try { + map >::iterator it; + int num = 0; + + it = seqs.find(g); + if(it == seqs.end()) { + m->mothurOut("[ERROR]: " + g + " is not a valid group, please correct."); m->mothurOutEndLine(); + }else { + num = (it->second).size(); + } + + return num; + } + catch(exception& e) { + m->errorOut(e, "SequenceCountParser", "getNumSeqs"); + exit(1); + } +} +/************************************************************/ +vector SequenceCountParser::getSeqs(string g){ + try { + map >::iterator it; + vector seqForThisGroup; + + it = seqs.find(g); + if(it == seqs.end()) { + m->mothurOut("[ERROR]: No 
sequences available for group " + g + ", please correct."); m->mothurOutEndLine(); + }else { + seqForThisGroup = it->second; + if (m->debug) { m->mothurOut("[DEBUG]: group " + g + " fasta file has " + toString(seqForThisGroup.size()) + " sequences."); } + } + + return seqForThisGroup; + } + catch(exception& e) { + m->errorOut(e, "SequenceCountParser", "getSeqs"); + exit(1); + } +} +/************************************************************/ +int SequenceCountParser::getSeqs(string g, string filename, bool uchimeFormat=false){ + try { + map >::iterator it; + vector seqForThisGroup; + vector nameVector; + + it = seqs.find(g); + if(it == seqs.end()) { + m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine(); + }else { + + ofstream out; + m->openOutputFile(filename, out); + + seqForThisGroup = it->second; + + if (uchimeFormat) { + // format should look like + //>seqName /ab=numRedundantSeqs/ + //sequence + + map countForThisGroup = getCountTable(g); + map::iterator itCount; + int error = 0; + + for (int i = 0; i < seqForThisGroup.size(); i++) { + itCount = countForThisGroup.find(seqForThisGroup[i].getName()); + + if (itCount == countForThisGroup.end()){ + error = 1; + m->mothurOut("[ERROR]: " + seqForThisGroup[i].getName() + " is in your fastafile, but is not in your count file, please correct."); m->mothurOutEndLine(); + }else { + seqPriorityNode temp(itCount->second, seqForThisGroup[i].getAligned(), seqForThisGroup[i].getName()); + nameVector.push_back(temp); + } + } + + if (error == 1) { out.close(); m->mothurRemove(filename); return 1; } + + //sort by num represented + sort(nameVector.begin(), nameVector.end(), compareSeqPriorityNodes); + + //print new file in order of + for (int i = 0; i < nameVector.size(); i++) { + + if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; } + + out << ">" << nameVector[i].name << "/ab=" << nameVector[i].numIdentical << "/" << endl << nameVector[i].seq << 
endl; // + } + + }else { + //m->mothurOut("Group " + g + " contains " + toString(seqForThisGroup.size()) + " unique seqs.\n"); + for (int i = 0; i < seqForThisGroup.size(); i++) { + + if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; } + + seqForThisGroup[i].printSequence(out); + } + } + out.close(); + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SequenceCountParser", "getSeqs"); + exit(1); + } +} + +/************************************************************/ +map SequenceCountParser::getCountTable(string g){ + try { + map >::iterator it; + map countForThisGroup; + + it = countTablePerGroup.find(g); + if(it == countTablePerGroup.end()) { + m->mothurOut("[ERROR]: No countTable available for group " + g + ", please correct."); m->mothurOutEndLine(); + }else { + countForThisGroup = it->second; + if (m->debug) { m->mothurOut("[DEBUG]: group " + g + " count file has " + toString(countForThisGroup.size()) + " unique sequences."); } + } + + return countForThisGroup; + } + catch(exception& e) { + m->errorOut(e, "SequenceCountParser", "getCountTable"); + exit(1); + } +} +/************************************************************/ +int SequenceCountParser::getCountTable(string g, string filename){ + try { + map >::iterator it; + map countForThisGroup; + + it = countTablePerGroup.find(g); + if(it == countTablePerGroup.end()) { + m->mothurOut("[ERROR]: No countTable available for group " + g + ", please correct."); m->mothurOutEndLine(); + }else { + countForThisGroup = it->second; + + ofstream out; + m->openOutputFile(filename, out); + out << "Representative_Sequence\ttotal\t" << g << endl; + + for (map::iterator itFile = countForThisGroup.begin(); itFile != countForThisGroup.end(); itFile++) { + + if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; } + + out << itFile->first << '\t' << itFile->second << '\t' << itFile->second << endl; + } + + out.close(); + } + + return 0; + } + catch(exception& e) { + 
m->errorOut(e, "SequenceParser", "getCountTable"); + exit(1); + } +} +/************************************************************/ + + + diff --git a/sequencecountparser.h b/sequencecountparser.h new file mode 100644 index 0000000..4889ea6 --- /dev/null +++ b/sequencecountparser.h @@ -0,0 +1,59 @@ +#ifndef Mothur_sequencecountparser_h +#define Mothur_sequencecountparser_h + +// +// sequencecountparser.h +// Mothur +// +// Created by Sarah Westcott on 8/7/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. +// + +#include "mothur.h" +#include "mothurout.h" +#include "sequence.hpp" +#include "counttable.h" + +/* This class reads a fasta and count file and parses the data by group. The countfile must contain group information. + + Note: The sum of all the groups unique sequences will be larger than the original number of unique sequences. + This is because when we parse the count file we make a unique for each group instead of 1 unique for all + groups. + + */ + +class SequenceCountParser { + +public: + + SequenceCountParser(string, string); //count, fasta - file mismatches will set m->control_pressed = true + SequenceCountParser(string, CountTable&); //fasta, counttable - file mismatches will set m->control_pressed = true + ~SequenceCountParser(); + + //general operations + int getNumGroups(); + vector getNamesOfGroups(); + + int getNumSeqs(string); //returns the number of unique sequences in a specific group + vector getSeqs(string); //returns unique sequences in a specific group + map getCountTable(string); //returns seqName -> numberOfRedundantSeqs for a specific group - the count file format, but each line is parsed by group. 
+ + int getSeqs(string, string, bool); //prints unique sequences in a specific group to a file - group, filename, uchimeFormat=false + int getCountTable(string, string); //print seqName -> numberRedundantSeqs for a specific group - group, filename + + map getAllSeqsMap(){ return allSeqsMap; } //returns map where the key=sequenceName and the value=representativeSequence - helps us remove duplicates after group by group processing +private: + + CountTable countTable; + MothurOut* m; + + int numSeqs; + map allSeqsMap; + map > seqs; //a vector for each group + map > countTablePerGroup; //countTable for each group + vector namesOfGroups; +}; + + + +#endif diff --git a/sequenceparser.cpp b/sequenceparser.cpp index 08e5ae8..37891eb 100644 --- a/sequenceparser.cpp +++ b/sequenceparser.cpp @@ -59,7 +59,7 @@ SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFi in.close(); if (error == 1) { m->control_pressed = true; } - + //read name file ifstream inName; m->openInputFile(nameFile, inName); @@ -148,6 +148,78 @@ SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFi } } inName.close(); + + //in case file does not end in white space + if (rest != "") { + vector pieces = m->splitWhiteSpace(rest); + + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { firstCol = pieces[i]; columnOne=false; } + else { secondCol = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { //save one line + if (m->debug) { m->mothurOut("[DEBUG]: reading names: " + firstCol + '\t' + secondCol + ".\n"); } + vector names; + m->splitAtChar(secondCol, names, ','); + + //get aligned string for these seqs from the fasta file + string alignedString = ""; + map::iterator itAligned = seqName.find(names[0]); + if (itAligned == seqName.end()) { + error = 1; m->mothurOut("[ERROR]: " + names[0] + " is in your name file and not in your fasta file, please correct."); m->mothurOutEndLine(); + }else { + alignedString = itAligned->second; + } + + 
//separate by group - parse one line in name file + map splitMap; //group -> name1,name2,... + map::iterator it; + for (int i = 0; i < names.size(); i++) { + + string group = groupMap->getGroup(names[i]); + if (group == "not found") { error = 1; m->mothurOut("[ERROR]: " + names[i] + " is in your name file and not in your groupfile, please correct."); m->mothurOutEndLine(); } + else { + + it = splitMap.find(group); + if (it != splitMap.end()) { //adding seqs to this group + (it->second) += "," + names[i]; + thisnames1.insert(names[i]); + countName++; + }else { //first sighting of this group + splitMap[group] = names[i]; + countName++; + thisnames1.insert(names[i]); + + //is this seq in the fasta file? + if (i != 0) { //if not then we need to add a duplicate sequence to the seqs for this group so the new "fasta" and "name" files will match + Sequence tempSeq(names[i], alignedString); //get the first guys sequence string since he's in the fasta file. + seqs[group].push_back(tempSeq); + } + } + } + + allSeqsMap[names[i]] = names[0]; + } + + + //fill nameMapPerGroup - holds all lines in namefile separated by group + for (it = splitMap.begin(); it != splitMap.end(); it++) { + //grab first name + string firstName = ""; + for(int i = 0; i < (it->second).length(); i++) { + if (((it->second)[i]) != ',') { + firstName += ((it->second)[i]); + }else { break; } + } + + //group1 -> seq1 -> seq1,seq2,seq3 + nameMapPerGroup[it->first][firstName] = it->second; + } + + pairDone = false; + } + } + } if (error == 1) { m->control_pressed = true; } @@ -238,8 +310,6 @@ vector SequenceParser::getNamesOfGroups(){ return groupMap->getNamesOfGr /************************************************************/ bool SequenceParser::isValidGroup(string g){ return groupMap->isValidGroup(g); } /************************************************************/ -string SequenceParser::getGroup(string g){ return groupMap->getGroup(g); } -/************************************************************/ int 
SequenceParser::getNumSeqs(string g){ try { map >::iterator it; @@ -330,7 +400,7 @@ int SequenceParser::getSeqs(string g, string filename, bool uchimeFormat=false){ if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; } - out << ">" << nameVector[i].name << "/ab=" << nameVector[i].numIdentical << "/" << endl << nameVector[i].seq << endl; + out << ">" << nameVector[i].name << "/ab=" << nameVector[i].numIdentical << "/" << endl << nameVector[i].seq << endl; // } }else { diff --git a/sequenceparser.h b/sequenceparser.h index 23fcb9e..98438f6 100644 --- a/sequenceparser.h +++ b/sequenceparser.h @@ -36,7 +36,6 @@ class SequenceParser { int getNumGroups(); vector getNamesOfGroups(); bool isValidGroup(string); //return true if string is a valid group - string getGroup(string); //returns group of a specific sequence int getNumSeqs(string); //returns the number of unique sequences in a specific group vector getSeqs(string); //returns unique sequences in a specific group diff --git a/sffinfocommand.cpp b/sffinfocommand.cpp index 08cf21e..c50255a 100644 --- a/sffinfocommand.cpp +++ b/sffinfocommand.cpp @@ -9,17 +9,26 @@ #include "sffinfocommand.h" #include "endiannessmacros.h" +#include "trimoligos.h" +#include "sequence.hpp" +#include "qualityscores.h" //********************************************************************************************************************** vector SffInfoCommand::setParameters(){ try { CommandParameter psff("sff", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(psff); + CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos); CommandParameter paccnos("accnos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(paccnos); CommandParameter psfftxt("sfftxt", "String", "", "", "", "", "",false,false); parameters.push_back(psfftxt); CommandParameter pflow("flow", "Boolean", "", "T", "", "", 
"",false,false); parameters.push_back(pflow); CommandParameter ptrim("trim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(ptrim); CommandParameter pfasta("fasta", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pfasta); CommandParameter pqfile("name", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqfile); + CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs); + CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs); + CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs); + CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs); + CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -37,10 +46,16 @@ string SffInfoCommand::getHelpString(){ try { string helpString = ""; helpString += "The sffinfo command reads a sff file and extracts the sequence data, or you can use it to parse a sfftxt file.\n"; - helpString += "The sffinfo command parameters are sff, fasta, qfile, accnos, flow, sfftxt, and trim. sff is required. \n"; + helpString += "The sffinfo command parameters are sff, fasta, qfile, accnos, flow, sfftxt, oligos, bdiffs, tdiffs, ldiffs, sdiffs, pdiffs and trim. sff is required. \n"; helpString += "The sff parameter allows you to enter the sff file you would like to extract data from. You may enter multiple files by separating them by -'s.\n"; helpString += "The fasta parameter allows you to indicate if you would like a fasta formatted file generated. Default=True. 
\n"; helpString += "The qfile parameter allows you to indicate if you would like a quality file generated. Default=True. \n"; + helpString += "The oligos parameter allows you to provide an oligos file to split your sff file into separate sff files by barcode. \n"; + helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n"; + helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n"; + helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n"; + helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n"; + helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n"; helpString += "The flow parameter allows you to indicate if you would like a flowgram file generated. Default=True. \n"; helpString += "The sfftxt parameter allows you to indicate if you would like a sff.txt file generated. Default=False. \n"; helpString += "If you want to parse an existing sfftxt file into flow, fasta and quality file, enter the file name using the sfftxt parameter. 
\n"; @@ -68,6 +83,7 @@ string SffInfoCommand::getOutputFileNameTag(string type, string inputName=""){ if (type == "fasta") { outputFileName = "fasta"; } else if (type == "flow") { outputFileName = "flow"; } else if (type == "sfftxt") { outputFileName = "sff.txt"; } + else if (type == "sff") { outputFileName = "sff"; } else if (type == "qfile") { outputFileName = "qual"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } @@ -90,6 +106,7 @@ SffInfoCommand::SffInfoCommand(){ outputTypes["flow"] = tempOutNames; outputTypes["sfftxt"] = tempOutNames; outputTypes["qfile"] = tempOutNames; + outputTypes["sff"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "SffInfoCommand", "SffInfoCommand"); @@ -101,7 +118,8 @@ SffInfoCommand::SffInfoCommand(){ SffInfoCommand::SffInfoCommand(string option) { try { abort = false; calledHelp = false; - hasAccnos = false; + hasAccnos = false; hasOligos = false; + split = 1; //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } @@ -126,6 +144,7 @@ SffInfoCommand::SffInfoCommand(string option) { outputTypes["flow"] = tempOutNames; outputTypes["sfftxt"] = tempOutNames; outputTypes["qfile"] = tempOutNames; + outputTypes["sff"] = tempOutNames; //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -268,7 +287,80 @@ SffInfoCommand::SffInfoCommand(string option) { //make sure there is at least one valid file left if (accnosFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; } } - + + oligosfile = validParameter.validFile(parameters, "oligos", false); + if (oligosfile == "not found") { oligosfile = ""; } + else { + hasOligos = true; + m->splitAtDash(oligosfile, oligosFileNames); + + //go through files and make sure they 
are good, if not, then disregard them + for (int i = 0; i < oligosFileNames.size(); i++) { + bool ignore = false; + if (oligosFileNames[i] == "current") { + oligosFileNames[i] = m->getOligosFile(); + if (oligosFileNames[i] != "") { m->mothurOut("Using " + oligosFileNames[i] + " as input file for the accnos parameter where you had given current."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current oligosfile, ignoring current."); m->mothurOutEndLine(); ignore=true; + //erase from file list + oligosFileNames.erase(oligosFileNames.begin()+i); + i--; + } + } + + if (!ignore) { + + if (inputDir != "") { + string path = m->hasPath(oligosFileNames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { oligosFileNames[i] = inputDir + oligosFileNames[i]; } + } + + ifstream in; + int ableToOpen = m->openInputFile(oligosFileNames[i], in, "noerror"); + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(oligosFileNames[i]); + m->mothurOut("Unable to open " + oligosFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + oligosFileNames[i] = tryPath; + } + } + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(oligosFileNames[i]); + m->mothurOut("Unable to open " + oligosFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + oligosFileNames[i] = tryPath; + } + } + in.close(); + + if (ableToOpen == 1) { + m->mothurOut("Unable to open " + oligosFileNames[i] + ". 
It will be disregarded."); m->mothurOutEndLine(); + //erase from file list + oligosFileNames.erase(oligosFileNames.begin()+i); + i--; + } + } + } + + //make sure there is at least one valid file left + if (oligosFileNames.size() == 0) { m->mothurOut("no valid oligos files."); m->mothurOutEndLine(); abort = true; } + } + + if (hasOligos) { + split = 2; + if (oligosFileNames.size() != filenames.size()) { abort = true; m->mothurOut("If you provide a oligos file, you must have one for each sff file."); m->mothurOutEndLine(); } + } + if (hasAccnos) { if (accnosFileNames.size() != filenames.size()) { abort = true; m->mothurOut("If you provide a accnos file, you must have one for each sff file."); m->mothurOutEndLine(); } } @@ -284,7 +376,24 @@ SffInfoCommand::SffInfoCommand(string option) { temp = validParameter.validFile(parameters, "trim", false); if (temp == "not found"){ temp = "T"; } trim = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, bdiffs); + + temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, pdiffs); + + temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, ldiffs); + + temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, sdiffs); + temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); } + m->mothurConvert(temp, tdiffs); + + if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; } + temp = validParameter.validFile(parameters, "sfftxt", false); if (temp == "not found") { temp = "F"; sfftxt = false; sfftxtFilename = ""; } else if (m->isTrue(temp)) { sfftxt = true; sfftxtFilename = ""; } @@ -311,6 +420,8 @@ 
SffInfoCommand::SffInfoCommand(string option) { if (filename != "") { filenames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the sff parameter."); m->mothurOutEndLine(); } else { m->mothurOut("[ERROR]: you must provide a valid sff or sfftxt file."); m->mothurOutEndLine(); abort=true; } } + + } } catch(exception& e) { @@ -334,8 +445,11 @@ int SffInfoCommand::execute(){ string accnos = ""; if (hasAccnos) { accnos = accnosFileNames[s]; } + + string oligos = ""; + if (hasOligos) { oligos = oligosFileNames[s]; } - int numReads = extractSffInfo(filenames[s], accnos); + int numReads = extractSffInfo(filenames[s], accnos, oligos); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to extract " + toString(numReads) + "."); } @@ -375,13 +489,15 @@ int SffInfoCommand::execute(){ } } //********************************************************************************************************************** -int SffInfoCommand::extractSffInfo(string input, string accnos){ +int SffInfoCommand::extractSffInfo(string input, string accnos, string oligos){ try { - + currentFileName = input; if (outputDir == "") { outputDir += m->hasPath(input); } if (accnos != "") { readAccnosFile(accnos); } else { seqNames.clear(); } + + if (oligos != "") { readOligos(oligos); split = 2; } ofstream outSfftxt, outFasta, outQual, outFlow; string outFastaFileName, outQualFileName; @@ -424,14 +540,10 @@ int SffInfoCommand::extractSffInfo(string input, string accnos){ while (!in.eof()) { bool print = true; - - //read header - Header readheader; - readHeader(in, readheader); - + //read data - seqRead read; - readSeqData(in, read, header.numFlowsPerRead, readheader.numBases); + seqRead read; Header readheader; + readSeqData(in, read, header.numFlowsPerRead, readheader); bool okay = sanityCheck(readheader, read); if (!okay) { break; } @@ -448,7 +560,7 @@ int SffInfoCommand::extractSffInfo(string input, string accnos){ count++; mycount++; - + //report progress 
if((count+1) % 10000 == 0){ m->mothurOut(toString(count+1)); m->mothurOutEndLine(); } @@ -467,6 +579,48 @@ int SffInfoCommand::extractSffInfo(string input, string accnos){ if (qual) { outQual.close(); } if (flow) { outFlow.close(); } + if (split > 1) { + //create new common headers for each file with the correct number of reads + adjustCommonHeader(header); + + map::iterator it; + set namesToRemove; + for(int i=0;iisBlank(filehandles[i][j])){ + m->mothurRemove(filehandles[i][j]); + m->mothurRemove(filehandlesHeaders[i][j]); + namesToRemove.insert(filehandles[i][j]); + } + } + } + } + } + + //append new header to reads + for (int i = 0; i < filehandles.size(); i++) { + for (int j = 0; j < filehandles[i].size(); j++) { + m->appendFiles(filehandles[i][j], filehandlesHeaders[i][j]); + m->renameFile(filehandlesHeaders[i][j], filehandles[i][j]); + m->mothurRemove(filehandlesHeaders[i][j]); + if (numSplitReads[i][j] == 0) { m->mothurRemove(filehandles[i][j]); } + } + } + + //remove names for outputFileNames, just cleans up the output + for(int i = 0; i < outputNames.size(); i++) { + if (namesToRemove.count(outputNames[i]) != 0) { + outputNames.erase(outputNames.begin()+i); + i--; + } + } + + if(m->isBlank(noMatchFile)){ m->mothurRemove(noMatchFile); } + else { outputNames.push_back(noMatchFile); outputTypes["sff"].push_back(noMatchFile); } + } + return count; } catch(exception& e) { @@ -477,20 +631,20 @@ int SffInfoCommand::extractSffInfo(string input, string accnos){ //********************************************************************************************************************** int SffInfoCommand::readCommonHeader(ifstream& in, CommonHeader& header){ try { - + if (!in.eof()) { //read magic number char buffer[4]; in.read(buffer, 4); header.magicNumber = be_int4(*(unsigned int *)(&buffer)); - + //read version char buffer9[4]; in.read(buffer9, 4); header.version = ""; - for (int i = 0; i < 4; i++) { header.version += toString((int)(buffer9[i])); } - + for (int i = 
0; i < 4; i++) { header.version += toString((int)(buffer9[i])); } + //read offset char buffer2 [8]; in.read(buffer2, 8); @@ -539,17 +693,18 @@ int SffInfoCommand::readCommonHeader(ifstream& in, CommonHeader& header){ header.keySequence = tempBuffer2; if (header.keySequence.length() > header.keyLength) { header.keySequence = header.keySequence.substr(0, header.keyLength); } delete[] tempBuffer2; - + /* Pad to 8 chars */ unsigned long long spotInFile = in.tellg(); unsigned long long spot = (spotInFile + 7)& ~7; // ~ inverts in.seekg(spot); - - }else{ + + }else{ m->mothurOut("Error reading sff common header."); m->mothurOutEndLine(); } - + return 0; + } catch(exception& e) { m->errorOut(e, "SffInfoCommand", "readCommonHeader"); @@ -557,21 +712,207 @@ int SffInfoCommand::readCommonHeader(ifstream& in, CommonHeader& header){ } } //********************************************************************************************************************** -int SffInfoCommand::readHeader(ifstream& in, Header& header){ +int SffInfoCommand::adjustCommonHeader(CommonHeader header){ try { - - if (!in.eof()) { + + char* mybuffer = new char[4]; + ifstream in; + in.open(currentFileName.c_str(), ios::binary); + + //magic number + in.read(mybuffer,4); + for (int i = 0; i < filehandlesHeaders.size(); i++) { + for (int j = 0; j < filehandlesHeaders[i].size(); j++) { + ofstream out; + m->openOutputFileAppend(filehandlesHeaders[i][j], out); + out.write(mybuffer, in.gcount()); + out.close(); + } + } + delete[] mybuffer; + + //version + mybuffer = new char[4]; + in.read(mybuffer,4); + for (int i = 0; i < filehandlesHeaders.size(); i++) { + for (int j = 0; j < filehandlesHeaders[i].size(); j++) { + ofstream out; + m->openOutputFileAppend(filehandlesHeaders[i][j], out); + out.write(mybuffer, in.gcount()); + out.close(); + } + } + delete[] mybuffer; + + //offset + mybuffer = new char[8]; + in.read(mybuffer,8); + for (int i = 0; i < filehandlesHeaders.size(); i++) { + for (int j = 0; j < 
filehandlesHeaders[i].size(); j++) { + ofstream out; + m->openOutputFileAppend(filehandlesHeaders[i][j], out); + out.write(mybuffer, in.gcount()); + out.close(); + } + } + delete[] mybuffer; + - //read header length + //read index length + mybuffer = new char[4]; + in.read(mybuffer,4); + for (int i = 0; i < filehandlesHeaders.size(); i++) { + for (int j = 0; j < filehandlesHeaders[i].size(); j++) { + ofstream out; + m->openOutputFileAppend(filehandlesHeaders[i][j], out); + out.write(mybuffer, in.gcount()); + out.close(); + } + } + delete[] mybuffer; + + //change num reads + mybuffer = new char[4]; + in.read(mybuffer,4); + delete[] mybuffer; + for (int i = 0; i < filehandlesHeaders.size(); i++) { + for (int j = 0; j < filehandlesHeaders[i].size(); j++) { + ofstream out; + m->openOutputFileAppend(filehandlesHeaders[i][j], out); + //convert number of reads to 4 byte char* + char* thisbuffer = new char[4]; + thisbuffer[0] = (numSplitReads[i][j] >> 24) & 0xFF; + thisbuffer[1] = (numSplitReads[i][j] >> 16) & 0xFF; + thisbuffer[2] = (numSplitReads[i][j] >> 8) & 0xFF; + thisbuffer[3] = numSplitReads[i][j] & 0xFF; + out.write(thisbuffer, 4); + out.close(); + delete[] thisbuffer; + } + } + + //read header length + mybuffer = new char[2]; + in.read(mybuffer,2); + for (int i = 0; i < filehandlesHeaders.size(); i++) { + for (int j = 0; j < filehandlesHeaders[i].size(); j++) { + ofstream out; + m->openOutputFileAppend(filehandlesHeaders[i][j], out); + out.write(mybuffer, in.gcount()); + out.close(); + } + } + delete[] mybuffer; + + //read key length + mybuffer = new char[2]; + in.read(mybuffer,2); + for (int i = 0; i < filehandlesHeaders.size(); i++) { + for (int j = 0; j < filehandlesHeaders[i].size(); j++) { + ofstream out; + m->openOutputFileAppend(filehandlesHeaders[i][j], out); + out.write(mybuffer, in.gcount()); + out.close(); + } + } + delete[] mybuffer; + + //read number of flow reads + mybuffer = new char[2]; + in.read(mybuffer,2); + for (int i = 0; i < 
filehandlesHeaders.size(); i++) { + for (int j = 0; j < filehandlesHeaders[i].size(); j++) { + ofstream out; + m->openOutputFileAppend(filehandlesHeaders[i][j], out); + out.write(mybuffer, in.gcount()); + out.close(); + } + } + delete[] mybuffer; + + //read format code + mybuffer = new char[1]; + in.read(mybuffer,1); + for (int i = 0; i < filehandlesHeaders.size(); i++) { + for (int j = 0; j < filehandlesHeaders[i].size(); j++) { + ofstream out; + m->openOutputFileAppend(filehandlesHeaders[i][j], out); + out.write(mybuffer, in.gcount()); + out.close(); + } + } + delete[] mybuffer; + + //read flow chars + mybuffer = new char[header.numFlowsPerRead]; + in.read(mybuffer,header.numFlowsPerRead); + for (int i = 0; i < filehandlesHeaders.size(); i++) { + for (int j = 0; j < filehandlesHeaders[i].size(); j++) { + ofstream out; + m->openOutputFileAppend(filehandlesHeaders[i][j], out); + out.write(mybuffer, in.gcount()); + out.close(); + } + } + delete[] mybuffer; + + //read key + mybuffer = new char[header.keyLength]; + in.read(mybuffer,header.keyLength); + for (int i = 0; i < filehandlesHeaders.size(); i++) { + for (int j = 0; j < filehandlesHeaders[i].size(); j++) { + ofstream out; + m->openOutputFileAppend(filehandlesHeaders[i][j], out); + out.write(mybuffer, in.gcount()); + out.close(); + } + } + delete[] mybuffer; + + + /* Pad to 8 chars */ + unsigned long long spotInFile = in.tellg(); + unsigned long long spot = (spotInFile + 7)& ~7; // ~ inverts + in.seekg(spot); + + mybuffer = new char[spot-spotInFile]; + for (int i = 0; i < filehandlesHeaders.size(); i++) { + for (int j = 0; j < filehandlesHeaders[i].size(); j++) { + ofstream out; + m->openOutputFileAppend(filehandlesHeaders[i][j], out); + out.write(mybuffer, spot-spotInFile); + out.close(); + } + } + delete[] mybuffer; + in.close(); + return 0; + + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "adjustCommonHeader"); + exit(1); + } +} 
+//********************************************************************************************************************** +int SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, Header& header){ + try { + unsigned long long startSpotInFile = in.tellg(); + if (!in.eof()) { + + /*****************************************/ + //read header + + //read header length char buffer [2]; in.read(buffer, 2); header.headerLength = be_int2(*(unsigned short *)(&buffer)); - + //read name length char buffer2 [2]; in.read(buffer2, 2); header.nameLength = be_int2(*(unsigned short *)(&buffer2)); - + //read num bases char buffer3 [4]; in.read(buffer3, 4); @@ -592,12 +933,12 @@ int SffInfoCommand::readHeader(ifstream& in, Header& header){ char buffer6 [2]; in.read(buffer6, 2); header.clipAdapterLeft = be_int2(*(unsigned short *)(&buffer6)); - + //read clipAdapterRight char buffer7 [2]; in.read(buffer7, 2); header.clipAdapterRight = be_int2(*(unsigned short *)(&buffer7)); - + //read name char* tempBuffer = new char[header.nameLength]; in.read(&(*tempBuffer), header.nameLength); @@ -612,24 +953,10 @@ int SffInfoCommand::readHeader(ifstream& in, Header& header){ unsigned long long spotInFile = in.tellg(); unsigned long long spot = (spotInFile + 7)& ~7; in.seekg(spot); - - }else{ - m->mothurOut("Error reading sff header info."); m->mothurOutEndLine(); - } - return 0; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "readHeader"); - exit(1); - } -} -//********************************************************************************************************************** -int SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, int numBases){ - try { - - if (!in.eof()) { - + /*****************************************/ + //sequence read + //read flowgram read.flowgram.resize(numFlowReads); for (int i = 0; i < numFlowReads; i++) { @@ -639,33 +966,62 @@ int SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, i } 
//read flowIndex - read.flowIndex.resize(numBases); - for (int i = 0; i < numBases; i++) { + read.flowIndex.resize(header.numBases); + for (int i = 0; i < header.numBases; i++) { char temp[1]; in.read(temp, 1); read.flowIndex[i] = be_int1(*(unsigned char *)(&temp)); } //read bases - char* tempBuffer = new char[numBases]; - in.read(&(*tempBuffer), numBases); - read.bases = tempBuffer; - if (read.bases.length() > numBases) { read.bases = read.bases.substr(0, numBases); } - delete[] tempBuffer; + char* tempBuffer6 = new char[header.numBases]; + in.read(&(*tempBuffer6), header.numBases); + read.bases = tempBuffer6; + if (read.bases.length() > header.numBases) { read.bases = read.bases.substr(0, header.numBases); } + delete[] tempBuffer6; //read qual scores - read.qualScores.resize(numBases); - for (int i = 0; i < numBases; i++) { + read.qualScores.resize(header.numBases); + for (int i = 0; i < header.numBases; i++) { char temp[1]; in.read(temp, 1); read.qualScores[i] = be_int1(*(unsigned char *)(&temp)); } /* Pad to 8 chars */ - unsigned long long spotInFile = in.tellg(); - unsigned long long spot = (spotInFile + 7)& ~7; + spotInFile = in.tellg(); + spot = (spotInFile + 7)& ~7; in.seekg(spot); - + + if (split > 1) { + char * mybuffer; + mybuffer = new char [spot-startSpotInFile]; + ifstream in2; + m->openInputFile(currentFileName, in2); + in2.seekg(startSpotInFile); + in2.read(mybuffer,spot-startSpotInFile); + in2.close(); + + int barcodeIndex, primerIndex; + int trashCodeLength = findGroup(header, read, barcodeIndex, primerIndex); + + if(trashCodeLength == 0){ + ofstream out; + m->openOutputFileAppend(filehandles[barcodeIndex][primerIndex], out); + out.write(mybuffer, in2.gcount()); + out.close(); + delete[] mybuffer; + numSplitReads[barcodeIndex][primerIndex]++; + } + else{ + ofstream out; + m->openOutputFileAppend(noMatchFile, out); + out.write(mybuffer, in2.gcount()); + out.close(); + delete[] mybuffer; + } + + } }else{ m->mothurOut("Error reading."); 
m->mothurOutEndLine(); } @@ -678,6 +1034,83 @@ int SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, i } } //********************************************************************************************************************** +int SffInfoCommand::findGroup(Header header, seqRead read, int& barcode, int& primer) { + try { + //find group read belongs to + TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer); + + int success = 1; + string trashCode = ""; + int currentSeqsDiffs = 0; + + string seq = read.bases; + + if (trim) { + if(header.clipQualRight < header.clipQualLeft){ + seq = "NNNN"; + } + else if((header.clipQualRight != 0) && ((header.clipQualRight-header.clipQualLeft) >= 0)){ + seq = seq.substr((header.clipQualLeft-1), (header.clipQualRight-header.clipQualLeft)); + } + else { + seq = seq.substr(header.clipQualLeft-1); + } + }else{ + //if you wanted the sfftxt then you already converted the bases to the right case + if (!sfftxt) { + //make the bases you want to clip lowercase and the bases you want to keep upper case + if(header.clipQualRight == 0){ header.clipQualRight = seq.length(); } + for (int i = 0; i < (header.clipQualLeft-1); i++) { seq[i] = tolower(seq[i]); } + for (int i = (header.clipQualLeft-1); i < (header.clipQualRight-1); i++) { seq[i] = toupper(seq[i]); } + for (int i = (header.clipQualRight-1); i < seq.length(); i++) { seq[i] = tolower(seq[i]); } + } + } + + Sequence currSeq(header.name, seq); + QualityScores currQual; + + if(numLinkers != 0){ + success = trimOligos.stripLinker(currSeq, currQual); + if(success > ldiffs) { trashCode += 'k'; } + else{ currentSeqsDiffs += success; } + + } + + if(barcodes.size() != 0){ + success = trimOligos.stripBarcode(currSeq, currQual, barcode); + if(success > bdiffs) { trashCode += 'b'; } + else{ currentSeqsDiffs += success; } + } + + if(numSpacers != 0){ + success = trimOligos.stripSpacer(currSeq, currQual); + if(success > sdiffs) { 
trashCode += 's'; } + else{ currentSeqsDiffs += success; } + + } + + if(numFPrimers != 0){ + success = trimOligos.stripForward(currSeq, currQual, primer, true); + if(success > pdiffs) { trashCode += 'f'; } + else{ currentSeqsDiffs += success; } + } + + if (currentSeqsDiffs > tdiffs) { trashCode += 't'; } + + if(revPrimer.size() != 0){ + success = trimOligos.stripReverse(currSeq, currQual); + if(!success) { trashCode += 'r'; } + } + + + return trashCode.length(); + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "findGroup"); + exit(1); + } +} +//********************************************************************************************************************** int SffInfoCommand::decodeName(string& timestamp, string& region, string& xy, string name) { try { @@ -1175,6 +1608,224 @@ vector SffInfoCommand::parseHeaderLineToIntVector(ifstream& file, exit(1); } } +//*************************************************************************************************************** + +bool SffInfoCommand::readOligos(string oligoFile){ + try { + filehandles.clear(); + numSplitReads.clear(); + filehandlesHeaders.clear(); + + ifstream inOligos; + m->openInputFile(oligoFile, inOligos); + + string type, oligo, group; + + int indexPrimer = 0; + int indexBarcode = 0; + + while(!inOligos.eof()){ + + inOligos >> type; + + if(type[0] == '#'){ + while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there + m->gobble(inOligos); + } + else{ + m->gobble(inOligos); + //make type case insensitive + for(int i=0;i> oligo; + + for(int i=0;i::iterator itPrime = primers.find(oligo); + if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } + + primers[oligo]=indexPrimer; indexPrimer++; + primerNameVector.push_back(group); + }else if(type == "REVERSE"){ + //Sequence oligoRC("reverse", oligo); + //oligoRC.reverseComplement(); + string oligoRC 
= reverseOligo(oligo); + revPrimer.push_back(oligoRC); + } + else if(type == "BARCODE"){ + inOligos >> group; + + //check for repeat barcodes + map::iterator itBar = barcodes.find(oligo); + if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } + + barcodes[oligo]=indexBarcode; indexBarcode++; + barcodeNameVector.push_back(group); + }else if(type == "LINKER"){ + linker.push_back(oligo); + }else if(type == "SPACER"){ + spacer.push_back(oligo); + } + else{ m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); } + } + m->gobble(inOligos); + } + inOligos.close(); + + if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ split = 1; } + + //add in potential combos + if(barcodeNameVector.size() == 0){ + barcodes[""] = 0; + barcodeNameVector.push_back(""); + } + + if(primerNameVector.size() == 0){ + primers[""] = 0; + primerNameVector.push_back(""); + } + + filehandles.resize(barcodeNameVector.size()); + for(int i=0;i 1){ + set uniqueNames; //used to cleanup outputFileNames + for(map::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){ + for(map::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){ + + string primerName = primerNameVector[itPrimer->second]; + string barcodeName = barcodeNameVector[itBar->second]; + + string comboGroupName = ""; + string fastaFileName = ""; + string qualFileName = ""; + string nameFileName = ""; + + if(primerName == ""){ + comboGroupName = barcodeNameVector[itBar->second]; + } + else{ + if(barcodeName == ""){ + comboGroupName = primerNameVector[itPrimer->second]; + } + else{ + comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second]; + } + } + + ofstream temp; + string thisFilename = outputDir + m->getRootName(m->getSimpleName(currentFileName)) + comboGroupName + "." 
+ getOutputFileNameTag("sff"); + if (uniqueNames.count(thisFilename) == 0) { + outputNames.push_back(thisFilename); + outputTypes["sff"].push_back(thisFilename); + uniqueNames.insert(thisFilename); + } + + filehandles[itBar->second][itPrimer->second] = thisFilename; + m->openOutputFile(thisFilename, temp); temp.close(); + } + } + } + numFPrimers = primers.size(); + numLinkers = linker.size(); + numSpacers = spacer.size(); + noMatchFile = outputDir + m->getRootName(m->getSimpleName(currentFileName)) + "scrap." + getOutputFileNameTag("sff"); + m->mothurRemove(noMatchFile); + + bool allBlank = true; + for (int i = 0; i < barcodeNameVector.size(); i++) { + if (barcodeNameVector[i] != "") { + allBlank = false; + break; + } + } + for (int i = 0; i < primerNameVector.size(); i++) { + if (primerNameVector[i] != "") { + allBlank = false; + break; + } + } + + filehandlesHeaders.resize(filehandles.size()); + numSplitReads.resize(filehandles.size()); + for (int i = 0; i < filehandles.size(); i++) { + numSplitReads[i].resize(filehandles[i].size(), 0); + for (int j = 0; j < filehandles[i].size(); j++) { + filehandlesHeaders[i].push_back(filehandles[i][j]+"headers"); + } + } + + if (allBlank) { + m->mothurOut("[WARNING]: your oligos file does not contain any group names. 
mothur will not create a split the sff file."); m->mothurOutEndLine(); + split = 1; + return false; + } + + return true; + + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "readOligos"); + exit(1); + } +} +//********************************************************************/ +string SffInfoCommand::reverseOligo(string oligo){ + try { + string reverse = ""; + + for(int i=oligo.length()-1;i>=0;i--){ + + if(oligo[i] == 'A') { reverse += 'T'; } + else if(oligo[i] == 'T'){ reverse += 'A'; } + else if(oligo[i] == 'U'){ reverse += 'A'; } + + else if(oligo[i] == 'G'){ reverse += 'C'; } + else if(oligo[i] == 'C'){ reverse += 'G'; } + + else if(oligo[i] == 'R'){ reverse += 'Y'; } + else if(oligo[i] == 'Y'){ reverse += 'R'; } + + else if(oligo[i] == 'M'){ reverse += 'K'; } + else if(oligo[i] == 'K'){ reverse += 'M'; } + + else if(oligo[i] == 'W'){ reverse += 'W'; } + else if(oligo[i] == 'S'){ reverse += 'S'; } + + else if(oligo[i] == 'B'){ reverse += 'V'; } + else if(oligo[i] == 'V'){ reverse += 'B'; } + + else if(oligo[i] == 'D'){ reverse += 'H'; } + else if(oligo[i] == 'H'){ reverse += 'D'; } + + else { reverse += 'N'; } + } + + + return reverse; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "reverseOligo"); + exit(1); + } +} //********************************************************************************************************************** diff --git a/sffinfocommand.h b/sffinfocommand.h index 4e72a96..4917a27 100644 --- a/sffinfocommand.h +++ b/sffinfocommand.h @@ -78,18 +78,24 @@ public: void help() { m->mothurOut(getHelpString()); } private: - string sffFilename, sfftxtFilename, outputDir, accnosName; - vector filenames, outputNames, accnosFileNames; - bool abort, fasta, qual, trim, flow, sfftxt, hasAccnos; - int mycount; + string sffFilename, sfftxtFilename, outputDir, accnosName, currentFileName, oligosfile, noMatchFile; + vector filenames, outputNames, accnosFileNames, oligosFileNames; + bool abort, fasta, qual, trim, flow, 
sfftxt, hasAccnos, hasOligos; + int mycount, split, numFPrimers, numLinkers, numSpacers, pdiffs, bdiffs, ldiffs, sdiffs, tdiffs; set seqNames; + map barcodes; + map primers; + vector linker, spacer, primerNameVector, barcodeNameVector, revPrimer; + vector > numSplitReads; + vector > filehandles, filehandlesHeaders; //extract sff file functions - int extractSffInfo(string, string); + int extractSffInfo(string, string, string); int readCommonHeader(ifstream&, CommonHeader&); - int readHeader(ifstream&, Header&); - int readSeqData(ifstream&, seqRead&, int, int); + //int readHeader(ifstream&, Header&); + int readSeqData(ifstream&, seqRead&, int, Header&); int decodeName(string&, string&, string&, string); + bool readOligos(string oligosFile); int printCommonHeader(ofstream&, CommonHeader&); int printHeader(ofstream&, Header&); @@ -100,6 +106,9 @@ private: int readAccnosFile(string); int parseSffTxt(); bool sanityCheck(Header&, seqRead&); + int adjustCommonHeader(CommonHeader); + int findGroup(Header header, seqRead read, int& barcode, int& primer); + string reverseOligo(string oligo); //parsesfftxt file functions int parseHeaderLineToInt(ifstream&); diff --git a/sffmultiplecommand.cpp b/sffmultiplecommand.cpp new file mode 100644 index 0000000..05bc9aa --- /dev/null +++ b/sffmultiplecommand.cpp @@ -0,0 +1,836 @@ +// +// sffmultiplecommand.cpp +// Mothur +// +// Created by Sarah Westcott on 8/14/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. 
+// + +#include "sffmultiplecommand.h" + + + +//********************************************************************************************************************** +vector SffMultipleCommand::setParameters(){ + try { + CommandParameter pfile("file", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfile); + + //sffinfo + CommandParameter ptrim("trim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(ptrim); + + //trim.flows + CommandParameter pmaxhomop("maxhomop", "Number", "", "9", "", "", "",false,false); parameters.push_back(pmaxhomop); + CommandParameter pmaxflows("maxflows", "Number", "", "450", "", "", "",false,false); parameters.push_back(pmaxflows); + CommandParameter pminflows("minflows", "Number", "", "450", "", "", "",false,false); parameters.push_back(pminflows); + CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs); + CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs); + CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs); + CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs); + CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs); + CommandParameter psignal("signal", "Number", "", "0.50", "", "", "",false,false); parameters.push_back(psignal); + CommandParameter pnoise("noise", "Number", "", "0.70", "", "", "",false,false); parameters.push_back(pnoise); + CommandParameter porder("order", "String", "", "TACG", "", "", "",false,false); parameters.push_back(porder); + + //shhh.flows + CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plookup); + CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff); + 
CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter); + CommandParameter plarge("large", "Number", "", "-1", "", "", "",false,false); parameters.push_back(plarge); + CommandParameter psigma("sigma", "Number", "", "60", "", "", "",false,false); parameters.push_back(psigma); + CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "",false,false); parameters.push_back(pmindelta); + + //trim.seqs parameters + CommandParameter pallfiles("allfiles", "Boolean", "", "t", "", "", "",false,false); parameters.push_back(pallfiles); + CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip); + CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig); + CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength); + CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength); + CommandParameter pkeepforward("keepforward", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepforward); + CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "",false,false); parameters.push_back(pkeepfirst); + CommandParameter premovelast("removelast", "Number", "", "0", "", "", "",false,false); parameters.push_back(premovelast); + + + CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); + + vector myArray; + for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } + return myArray; + } + catch(exception& e) { + m->errorOut(e, 
"SffMultipleCommand", "setParameters"); + exit(1); + } +} +//********************************************************************************************************************** +string SffMultipleCommand::getHelpString(){ + try { + string helpString = ""; + helpString += "The sff.multiple command reads a file containing sff filenames and optional oligos filenames. It runs the files through sffinfo, trim.flows, shhh.flows and trim.seqs combining the results.\n"; + helpString += "The sff.multiple command parameters are: "; + vector parameters = setParameters(); + for (int i = 0; i < parameters.size()-1; i++) { + helpString += parameters[i] + ", "; + } + helpString += parameters[parameters.size()-1] + ".\n"; + helpString += "The file parameter allows you to enter the a file containing the list of sff files and optional oligos files.\n"; + helpString += "The trim parameter allows you to indicate if you would like a sequences and quality scores generated by sffinfo trimmed to the clipQualLeft and clipQualRight values. Default=True. \n"; + helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n"; + helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n"; + helpString += "The minlength parameter allows you to set and minimum sequence length. \n"; + helpString += "The maxlength parameter allows you to set and maximum sequence length. \n"; + helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n"; + helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n"; + helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n"; + helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. 
The default is 0.\n"; + helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n"; + helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n"; + helpString += "The keepforward parameter allows you to indicate whether you want the forward primer removed or not. The default is F, meaning remove the forward primer.\n"; + helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n"; + helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n"; + + helpString += "Example sff.multiple(file=mySffOligosFile.txt, trim=F).\n"; + helpString += "Note: No spaces between parameter labels (i.e. 
file), '=' and parameters (i.e.mySffOligosFile.txt).\n"; + return helpString; + } + catch(exception& e) { + m->errorOut(e, "SffMultipleCommand", "getHelpString"); + exit(1); + } +} +//********************************************************************************************************************** +string SffMultipleCommand::getOutputFileNameTag(string type, string inputName=""){ + try { + string outputFileName = ""; + map >::iterator it; + + //is this a type this command creates + it = outputTypes.find(type); + if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); } + else { + if (type == "fasta") { outputFileName = "fasta"; } + else if (type == "name") { outputFileName = "names"; } + else if (type == "group") { outputFileName = "groups"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } + } + return outputFileName; + } + catch(exception& e) { + m->errorOut(e, "SffMultipleCommand", "getOutputFileNameTag"); + exit(1); + } +} + + +//********************************************************************************************************************** +SffMultipleCommand::SffMultipleCommand(){ + try { + abort = true; calledHelp = true; + setParameters(); + vector tempOutNames; + outputTypes["fasta"] = tempOutNames; + outputTypes["name"] = tempOutNames; + outputTypes["group"] = tempOutNames; + outputTypes["flow"] = tempOutNames; + outputTypes["qfile"] = tempOutNames; + } + catch(exception& e) { + m->errorOut(e, "SffMultipleCommand", "SffMultipleCommand"); + exit(1); + } +} +//********************************************************************************************************************** + +SffMultipleCommand::SffMultipleCommand(string option) { + try { + abort = false; calledHelp = false; append=false; makeGroup=false; + + //allow user to run help + if(option == "help") { help(); abort = true; calledHelp = true; } + else 
if(option == "citation") { citation(); abort = true; calledHelp = true;} + + else { + //valid paramters for this command + vector myArray = setParameters(); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + map::iterator it; + + //check to make sure all parameters are valid for command + for (it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //initialize outputTypes + vector tempOutNames; + outputTypes["fasta"] = tempOutNames; + outputTypes["flow"] = tempOutNames; + outputTypes["qfile"] = tempOutNames; + outputTypes["name"] = tempOutNames; + outputTypes["group"] = tempOutNames; + + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it = parameters.find("file"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["file"] = inputDir + it->second; } + } + + it = parameters.find("lookup"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { parameters["lookup"] = inputDir + it->second; } + } + } + + filename = validParameter.validFile(parameters, "file", true); + if (filename == "not open") { filename = ""; abort = true; } + else if (filename == "not found") { filename = ""; } + + string temp; + temp = validParameter.validFile(parameters, "trim", false); if (temp == "not found"){ temp = "T"; } + trim = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "minflows", false); if (temp == "not found") { temp = "450"; } + m->mothurConvert(temp, minFlows); + + temp = validParameter.validFile(parameters, "maxflows", false); if (temp == "not found") { temp = "450"; } + m->mothurConvert(temp, maxFlows); + + temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found"){ temp = "9"; } + m->mothurConvert(temp, maxHomoP); + + temp = validParameter.validFile(parameters, "signal", false); if (temp == "not found"){ temp = "0.50"; } + m->mothurConvert(temp, signal); + + temp = validParameter.validFile(parameters, "noise", false); if (temp == "not found"){ temp = "0.70"; } + m->mothurConvert(temp, noise); + + temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found"){ temp = "0"; } + m->mothurConvert(temp, bdiffs); + + temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found"){ temp = "0"; } + m->mothurConvert(temp, pdiffs); + + temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, ldiffs); + + temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, sdiffs); + + temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); } + m->mothurConvert(temp, tdiffs); + + if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; } + + + temp = 
validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } + m->setProcessors(temp); + m->mothurConvert(temp, processors); + + flowOrder = validParameter.validFile(parameters, "order", false); + if (flowOrder == "not found"){ flowOrder = "TACG"; } + else if(flowOrder.length() != 4){ + m->mothurOut("The value of the order option must be four bases long\n"); + } + + temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found"){ temp = "0.01"; } + m->mothurConvert(temp, cutoff); + + temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){ temp = "0.000001"; } + minDelta = temp; + + temp = validParameter.validFile(parameters, "maxiter", false); if (temp == "not found"){ temp = "1000"; } + m->mothurConvert(temp, maxIters); + + temp = validParameter.validFile(parameters, "large", false); if (temp == "not found"){ temp = "0"; } + m->mothurConvert(temp, largeSize); + if (largeSize != 0) { large = true; } + else { large = false; } + if (largeSize < 0) { m->mothurOut("The value of the large cannot be negative.\n"); } + + temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found") { temp = "60"; } + m->mothurConvert(temp, sigma); + + temp = validParameter.validFile(parameters, "flip", false); + if (temp == "not found") { flip = 0; } + else { flip = m->isTrue(temp); } + + temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; } + m->mothurConvert(temp, maxAmbig); + + temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, minLength); + + temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, maxLength); + + temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; } + convert(temp, keepFirst); + + 
temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; } + convert(temp, removeLast); + + temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; } + allFiles = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "keepforward", false); if (temp == "not found") { temp = "F"; } + keepforward = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "lookup", true); + if (temp == "not found") { + lookupFileName = "LookUp_Titanium.pat"; + + int ableToOpen; + ifstream in; + ableToOpen = m->openInputFile(lookupFileName, in, "noerror"); + in.close(); + + //if you can't open it, try input location + if (ableToOpen == 1) { + if (inputDir != "") { //default path is set + string tryPath = inputDir + lookupFileName; + m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + lookupFileName = tryPath; + } + } + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName); + m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + lookupFileName = tryPath; + } + } + + //if you can't open it its not in current working directory or inputDir, try mothur excutable location + if (ableToOpen == 1) { + string exepath = m->argv; + string tempPath = exepath; + for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); } + exepath = exepath.substr(0, (tempPath.find_last_of('m'))); + + string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName); + m->mothurOut("Unable to open " + lookupFileName + ". 
Trying mothur's executable location " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + lookupFileName = tryPath; + } + + if (ableToOpen == 1) { m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true; } + } + else if(temp == "not open") { + + lookupFileName = validParameter.validFile(parameters, "lookup", false); + + //if you can't open it its not inputDir, try mothur excutable location + string exepath = m->argv; + string tempPath = exepath; + for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); } + exepath = exepath.substr(0, (tempPath.find_last_of('m'))); + + string tryPath = m->getFullPathName(exepath) + lookupFileName; + m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine(); + ifstream in2; + int ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + lookupFileName = tryPath; + + if (ableToOpen == 1) { m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true; } + }else { lookupFileName = temp; } + } + } + catch(exception& e) { + m->errorOut(e, "SffMultipleCommand", "SffMultipleCommand"); + exit(1); + } +} +//********************************************************************************************************************** +int SffMultipleCommand::execute(){ + try { + if (abort == true) { if (calledHelp) { return 0; } return 2; } + + vector sffFiles, oligosFiles; + readFile(sffFiles, oligosFiles); + + outputDir = m->hasPath(filename); + string fileroot = outputDir + m->getRootName(m->getSimpleName(filename)); + string fasta = fileroot + getOutputFileNameTag("fasta"); + string name = fileroot + getOutputFileNameTag("name"); + string group = fileroot + getOutputFileNameTag("group"); + + if (m->control_pressed) { return 0; } + + if (sffFiles.size() < processors) { processors = sffFiles.size(); } + +#if 
defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) +#else + //trim.flows, shhh.flows cannot handle multiple processors for windows. + processors = 1; m->mothurOut("This command can only use 1 processor on Windows platforms, using 1 processors.\n\n"); +#endif + if (processors == 1) { driver(sffFiles, oligosFiles, 0, sffFiles.size(), fasta, name, group); } + else { createProcesses(sffFiles, oligosFiles, fasta, name, group); } + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + + if (append) { + outputNames.push_back(fasta); outputTypes["fasta"].push_back(fasta); + m->setFastaFile(fasta); + outputNames.push_back(name); outputTypes["name"].push_back(name); + m->setNameFile(name); + if (makeGroup) { outputNames.push_back(group); outputTypes["group"].push_back(group); m->setGroupFile(group); } + } + + //report output filenames + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffMultipleCommand", "execute"); + exit(1); + } +} +//********************************************************************************************************************** +int SffMultipleCommand::readFile(vector& sffFiles, vector& oligosFiles){ + try { + + ifstream in; + m->openInputFile(filename, in); + bool allBlank = true; + bool allFull = true; + + string oligos, sff; + while (!in.eof()) { + + if (m->control_pressed) { break; } + + in >> sff; + + sff = m->getFullPathName(sff); + + //ignore file pairing + if(sff[0] == '#'){ while (!in.eof()) { char c = in.get(); if (c == 10 || c == 13){ break; } } m->gobble(in); } + else { //check for oligos file + oligos = ""; + + // get rest of line in case there is a oligos filename + while (!in.eof()) { + char c = 
in.get(); + if (c == 10 || c == 13){ break; } + else if (c == 32 || c == 9){;} //space or tab + else { oligos += c; } + } + sffFiles.push_back(sff); + if (oligos != "") { oligos = m->getFullPathName(oligos); allBlank = false; } + if (oligos == "") { allFull = false; } + oligosFiles.push_back(oligos); //will push a blank if there is not an oligos for this sff file + } + m->gobble(in); + } + in.close(); + + if (allBlank || allFull) { append = true; } + if (allFull) { makeGroup = true; } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffMultipleCommand", "readFile"); + exit(1); + } +} +//********************************************************************************************************************** +//runs sffinfo, summary.seqs, trim.flows, shhh.flows, trim.seqs, summary.seqs for each sff file. +int SffMultipleCommand::driver(vector sffFiles, vector oligosFiles, int start, int end, string fasta, string name, string group){ + try { + m->mothurRemove(fasta); m->mothurRemove(name); m->mothurRemove(group); + int count = 0; + for (int s = start; s < end; s++) { + + string sff = sffFiles[s]; + string oligos = oligosFiles[s]; + + m->mothurOut("\n>>>>>\tProcessing " + sff + " (file " + toString(s+1) + " of " + toString(sffFiles.size()) + ")\t<<<<<\n"); + + //run sff.info + string inputString = "sff=" + sff + ", flow=T"; + if (trim) { inputString += ", trim=T"; } + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + m->mothurOut("Running command: sffinfo(" + inputString + ")"); m->mothurOutEndLine(); + m->mothurCalling = true; + + Command* sffCommand = new SffInfoCommand(inputString); + sffCommand->execute(); + + if (m->control_pressed){ break; } + + map > filenames = sffCommand->getOutputFiles(); + + delete sffCommand; + m->mothurCalling = false; + m->mothurOutEndLine(); + + //run summary.seqs on the fasta file + string fastaFile = ""; + map >::iterator it = filenames.find("fasta"); + if (it != filenames.end()) { if 
((it->second).size() != 0) { fastaFile = (it->second)[0]; } } + else { m->mothurOut("[ERROR]: sffinfo did not create a fasta file, quitting.\n"); m->control_pressed = true; break; } + + inputString = "fasta=" + fastaFile + ", processors=1"; + m->mothurOutEndLine(); + m->mothurOut("Running command: summary.seqs(" + inputString + ")"); m->mothurOutEndLine(); + m->mothurCalling = true; + + Command* summarySeqsCommand = new SeqSummaryCommand(inputString); + summarySeqsCommand->execute(); + + if (m->control_pressed){ break; } + + map > temp = summarySeqsCommand->getOutputFiles(); + mergeOutputFileList(filenames, temp); + + delete summarySeqsCommand; + m->mothurCalling = false; + + m->mothurOutEndLine(); + + //run trim.flows on the fasta file + string flowFile = ""; + it = filenames.find("flow"); + if (it != filenames.end()) { if ((it->second).size() != 0) { flowFile = (it->second)[0]; } } + else { m->mothurOut("[ERROR]: sffinfo did not create a flow file, quitting.\n"); m->control_pressed = true; break; } + + inputString = "flow=" + flowFile; + if (oligos != "") { inputString += ", oligos=" + oligos; } + inputString += ", maxhomop=" + toString(maxHomoP) + ", maxflows=" + toString(maxFlows) + ", minflows=" + toString(minFlows); + inputString += ", pdiffs=" + toString(pdiffs) + ", bdiffs=" + toString(bdiffs) + ", ldiffs=" + toString(ldiffs) + ", sdiffs=" + toString(sdiffs); + inputString += ", tdiffs=" + toString(tdiffs) + ", signal=" + toString(signal) + ", noise=" + toString(noise) + ", order=" + flowOrder + ", processors=1"; + + m->mothurOutEndLine(); + m->mothurOut("Running command: trim.flows(" + inputString + ")"); m->mothurOutEndLine(); + m->mothurCalling = true; + + Command* trimFlowCommand = new TrimFlowsCommand(inputString); + trimFlowCommand->execute(); + + if (m->control_pressed){ break; } + + temp = trimFlowCommand->getOutputFiles(); + mergeOutputFileList(filenames, temp); + + delete trimFlowCommand; + m->mothurCalling = false; + + + string fileFileName = ""; 
+ flowFile = ""; + if (oligos != "") { + it = temp.find("file"); + if (it != temp.end()) { if ((it->second).size() != 0) { fileFileName = (it->second)[0]; } } + else { m->mothurOut("[ERROR]: trim.flows did not create a file file, quitting.\n"); m->control_pressed = true; break; } + }else { + vector flowFiles; + it = temp.find("flow"); + if (it != temp.end()) { if ((it->second).size() != 0) { flowFiles = (it->second); } } + else { m->mothurOut("[ERROR]: trim.flows did not create a flow file, quitting.\n"); m->control_pressed = true; break; } + + for (int i = 0; i < flowFiles.size(); i++) { + string end = flowFiles[i].substr(flowFiles[i].length()-9); + if (end == "trim.flow") { + flowFile = flowFiles[i]; i+=flowFiles.size(); //if we found the trim.flow file stop looking + } + } + } + + if ((fileFileName == "") && (flowFile == "")) { m->mothurOut("[ERROR]: trim.flows did not create a file file or a trim.flow file, quitting.\n"); m->control_pressed = true; break; } + + if (fileFileName != "") { inputString = "file=" + fileFileName; } + else { inputString = "flow=" + flowFile; } + + inputString += ", lookup=" + lookupFileName + ", cutoff=" + toString(cutoff); + ", maxiters=" + toString(maxIters); + if (large) { inputString += ", large=" + toString(largeSize); } + inputString += ", sigma=" +toString(sigma); + inputString += ", mindelta=" + toString(minDelta); + inputString += ", order=" + flowOrder + ", processors=1"; + + //run shhh.flows + m->mothurOutEndLine(); + m->mothurOut("Running command: shhh.flows(" + inputString + ")"); m->mothurOutEndLine(); + m->mothurCalling = true; + + Command* shhhFlowCommand = new ShhherCommand(inputString); + shhhFlowCommand->execute(); + + if (m->control_pressed){ break; } + + temp = shhhFlowCommand->getOutputFiles(); + mergeOutputFileList(filenames, temp); + + delete shhhFlowCommand; + m->mothurCalling = false; + + vector fastaFiles; + vector nameFiles; + it = temp.find("fasta"); + if (it != temp.end()) { if ((it->second).size() != 0) 
{ fastaFiles = (it->second); } } + else { m->mothurOut("[ERROR]: shhh.flows did not create a fasta file, quitting.\n"); m->control_pressed = true; break; } + + it = temp.find("name"); + if (it != temp.end()) { if ((it->second).size() != 0) { nameFiles = (it->second); } } + else { m->mothurOut("[ERROR]: shhh.flows did not create a name file, quitting.\n"); m->control_pressed = true; break; } + + //find fasta and name files with the shortest name. This is because if there is a composite name it will be the shortest. + fastaFile = fastaFiles[0]; + for (int i = 1; i < fastaFiles.size(); i++) { if (fastaFiles[i].length() < fastaFile.length()) { fastaFile = fastaFiles[i]; } } + string nameFile = nameFiles[0]; + for (int i = 1; i < nameFiles.size(); i++) { if (nameFiles[i].length() < nameFile.length()) { nameFile = nameFiles[i]; } } + + inputString = "fasta=" + fastaFile + ", name=" + nameFile; + if (oligos != "") { inputString += ", oligos=" + oligos; } + if (allFiles) { inputString += ", allfiles=t"; } + else { inputString += ", allfiles=f"; } + if (flip) { inputString += ", flip=t"; } + else { inputString += ", flip=f"; } + if (keepforward) { inputString += ", keepforward=t"; } + else { inputString += ", keepforward=f"; } + + + inputString += ", pdiffs=" + toString(pdiffs) + ", bdiffs=" + toString(bdiffs) + ", ldiffs=" + toString(ldiffs) + ", sdiffs=" + toString(sdiffs); + inputString += ", tdiffs=" + toString(tdiffs) + ", maxambig=" + toString(maxAmbig) + ", minlength=" + toString(minLength) + ", maxlength=" + toString(maxLength); + if (keepFirst != 0) { inputString += ", keepfirst=" + toString(keepFirst); } + if (removeLast != 0) { inputString += ", removelast=" + toString(removeLast); } + inputString += ", processors=1"; + + //run trim.seqs + m->mothurOutEndLine(); + m->mothurOut("Running command: trim.seqs(" + inputString + ")"); m->mothurOutEndLine(); + m->mothurCalling = true; + + Command* trimseqsCommand = new TrimSeqsCommand(inputString); + 
trimseqsCommand->execute(); + + if (m->control_pressed){ break; } + + temp = trimseqsCommand->getOutputFiles(); + mergeOutputFileList(filenames, temp); + + delete trimseqsCommand; + m->mothurCalling = false; + + it = temp.find("fasta"); + if (it != temp.end()) { if ((it->second).size() != 0) { fastaFiles = (it->second); } } + else { m->mothurOut("[ERROR]: trim.seqs did not create a fasta file, quitting.\n"); m->control_pressed = true; break; } + + for (int i = 0; i < fastaFiles.size(); i++) { + string end = fastaFiles[i].substr(fastaFiles[i].length()-10); + if (end == "trim.fasta") { + fastaFile = fastaFiles[i]; i+=fastaFiles.size(); //if we found the trim.fasta file stop looking + } + } + + it = temp.find("name"); + if (it != temp.end()) { if ((it->second).size() != 0) { nameFiles = (it->second); } } + else { m->mothurOut("[ERROR]: trim.seqs did not create a name file, quitting.\n"); m->control_pressed = true; break; } + + for (int i = 0; i < nameFiles.size(); i++) { + string end = nameFiles[i].substr(nameFiles[i].length()-10); + if (end == "trim.names") { + nameFile = nameFiles[i]; i+=nameFiles.size(); //if we found the trim.names file stop looking + } + } + + vector groupFiles; + string groupFile = ""; + if (makeGroup) { + it = temp.find("group"); + if (it != temp.end()) { if ((it->second).size() != 0) { groupFiles = (it->second); } } + + //find group file with the shortest name. This is because if there is a composite group file it will be the shortest. 
+ groupFile = groupFiles[0]; + for (int i = 1; i < groupFiles.size(); i++) { if (groupFiles[i].length() < groupFile.length()) { groupFile = groupFiles[i]; } } + } + + inputString = "fasta=" + fastaFile + ", processors=1, name=" + nameFile; + m->mothurOutEndLine(); + m->mothurOut("Running command: summary.seqs(" + inputString + ")"); m->mothurOutEndLine(); + m->mothurCalling = true; + + summarySeqsCommand = new SeqSummaryCommand(inputString); + summarySeqsCommand->execute(); + + if (m->control_pressed){ break; } + + temp = summarySeqsCommand->getOutputFiles(); + mergeOutputFileList(filenames, temp); + + delete summarySeqsCommand; + m->mothurCalling = false; + + m->mothurOutEndLine(); + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + + if (append) { + m->appendFiles(fastaFile, fasta); + m->appendFiles(nameFile, name); + if (makeGroup) { m->appendFiles(groupFile, group); } + } + count++; + + for (it = filenames.begin(); it != filenames.end(); it++) { + for (int i = 0; i < (it->second).size(); i++) { + outputNames.push_back((it->second)[i]); outputTypes[it->first].push_back((it->second)[i]); + } + } + } + + return count; + } + catch(exception& e) { + m->errorOut(e, "SffMultipleCommand", "driver"); + exit(1); + } +} +//********************************************************************************************************************** +int SffMultipleCommand::mergeOutputFileList(map >& files, map >& temp){ + try { + map >::iterator it; + for (it = temp.begin(); it != temp.end(); it++) { + map >::iterator it2 = files.find(it->first); + if (it2 == files.end()) { //we do not already have this type so just add it + files[it->first] = it->second; + }else { //merge them + for (int i = 0; i < (it->second).size(); i++) { + files[it->first].push_back((it->second)[i]); + } + } + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffMultipleCommand", "mergeOutputFileList"); + exit(1); + } +} 
+//********************************************************************************************************************** +int SffMultipleCommand::createProcesses(vector sffFiles, vector oligosFiles, string fasta, string name, string group){ + try { + vector processIDS; + int process = 1; + int num = 0; + + //divide the groups between the processors + vector lines; + vector numFilesToComplete; + int numFilesPerProcessor = sffFiles.size() / processors; + for (int i = 0; i < processors; i++) { + int startIndex = i * numFilesPerProcessor; + int endIndex = (i+1) * numFilesPerProcessor; + if(i == (processors - 1)){ endIndex = sffFiles.size(); } + lines.push_back(linePair(startIndex, endIndex)); + numFilesToComplete.push_back((endIndex-startIndex)); + } + +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later + process++; + }else if (pid == 0){ + num = driver(sffFiles, oligosFiles, lines[process].start, lines[process].end, fasta + toString(getpid()) + ".temp", name + toString(getpid()) + ".temp", group + toString(getpid()) + ".temp"); + + //pass numSeqs to parent + ofstream out; + string tempFile = toString(getpid()) + ".num.temp"; + m->openOutputFile(tempFile, out); + out << num << '\t' << outputNames.size() << endl; + for (int i = 0; i < outputNames.size(); i++) { out << outputNames[i] << endl; } + out.close(); + + exit(0); + }else { + m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); + for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); } + exit(0); + } + } + + //do my part + num = driver(sffFiles, oligosFiles, lines[0].start, lines[0].end, fasta, name, group); + + //force parent to wait until all the processes are 
done + for (int i=0;iopenInputFile(tempFile, in); + if (!in.eof()) { + int tempNum = 0; int outputNamesSize = 0; + in >> tempNum >> outputNamesSize; m->gobble(in); + for (int j = 0; j < outputNamesSize; j++) { + string tempName; + in >> tempName; m->gobble(in); + outputNames.push_back(tempName); + } + if (tempNum != numFilesToComplete[i+1]) { + m->mothurOut("[ERROR]: main process expected " + toString(processIDS[i]) + " to complete " + toString(numFilesToComplete[i+1]) + " files, and it only reported completing " + toString(tempNum) + ". This will cause file mismatches. The flow files may be too large to process with multiple processors. \n"); + } + } + in.close(); m->mothurRemove(tempFile); + + if (append) { + m->appendFiles(fasta+toString(processIDS[i])+".temp", fasta); m->mothurRemove(fasta+toString(processIDS[i])+".temp"); + m->appendFiles(name+toString(processIDS[i])+".temp", name); m->mothurRemove(name+toString(processIDS[i])+".temp"); + if (makeGroup) { m->appendFiles(group+toString(processIDS[i])+".temp", group); m->mothurRemove(group+toString(processIDS[i])+".temp"); } + } + } +#endif + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ShhherCommand", "createProcesses"); + exit(1); + } +} +//********************************************************************************************************************** + + + + diff --git a/sffmultiplecommand.h b/sffmultiplecommand.h new file mode 100644 index 0000000..4ab2c97 --- /dev/null +++ b/sffmultiplecommand.h @@ -0,0 +1,62 @@ +#ifndef Mothur_sffmultiplecommand_h +#define Mothur_sffmultiplecommand_h + +// +// sffmultiplecommand.h +// Mothur +// +// Created by Sarah Westcott on 8/14/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. 
+// + +#include "command.hpp" +#include "sffinfocommand.h" +#include "seqsummarycommand.h" +#include "trimflowscommand.h" +#include "shhhercommand.h" +#include "trimseqscommand.h" + +class SffMultipleCommand : public Command { + +public: + SffMultipleCommand(string); + SffMultipleCommand(); + ~SffMultipleCommand(){} + + vector setParameters(); + string getCommandName() { return "sff.multiple"; } + string getCommandCategory() { return "Sequence Processing"; } + string getOutputFileNameTag(string, string); + string getHelpString(); + string getCitation() { return "http://www.mothur.org/wiki/Sff.multiple"; } + string getDescription() { return "run multiple sff files through, sffinfo, trim.flow, shhh.flows and trim.seqs combining the results"; } + + int execute(); + void help() { m->mothurOut(getHelpString()); } + +private: + + struct linePair { + int start; + int end; + linePair(int i, int j) : start(i), end(j) {} + }; + + string filename, outputDir, flowOrder, lookupFileName, minDelta; + vector outputNames; + bool abort, trim, large, flip, allFiles, keepforward, append, makeGroup; + int maxFlows, minFlows, minLength, maxLength, maxHomoP, tdiffs, bdiffs, pdiffs, sdiffs, ldiffs; + int processors, maxIters, largeSize; + float signal, noise, cutoff, sigma; + int keepFirst, removeLast, maxAmbig; + + int readFile(vector& sffFiles, vector& oligosFiles); + int createProcesses(vector sffFiles, vector oligosFiles, string, string, string); + int driver(vector sffFiles, vector oligosFiles, int start, int end, string, string, string); + int mergeOutputFileList(map >& files, map >& temp); + + + +}; + +#endif diff --git a/sharedcommand.cpp b/sharedcommand.cpp index 1150e53..542f8d3 100644 --- a/sharedcommand.cpp +++ b/sharedcommand.cpp @@ -188,7 +188,11 @@ SharedCommand::SharedCommand(string option) { countfile = validParameter.validFile(parameters, "count", true); if (countfile == "not open") { countfile = ""; abort = true; } else if (countfile == "not found") { countfile = ""; } 
- else { m->setCountTableFile(countfile); } + else { + m->setCountTableFile(countfile); + CountTable temp; + if (!temp.testGroups(countfile)) { m->mothurOut("[ERROR]: Your count file does not have group info, aborting."); m->mothurOutEndLine(); abort=true; } + } if ((biomfile == "") && (listfile == "")) { //is there are current file available for either of these? @@ -825,7 +829,7 @@ int SharedCommand::createSharedFromListGroup(string filename) { int error = ListGroupSameSeqs(namesSeqs, SharedList); if ((!pickedGroups) && (SharedList->getNumSeqs() != numGroupNames)) { //if the user has not specified any groups and their files don't match exit with error - m->mothurOut("Your group file contains " + toString(numGroupNames) + " sequences and list file contains " + toString(SharedList->getNumSeqs()) + " sequences. Please correct."); m->mothurOutEndLine(); + m->mothurOut("Your group file contains " + toString(numGroupNames) + " sequences and list file contains " + toString(SharedList->getNumSeqs()) + " sequences. Please correct."); m->mothurOutEndLine(); m->control_pressed = true; out.close(); m->mothurRemove(filename); //remove blank shared file you made @@ -1083,8 +1087,12 @@ int SharedCommand::ListGroupSameSeqs(vector& groupMapsSeqs, SharedListVe for (int j = 0; j < listNames.size(); j++) { int num = groupNamesSeqs.count(listNames[j]); - if (num == 0) { error = 1; m->mothurOut("[ERROR]: " + listNames[j] + " is in your listfile and not in your groupfile. Please correct."); m->mothurOutEndLine(); } - else { groupNamesSeqs.erase(listNames[j]); } + if (num == 0) { + error = 1; + if (groupfile != "") { + m->mothurOut("[ERROR]: " + listNames[j] + " is in your listfile and not in your groupfile. Please correct."); m->mothurOutEndLine(); } + else{ m->mothurOut("[ERROR]: " + listNames[j] + " is in your listfile and not in your count file. 
Please correct."); m->mothurOutEndLine(); } + }else { groupNamesSeqs.erase(listNames[j]); } } } diff --git a/sharedrabundvector.h b/sharedrabundvector.h index 792543e..419d15a 100644 --- a/sharedrabundvector.h +++ b/sharedrabundvector.h @@ -24,7 +24,6 @@ An individual which knows the OTU from which it came, the group it is in and its abundance. */ -//class GlobalData; class SharedRAbundVector : public DataVector { diff --git a/sharedutilities.cpp b/sharedutilities.cpp index 151b254..71d7782 100644 --- a/sharedutilities.cpp +++ b/sharedutilities.cpp @@ -120,7 +120,7 @@ void SharedUtil::setGroups(vector& userGroups, vector& allGroups //if the user only entered invalid groups if (userGroups.size() == 0) { - m->mothurOut("You provided no valid groups. I will run the command using all the groups in your groupfile."); m->mothurOutEndLine(); + m->mothurOut("You provided no valid groups. I will run the command using all the groups in your file."); m->mothurOutEndLine(); for (int i = 0; i < allGroups.size(); i++) { userGroups.push_back(allGroups[i]); } diff --git a/shhhercommand.cpp b/shhhercommand.cpp index c34f25d..19ffc89 100644 --- a/shhhercommand.cpp +++ b/shhhercommand.cpp @@ -776,8 +776,8 @@ int ShhherCommand::execute(){ if(compositeFASTAFileName != ""){ - outputNames.push_back(compositeFASTAFileName); - outputNames.push_back(compositeNamesFileName); + outputNames.push_back(compositeFASTAFileName); outputTypes["fasta"].push_back(compositeFASTAFileName); + outputNames.push_back(compositeNamesFileName); outputTypes["name"].push_back(compositeNamesFileName); } m->mothurOutEndLine(); @@ -1039,7 +1039,12 @@ void ShhherCommand::getFlowData(){ float intensity; - flowFile >> numFlowCells; + string numFlowTest; + flowFile >> numFlowTest; + + if (!m->isContainingOnlyDigits(numFlowTest)) { m->mothurOut("[ERROR]: expected a number and got " + numFlowTest + ", quitting. 
Did you use the flow parameter instead of the file parameter?"); m->mothurOutEndLine(); exit(1); } + else { convert(numFlowTest, numFlowCells); } + int index = 0;//pcluster while(!flowFile.eof()){ @@ -1376,17 +1381,17 @@ string ShhherCommand::cluster(string distFileName, string namesFileName){ try { ReadMatrix* read = new ReadColumnMatrix(distFileName); - read->setCutoff(cutoff); - - NameAssignment* clusterNameMap = new NameAssignment(namesFileName); - clusterNameMap->readMap(); - read->read(clusterNameMap); - - ListVector* list = read->getListVector(); - SparseMatrix* matrix = read->getMatrix(); + read->setCutoff(cutoff); + + NameAssignment* clusterNameMap = new NameAssignment(namesFileName); + clusterNameMap->readMap(); + read->read(clusterNameMap); - delete read; - delete clusterNameMap; + ListVector* list = read->getListVector(); + SparseDistanceMatrix* matrix = read->getDMatrix(); + + delete read; + delete clusterNameMap; RAbundVector* rabund = new RAbundVector(list->getRAbundVector()); @@ -1738,7 +1743,7 @@ void ShhherCommand::writeQualities(vector otuCounts){ } } qualityFile.close(); - outputNames.push_back(qualityFileName); + outputNames.push_back(qualityFileName); outputTypes["qfile"].push_back(qualityFileName); } catch(exception& e) { @@ -1783,7 +1788,7 @@ void ShhherCommand::writeSequences(vector otuCounts){ } fastaFile.close(); - outputNames.push_back(fastaFileName); + outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName); if(compositeFASTAFileName != ""){ m->appendFiles(fastaFileName, compositeFASTAFileName); @@ -1820,7 +1825,7 @@ void ShhherCommand::writeNames(vector otuCounts){ } } nameFile.close(); - outputNames.push_back(nameFileName); + outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName); if(compositeNamesFileName != ""){ @@ -1852,7 +1857,7 @@ void ShhherCommand::writeGroups(){ groupFile << seqNameVector[i] << '\t' << fileGroup << endl; } groupFile.close(); - 
outputNames.push_back(groupFileName); + outputNames.push_back(groupFileName); outputTypes["group"].push_back(groupFileName); } catch(exception& e) { @@ -1912,7 +1917,7 @@ void ShhherCommand::writeClusters(vector otuCounts){ } } otuCountsFile.close(); - outputNames.push_back(otuCountsFileName); + outputNames.push_back(otuCountsFileName); outputTypes["counts"].push_back(otuCountsFileName); } catch(exception& e) { @@ -1926,7 +1931,7 @@ void ShhherCommand::writeClusters(vector otuCounts){ int ShhherCommand::execute(){ try { - if (abort == true) { return 0; } + if (abort == true) { if (calledHelp) { return 0; } return 2; } getSingleLookUp(); if (m->control_pressed) { return 0; } getJointLookUp(); if (m->control_pressed) { return 0; } @@ -1943,8 +1948,8 @@ int ShhherCommand::execute(){ #endif if(compositeFASTAFileName != ""){ - outputNames.push_back(compositeFASTAFileName); - outputNames.push_back(compositeNamesFileName); + outputNames.push_back(compositeFASTAFileName); outputTypes["fasta"].push_back(compositeFASTAFileName); + outputNames.push_back(compositeNamesFileName); outputTypes["name"].push_back(compositeNamesFileName); } m->mothurOutEndLine(); @@ -2029,7 +2034,7 @@ int ShhherCommand::createProcesses(vector filenames){ //Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct. 
//Above fork() will clone, so memory is separate, but that's not the case with windows, ////////////////////////////////////////////////////////////////////////////////////////////////////// - + /* vector pDataArray; DWORD dwThreadIdArray[processors-1]; HANDLE hThreadArray[processors-1]; @@ -2060,7 +2065,7 @@ int ShhherCommand::createProcesses(vector filenames){ CloseHandle(hThreadArray[i]); delete pDataArray[i]; } - + */ #endif for (int i=0;i& thisSeqNameVecto thisFlowDataIntI.clear(); thisNameMap.clear(); - flowFile >> numFlowCells; + string numFlowTest; + flowFile >> numFlowTest; + + if (!m->isContainingOnlyDigits(numFlowTest)) { m->mothurOut("[ERROR]: expected a number and got " + numFlowTest + ", quitting. Did you use the flow parameter instead of the file parameter?"); m->mothurOutEndLine(); exit(1); } + else { convert(numFlowTest, numFlowCells); } + if (m->debug) { m->mothurOut("[DEBUG]: numFlowCells = " + toString(numFlowCells) + ".\n"); } int index = 0;//pcluster while(!flowFile.eof()){ @@ -3256,7 +3266,7 @@ void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string quality } } qualityFile.close(); - outputNames.push_back(qualityFileName); + outputNames.push_back(qualityFileName); outputTypes["qfile"].push_back(qualityFileName); } catch(exception& e) { @@ -3300,7 +3310,7 @@ void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTU } fastaFile.close(); - outputNames.push_back(fastaFileName); + outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName); if(thisCompositeFASTAFileName != ""){ m->appendFiles(fastaFileName, thisCompositeFASTAFileName); @@ -3335,7 +3345,7 @@ void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, s } } nameFile.close(); - outputNames.push_back(nameFileName); + outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName); if(thisCompositeNamesFileName != ""){ @@ -3360,7 +3370,7 @@ void ShhherCommand::writeGroups(string 
groupFileName, string fileRoot, int numSe groupFile << seqNameVector[i] << '\t' << fileRoot << endl; } groupFile.close(); - outputNames.push_back(groupFileName); + outputNames.push_back(groupFileName); outputTypes["group"].push_back(groupFileName); } catch(exception& e) { @@ -3419,7 +3429,7 @@ void ShhherCommand::writeClusters(string otuCountsFileName, int numOTUs, int num } } otuCountsFile.close(); - outputNames.push_back(otuCountsFileName); + outputNames.push_back(otuCountsFileName); outputTypes["counts"].push_back(otuCountsFileName); } catch(exception& e) { diff --git a/shhhercommand.h b/shhhercommand.h index 8446444..ef52dcd 100644 --- a/shhhercommand.h +++ b/shhhercommand.h @@ -18,7 +18,6 @@ #include "sabundvector.hpp" #include "listvector.hpp" #include "cluster.hpp" -#include "sparsematrix.hpp" #include //********************************************************************************************************************** @@ -167,7 +166,7 @@ private: }; -/**************************************************************************************************/ +/************************************************************************************************** //custom data structure for threads to use. // This is passed by void pointer so it can be any data type // that can be passed using a single void pointer (LPVOID). 
@@ -203,7 +202,7 @@ struct shhhFlowsData { } }; -/**************************************************************************************************/ +/************************************************************************************************** #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) #else static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){ @@ -234,7 +233,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){ int numFlowCells; //int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells); - /*****************************************************************************************************/ + /***************************************************************************************************** ifstream flowFile; // cout << "herethread " << flowFileName << '\t' << &flowFile << endl; @@ -279,13 +278,13 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){ } } // cout << "here" << endl; - /*****************************************************************************************************/ + /***************************************************************************************************** if (pDataArray->m->control_pressed) { return 0; } pDataArray->m->mothurOut("Identifying unique flowgrams...\n"); //int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI); - /*****************************************************************************************************/ + /***************************************************************************************************** int numUniques = 0; uniqueFlowgrams.assign(numFlowCells * numSeqs, -1); uniqueCount.assign(numSeqs, 0); // anWeights @@ -364,7 +363,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){ } } - 
/*****************************************************************************************************/ + /***************************************************************************************************** if (pDataArray->m->control_pressed) { return 0; } @@ -374,7 +373,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){ double begClock = clock(); //flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI); - /*****************************************************************************************************/ + /***************************************************************************************************** ostringstream outStream; outStream.setf(ios::fixed, ios::floatfield); outStream.setf(ios::dec, ios::basefield); @@ -390,7 +389,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){ for(int j=0;jm->mothurOut("\t" + toString((clock()-thisbegClock)/CLOCKS_PER_SEC)); pDataArray->m->mothurOutEndLine(); } - /*****************************************************************************************************/ + /***************************************************************************************************** pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n'); string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names"; //createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq); - /*****************************************************************************************************/ + /***************************************************************************************************** vector duplicateNames(numUniques, ""); for(int i=0;im->control_pressed) { return 0; } pDataArray->m->mothurOut("\nClustering flowgrams...\n"); string listFileName = 
flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list"; //cluster(listFileName, distFileName, namesFileName); - /*****************************************************************************************************/ + /***************************************************************************************************** ReadMatrix* read = new ReadColumnMatrix(distFileName); read->setCutoff(pDataArray->cutoff); @@ -502,7 +501,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){ listFileOut.close(); delete matrix; delete cluster; delete rabund; delete list; - /*****************************************************************************************************/ + /***************************************************************************************************** if (pDataArray->m->control_pressed) { return 0; } @@ -516,7 +515,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){ //int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap); - /*****************************************************************************************************/ + /***************************************************************************************************** ifstream listFile; pDataArray->m->openInputFile(listFileName, listFile); string label; @@ -596,7 +595,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){ seqIndex = seqNumber; listFile.close(); - /*****************************************************************************************************/ + /***************************************************************************************************** if (pDataArray->m->control_pressed) { return 0; } @@ -643,7 +642,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){ double cycClock = clock(); unsigned long long cycTime = time(NULL); //fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI); - 
/*****************************************************************************************************/ + /***************************************************************************************************** int indexFill = 0; for(int i=0;im->control_pressed) { break; } //calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber); - /*****************************************************************************************************/ + /***************************************************************************************************** for(int i=0;im->control_pressed) { break; } @@ -708,7 +707,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){ for(int k=0;km->control_pressed) { break; } //maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight); - /*****************************************************************************************************/ + /***************************************************************************************************** double maxChange = 0; for(int i=0;i maxChange){ maxChange = difference; } } maxDelta = maxChange; - /*****************************************************************************************************/ + /***************************************************************************************************** if (pDataArray->m->control_pressed) { break; } //double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight); - /*****************************************************************************************************/ + /***************************************************************************************************** vector P(numSeqs, 0); int effNumOTUs = 0; @@ -804,12 +803,12 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){ } nLL = nLL -(double)numSeqs * log(pDataArray->sigma); - 
/*****************************************************************************************************/ + /***************************************************************************************************** if (pDataArray->m->control_pressed) { break; } //checkCentroids(numOTUs, centroids, weight); - /*****************************************************************************************************/ + /***************************************************************************************************** vector unique(numOTUs, 1); for(int i=0;im->control_pressed) { break; } //calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU, dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths); - /*****************************************************************************************************/ + /***************************************************************************************************** int total = 0; vector newTau(numOTUs,0); vector norms(numSeqs, 0); @@ -860,7 +859,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){ if(weight[j] > MIN_WEIGHT && change[j] == 1){ //dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells); - /*****************************************************************************************************/ + /***************************************************************************************************** int flowAValue = centroids[j] * numFlowCells; int flowBValue = i * numFlowCells; @@ -873,7 +872,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){ } dist[indexOffset + j] = distTemp / (double)lengths[i]; - /*****************************************************************************************************/ + /***************************************************************************************************** } @@ -917,7 +916,7 @@ static DWORD WINAPI 
ShhhFlowsThreadFunction(LPVOID lpParam){ } - /*****************************************************************************************************/ + /***************************************************************************************************** if (pDataArray->m->control_pressed) { break; } @@ -931,7 +930,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){ pDataArray->m->mothurOut("\nFinalizing...\n"); //fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI); - /*****************************************************************************************************/ + /***************************************************************************************************** int indexFill = 0; for(int i=0;im->control_pressed) { break; } //setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI); - /*****************************************************************************************************/ + /***************************************************************************************************** vector bigTauMatrix(numOTUs * numSeqs, 0.0000); for(int i=0;im->control_pressed) { break; } @@ -1017,7 +1016,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){ for(int i=0;im->control_pressed) { break; } @@ -1062,7 +1061,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){ for(int k=0;km->control_pressed) { break; } //writeQualities(numOTUs, numFlowCells, flowFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (pDataArray->m->control_pressed) { break; } - /*****************************************************************************************************/ + /***************************************************************************************************** string thisOutputDir = pDataArray->outputDir; if (pDataArray->outputDir == "") { thisOutputDir += 
pDataArray->m->hasPath(flowFileName); } string qualityFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.qual"; @@ -1200,11 +1199,11 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){ } qualityFile.close(); pDataArray->outputNames.push_back(qualityFileName); - /*****************************************************************************************************/ + /***************************************************************************************************** // writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, flowFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids); if (pDataArray->m->control_pressed) { break; } - /*****************************************************************************************************/ + /***************************************************************************************************** thisOutputDir = pDataArray->outputDir; if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); } string fastaFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.fasta"; @@ -1243,11 +1242,11 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){ pDataArray->m->appendFiles(fastaFileName, pDataArray->thisCompositeFASTAFileName); } - /*****************************************************************************************************/ + /***************************************************************************************************** //writeNames(thisCompositeNamesFileName, numOTUs, flowFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU); if (pDataArray->m->control_pressed) { break; } - /*****************************************************************************************************/ + /***************************************************************************************************** thisOutputDir = pDataArray->outputDir; if 
(pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); } string nameFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.names"; @@ -1275,11 +1274,11 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){ if(pDataArray->thisCompositeNameFileName != ""){ pDataArray->m->appendFiles(nameFileName, pDataArray->thisCompositeNameFileName); } - /*****************************************************************************************************/ + /***************************************************************************************************** //writeClusters(flowFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI); if (pDataArray->m->control_pressed) { break; } - /*****************************************************************************************************/ + /***************************************************************************************************** thisOutputDir = pDataArray->outputDir; if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); } string otuCountsFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.counts"; @@ -1327,12 +1326,12 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){ } } otuCountsFile.close(); - pDataArray->outputNames.push_back(otuCountsFileName); - /*****************************************************************************************************/ + pDataArray->outputNames.push_back(otuCountsFileName) + /***************************************************************************************************** //writeGroups(flowFileName, numSeqs, seqNameVector); if (pDataArray->m->control_pressed) { break; } - /*****************************************************************************************************/ + 
/***************************************************************************************************** thisOutputDir = pDataArray->outputDir; if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); } string fileRoot = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)); @@ -1346,7 +1345,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){ } groupFile.close(); pDataArray->outputNames.push_back(groupFileName); - /*****************************************************************************************************/ + /***************************************************************************************************** pDataArray->m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n'); } @@ -1362,7 +1361,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){ } } #endif - +*/ #endif diff --git a/sortseqscommand.cpp b/sortseqscommand.cpp index ee7bf73..b0af154 100644 --- a/sortseqscommand.cpp +++ b/sortseqscommand.cpp @@ -15,8 +15,9 @@ vector SortSeqsCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pfasta); CommandParameter pflow("flow", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pflow); - CommandParameter pname("name", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "FNGLT", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "FNGLT", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", 
"InputTypes", "", "", "CountGroup", "FNGLT", "none",false,false); parameters.push_back(pgroup); CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(ptaxonomy); CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pqfile); CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge); @@ -37,8 +38,8 @@ vector SortSeqsCommand::setParameters(){ string SortSeqsCommand::getHelpString(){ try { string helpString = ""; - helpString += "The sort.seqs command puts the sequences in the same order for the following file types: accnos fasta, name, group, taxonomy, flow or quality file.\n"; - helpString += "The sort.seqs command parameters are accnos, fasta, name, group, taxonomy, flow, qfile and large.\n"; + helpString += "The sort.seqs command puts the sequences in the same order for the following file types: accnos fasta, name, group, count, taxonomy, flow or quality file.\n"; + helpString += "The sort.seqs command parameters are accnos, fasta, name, group, count, taxonomy, flow, qfile and large.\n"; helpString += "The accnos file allows you to specify the order you want the files in. 
If none is provided, mothur will use the order of the first file it reads.\n"; helpString += "The large parameters is used to indicate your files are too large to fit in RAM.\n"; helpString += "The sort.seqs command should be in the following format: sort.seqs(fasta=yourFasta).\n"; @@ -65,6 +66,7 @@ string SortSeqsCommand::getOutputFileNameTag(string type, string inputName=""){ if (type == "fasta") { outputFileName = "sorted" + m->getExtension(inputName); } else if (type == "taxonomy") { outputFileName = "sorted" + m->getExtension(inputName); } else if (type == "name") { outputFileName = "sorted" + m->getExtension(inputName); } + else if (type == "count") { outputFileName = "sorted" + m->getExtension(inputName); } else if (type == "group") { outputFileName = "sorted" + m->getExtension(inputName); } else if (type == "flow") { outputFileName = "sorted" + m->getExtension(inputName); } else if (type == "qfile") { outputFileName = "sorted" + m->getExtension(inputName); } @@ -87,6 +89,7 @@ SortSeqsCommand::SortSeqsCommand(){ outputTypes["fasta"] = tempOutNames; outputTypes["taxonomy"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; outputTypes["group"] = tempOutNames; outputTypes["qfile"] = tempOutNames; outputTypes["flow"] = tempOutNames; @@ -127,6 +130,7 @@ SortSeqsCommand::SortSeqsCommand(string option) { outputTypes["group"] = tempOutNames; outputTypes["qfile"] = tempOutNames; outputTypes["flow"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -191,6 +195,14 @@ SortSeqsCommand::SortSeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. 
if (path == "") { parameters["flow"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -229,16 +241,31 @@ SortSeqsCommand::SortSeqsCommand(string option) { if (qualfile == "not open") { abort = true; } else if (qualfile == "not found") { qualfile = ""; } else { m->setQualFile(qualfile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "f"; } large = m->isTrue(temp); - if ((fastafile == "") && (namefile == "") && (groupfile == "") && (taxfile == "") && (flowfile == "") && (qualfile == "")) { m->mothurOut("You must provide at least one of the following: fasta, name, group, taxonomy, flow or quality."); m->mothurOutEndLine(); abort = true; } + if ((fastafile == "") && (namefile == "") && (countfile == "") && (groupfile == "") && (taxfile == "") && (flowfile == "") && (qualfile == "")) { m->mothurOut("You must provide at least one of the following: fasta, name, group, count, taxonomy, flow or quality."); m->mothurOutEndLine(); abort = true; } - if ((fastafile != "") && (namefile == "")) { - vector files; files.push_back(fastafile); - parser.getNameFile(files); - } + if (countfile == 
"") { + if ((fastafile != "") && (namefile == "")) { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } } @@ -267,6 +294,7 @@ int SortSeqsCommand::execute(){ if (qualfile != "") { readQual(); } if (namefile != "") { readName(); } if (groupfile != "") { readGroup(); } + if (countfile != "") { readCount(); } if (taxfile != "") { readTax(); } if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } @@ -308,7 +336,12 @@ int SortSeqsCommand::execute(){ itTypes = outputTypes.find("flow"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFlowFile(current); } - } + } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } } return 0; @@ -927,7 +960,88 @@ int SortSeqsCommand::readName(){ exit(1); } } - +//********************************************************************************************************************** +int SortSeqsCommand::readCount(){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(countfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile); + outputTypes["count"].push_back(outputFileName); outputNames.push_back(outputFileName); + + ofstream out; + m->openOutputFile(outputFileName, out); + + ifstream in; + m->openInputFile(countfile, in); + string firstCol, rest; + + if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have + + vector seqs; seqs.resize(names.size(), ""); + + string headers = m->getline(in); m->gobble(in); + + while(!in.eof()){ + if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; } + + in >> firstCol; m->gobble(in); + rest = 
m->getline(in); m->gobble(in); + + if (firstCol != "") { + map::iterator it = names.find(firstCol); + if (it != names.end()) { //we found it, so put it in the vector in the right place. + seqs[it->second] = firstCol + '\t' + rest; + }else { //if we cant find it then add it to the end + names[firstCol] = seqs.size(); + seqs.push_back((firstCol + '\t' + rest)); + m->mothurOut(firstCol + " was not in the contained the file which determined the order, adding it to the end.\n"); + } + } + } + in.close(); + + int count = 0; + out << headers << endl; + for (int i = 0; i < seqs.size(); i++) { + if (seqs[i] != "") { out << seqs[i] << endl; count++; } + } + out.close(); + + m->mothurOut("Ordered " + toString(count) + " sequences from " + countfile + ".\n"); + + }else { //read in file to fill names + int count = 0; + + string headers = m->getline(in); m->gobble(in); + out << headers << endl; + + while(!in.eof()){ + if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; } + + in >> firstCol; m->gobble(in); + rest = m->getline(in); m->gobble(in); + + if (firstCol != "") { + //if this name is in the accnos file + names[firstCol] = count; + count++; + out << firstCol << '\t' << rest << endl; + } + m->gobble(in); + } + in.close(); + out.close(); + + m->mothurOut("\nUsing " + countfile + " to determine the order. 
It contains " + toString(count) + " representative sequences.\n"); + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SortSeqsCommand", "readCount"); + exit(1); + } +} //********************************************************************************************************************** int SortSeqsCommand::readGroup(){ try { diff --git a/sortseqscommand.h b/sortseqscommand.h index 6d9c5ed..4ba8e42 100644 --- a/sortseqscommand.h +++ b/sortseqscommand.h @@ -36,7 +36,7 @@ public: private: map names; - string accnosfile, fastafile, namefile, groupfile, taxfile, qualfile, flowfile, outputDir; + string accnosfile, fastafile, namefile, groupfile, countfile, taxfile, qualfile, flowfile, outputDir; bool abort, large; vector outputNames; @@ -45,6 +45,7 @@ private: int readName(); int readGroup(); int readTax(); + int readCount(); int readQual(); }; diff --git a/sparsedistancematrix.cpp b/sparsedistancematrix.cpp index 7d50523..b315c48 100644 --- a/sparsedistancematrix.cpp +++ b/sparsedistancematrix.cpp @@ -126,7 +126,7 @@ ull SparseDistanceMatrix::getSmallestCell(ull& row){ return col; } catch(exception& e) { - m->errorOut(e, "SparseMatrix", "getSmallestCell"); + m->errorOut(e, "SparseDistanceMatrix", "getSmallestCell"); exit(1); } } @@ -141,7 +141,7 @@ int SparseDistanceMatrix::sortSeqVec(){ return 0; } catch(exception& e) { - m->errorOut(e, "SparseMatrix", "getSmallestCell"); + m->errorOut(e, "SparseDistanceMatrix", "sortSeqVec"); exit(1); } } diff --git a/splitabundcommand.cpp b/splitabundcommand.cpp index bc1cdb3..48fada8 100644 --- a/splitabundcommand.cpp +++ b/splitabundcommand.cpp @@ -8,13 +8,15 @@ */ #include "splitabundcommand.h" +#include "sharedutilities.h" //********************************************************************************************************************** vector SplitAbundCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - 
CommandParameter pname("name", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "FNGLT", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter plist("list", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(plist); CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); CommandParameter pcutoff("cutoff", "Number", "", "0", "", "", "",false,true); parameters.push_back(pcutoff); @@ -37,8 +39,8 @@ string SplitAbundCommand::getHelpString(){ try { string helpString = ""; helpString += "The split.abund command reads a fasta file and a list or a names file splits the sequences into rare and abundant groups. 
\n"; - helpString += "The split.abund command parameters are fasta, list, name, cutoff, group, label, groups, cutoff and accnos.\n"; - helpString += "The fasta and a list or name parameter are required, and you must provide a cutoff value.\n"; + helpString += "The split.abund command parameters are fasta, list, name, count, cutoff, group, label, groups, cutoff and accnos.\n"; + helpString += "The fasta and a list or name or count parameter are required, and you must provide a cutoff value.\n"; helpString += "The cutoff parameter is used to qualify what is abundant and rare.\n"; helpString += "The group parameter allows you to parse a group file into rare and abundant groups.\n"; helpString += "The label parameter is used to read specific labels in your listfile you want to use.\n"; @@ -69,6 +71,7 @@ string SplitAbundCommand::getOutputFileNameTag(string type, string inputName="") if (type == "fasta") { outputFileName = "fasta"; } else if (type == "list") { outputFileName = "list"; } else if (type == "name") { outputFileName = "names"; } + else if (type == "count") { outputFileName = "count_table"; } else if (type == "group") { outputFileName = "groups"; } else if (type == "accnos") { outputFileName = "accnos"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } @@ -88,6 +91,7 @@ SplitAbundCommand::SplitAbundCommand(){ vector tempOutNames; outputTypes["list"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; outputTypes["accnos"] = tempOutNames; outputTypes["group"] = tempOutNames; outputTypes["fasta"] = tempOutNames; @@ -126,7 +130,8 @@ SplitAbundCommand::SplitAbundCommand(string option) { outputTypes["name"] = tempOutNames; outputTypes["accnos"] = tempOutNames; outputTypes["group"] = tempOutNames; - outputTypes["fasta"] = tempOutNames; + outputTypes["fasta"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the input directory 
command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -165,6 +170,13 @@ SplitAbundCommand::SplitAbundCommand(string option) { if (path == "") { parameters["name"] = inputDir + it->second; } } + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -194,35 +206,52 @@ SplitAbundCommand::SplitAbundCommand(string option) { if (groupfile == "not open") { groupfile = ""; abort = true; } else if (groupfile == "not found") { groupfile = ""; } else { - groupMap = new GroupMap(groupfile); - - int error = groupMap->readMap(); + int error = groupMap.readMap(groupfile); if (error == 1) { abort = true; } m->setGroupFile(groupfile); } + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { + m->setCountTableFile(countfile); + ct.readTable(countfile); + } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + groups = validParameter.validFile(parameters, "groups", false); if (groups == "not found") { groups = ""; } - else if (groups == "all") { - if (groupfile != "") { Groups = groupMap->getNamesOfGroups(); } - else { m->mothurOut("You cannot select groups without a valid groupfile, I will disregard your groups selection. 
"); m->mothurOutEndLine(); groups = ""; } - }else { - m->splitAtDash(groups, Groups); - } + else { m->splitAtDash(groups, Groups); } - if ((groupfile == "") && (groups != "")) { m->mothurOut("You cannot select groups without a valid groupfile, I will disregard your groups selection. "); m->mothurOutEndLine(); groups = ""; Groups.clear(); } + if (((groupfile == "") && (countfile == ""))&& (groups != "")) { m->mothurOut("You cannot select groups without a valid group or count file, I will disregard your groups selection. "); m->mothurOutEndLine(); groups = ""; Groups.clear(); } + if (countfile != "") { + if (!ct.hasGroupInfo()) { m->mothurOut("You cannot pick groups without group info in your count file; I will disregard your groups selection."); m->mothurOutEndLine(); groups = ""; Groups.clear(); } + } + //do you have all files needed - if ((listfile == "") && (namefile == "")) { + if ((listfile == "") && (namefile == "") && (countfile == "")) { namefile = m->getNameFile(); if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); } else { listfile = m->getListFile(); if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); } - else { m->mothurOut("You have no current list or namefile and the list or name parameter is required."); m->mothurOutEndLine(); abort = true; } + else { + countfile = m->getCountTableFile(); + if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); } + else { m->mothurOut("You have no current list, count or namefile and one is required."); m->mothurOutEndLine(); abort = true; } + } } } - + //check for optional parameter and set defaults // ...at some point should added some additional type checking... 
label = validParameter.validFile(parameters, "label", false); @@ -248,14 +277,20 @@ SplitAbundCommand::SplitAbundCommand(string option) { } } //********************************************************************************************************************** -SplitAbundCommand::~SplitAbundCommand(){ - if (groupfile != "") { delete groupMap; } -} +SplitAbundCommand::~SplitAbundCommand(){} //********************************************************************************************************************** int SplitAbundCommand::execute(){ try { if (abort == true) { if (calledHelp) { return 0; } return 2; } + + if (Groups.size() != 0) { + vector allGroups; + if (countfile != "") { allGroups = ct.getNamesOfGroups(); } + else { allGroups = groupMap.getNamesOfGroups(); } + SharedUtil util; + util.setGroups(Groups, allGroups); + } if (listfile != "") { //you are using a listfile to determine abundance if (outputDir == "") { outputDir = m->hasPath(listfile); } @@ -264,19 +299,19 @@ int SplitAbundCommand::execute(){ set processedLabels; set userLabels = labels; - input = new InputData(listfile, "list"); - list = input->getListVector(); + InputData input(listfile, "list"); + ListVector* list = input.getListVector(); string lastLabel = list->getLabel(); //do you have a namefile or do we need to similate one? 
if (namefile != "") { readNamesFile(); } else { createNameMap(list); } - if (m->control_pressed) { delete input; delete list; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + if (m->control_pressed) { delete list; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { - if (m->control_pressed) { delete input; delete list; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + if (m->control_pressed) { delete list; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } if(allLines == 1 || labels.count(list->getLabel()) == 1){ @@ -291,7 +326,7 @@ int SplitAbundCommand::execute(){ string saveLabel = list->getLabel(); delete list; - list = input->getListVector(lastLabel); //get new list vector to process + list = input.getListVector(lastLabel); //get new list vector to process m->mothurOut(list->getLabel()); m->mothurOutEndLine(); splitList(list); @@ -307,10 +342,10 @@ int SplitAbundCommand::execute(){ lastLabel = list->getLabel(); delete list; - list = input->getListVector(); //get new list vector to process + list = input.getListVector(); //get new list vector to process } - if (m->control_pressed) { delete input; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } //output error messages about any remaining user labels set::iterator it; @@ -326,12 +361,12 @@ int SplitAbundCommand::execute(){ } - if (m->control_pressed) { delete input; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } //run last label if you 
need to if (needToRun == true) { if (list != NULL) { delete list; } - list = input->getListVector(lastLabel); //get new list vector to process + list = input.getListVector(lastLabel); //get new list vector to process m->mothurOut(list->getLabel()); m->mothurOutEndLine(); splitList(list); @@ -339,11 +374,9 @@ int SplitAbundCommand::execute(){ delete list; } - delete input; - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } - }else { //you are using the namefile to determine abundance + }else if (namefile != "") { //you are using the namefile to determine abundance if (outputDir == "") { outputDir = m->hasPath(namefile); } splitNames(); @@ -353,7 +386,14 @@ int SplitAbundCommand::execute(){ if (groupfile != "") { parseGroup(tag); } if (accnos) { writeAccnos(tag); } if (fastafile != "") { parseFasta(tag); } - } + }else { + //split by countfile + string tag = ""; + splitCount(); + + if (accnos) { writeAccnos(tag); } + if (fastafile != "") { parseFasta(tag); } + } //set fasta file as new current fastafile string current = ""; @@ -381,6 +421,11 @@ int SplitAbundCommand::execute(){ if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); } } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); @@ -401,6 +446,7 @@ int SplitAbundCommand::splitList(ListVector* thisList) { abundNames.clear(); //get rareNames and abundNames + int numRareBins = 0; for (int i = 0; i < thisList->getNumBins(); i++) { if (m->control_pressed) { return 0; } @@ -409,8 +455,15 @@ int SplitAbundCommand::splitList(ListVector* thisList) { vector names; m->splitAtComma(bin, names); //parses bin into individual sequence names int size = names.size(); 
- + + //if countfile is not blank we assume the list file is unique, otherwise we assume it includes all seqs + if (countfile != "") { + size = 0; + for (int j = 0; j < names.size(); j++) { size += ct.getNumSeqs(names[j]); } + } + if (size <= cutoff) { + numRareBins++; for (int j = 0; j < names.size(); j++) { rareNames.insert(names[j]); } }else{ for (int j = 0; j < names.size(); j++) { abundNames.insert(names[j]); } @@ -419,13 +472,14 @@ int SplitAbundCommand::splitList(ListVector* thisList) { string tag = thisList->getLabel() + "."; - - writeList(thisList, tag); - + + writeList(thisList, tag, numRareBins); + if (groupfile != "") { parseGroup(tag); } if (accnos) { writeAccnos(tag); } if (fastafile != "") { parseFasta(tag); } - + if (countfile != "") { parseCount(tag); } + return 0; } @@ -435,24 +489,13 @@ int SplitAbundCommand::splitList(ListVector* thisList) { } } /**********************************************************************************************************************/ -int SplitAbundCommand::writeList(ListVector* thisList, string tag) { +int SplitAbundCommand::writeList(ListVector* thisList, string tag, int numRareBins) { try { map filehandles; if (Groups.size() == 0) { - SAbundVector* sabund = new SAbundVector(); - *sabund = thisList->getSAbundVector(); - - //find out how many bins are rare and how many are abundant so you can process the list vector one bin at a time - // and don't have to store the bins until you are done with the whole vector, this save alot of space. 
- int numRareBins = 0; - for (int i = 0; i <= sabund->getMaxRank(); i++) { - if (i > cutoff) { break; } - numRareBins += sabund->get(i); - } int numAbundBins = thisList->getNumBins() - numRareBins; - delete sabund; ofstream aout; ofstream rout; @@ -471,9 +514,15 @@ int SplitAbundCommand::writeList(ListVector* thisList, string tag) { for (int i = 0; i < thisList->getNumBins(); i++) { if (m->control_pressed) { break; } - string bin = list->get(i); - - int size = m->getNumNames(bin); + string bin = thisList->get(i); + vector names; + m->splitAtComma(bin, names); + + int size = names.size(); + if (countfile != "") { + size = 0; + for (int j = 0; j < names.size(); j++) { size += ct.getNumSeqs(names[j]); } + } if (size <= cutoff) { rout << bin << '\t'; } else { aout << bin << '\t'; } @@ -499,8 +548,8 @@ int SplitAbundCommand::writeList(ListVector* thisList, string tag) { temp2 = new ofstream; filehandles[Groups[i]+".abund"] = temp2; - string rareGroupFileName = fileroot + Groups[i] + tag + ".rare." + getOutputFileNameTag("list"); - string abundGroupFileName = fileroot + Groups[i] + tag + ".abund." + getOutputFileNameTag("list"); + string rareGroupFileName = fileroot + Groups[i] +"."+ tag + "rare." + getOutputFileNameTag("list"); + string abundGroupFileName = fileroot + Groups[i] +"."+ tag + "abund." 
+ getOutputFileNameTag("list"); m->openOutputFile(rareGroupFileName, *(filehandles[Groups[i]+".rare"])); m->openOutputFile(abundGroupFileName, *(filehandles[Groups[i]+".abund"])); outputNames.push_back(rareGroupFileName); outputTypes["list"].push_back(rareGroupFileName); @@ -520,7 +569,7 @@ int SplitAbundCommand::writeList(ListVector* thisList, string tag) { if (m->control_pressed) { break; } map groupBins; - string bin = list->get(i); + string bin = thisList->get(i); vector names; m->splitAtComma(bin, names); //parses bin into individual sequence names @@ -534,19 +583,34 @@ int SplitAbundCommand::writeList(ListVector* thisList, string tag) { rareAbund = ".abund"; } - string group = groupMap->getGroup(names[j]); - - if (m->inUsersGroups(group, Groups)) { //only add if this is in a group we want - itGroup = groupBins.find(group+rareAbund); - if(itGroup == groupBins.end()) { - groupBins[group+rareAbund] = names[j]; //add first name - groupNumBins[group+rareAbund]++; - }else{ //add another name - groupBins[group+rareAbund] += "," + names[j]; - } - }else if(group == "not found") { - m->mothurOut(names[j] + " is not in your groupfile. Ignoring."); m->mothurOutEndLine(); - } + if (countfile == "") { + string group = groupMap.getGroup(names[j]); + + if (m->inUsersGroups(group, Groups)) { //only add if this is in a group we want + itGroup = groupBins.find(group+rareAbund); + if(itGroup == groupBins.end()) { + groupBins[group+rareAbund] = names[j]; //add first name + groupNumBins[group+rareAbund]++; + }else{ //add another name + groupBins[group+rareAbund] += "," + names[j]; + } + }else if(group == "not found") { + m->mothurOut(names[j] + " is not in your groupfile. 
Ignoring."); m->mothurOutEndLine(); + } + }else { + vector thisSeqsGroups = ct.getGroups(names[j]); + for (int k = 0; k < thisSeqsGroups.size(); k++) { + if (m->inUsersGroups(thisSeqsGroups[k], Groups)) { //only add if this is in a group we want + itGroup = groupBins.find(thisSeqsGroups[k]+rareAbund); + if(itGroup == groupBins.end()) { + groupBins[thisSeqsGroups[k]+rareAbund] = names[j]; //add first name + groupNumBins[thisSeqsGroups[k]+rareAbund]++; + }else{ //add another name + groupBins[thisSeqsGroups[k]+rareAbund] += "," + names[j]; + } + } + } + } } @@ -572,6 +636,37 @@ int SplitAbundCommand::writeList(ListVector* thisList, string tag) { } } /**********************************************************************************************************************/ +int SplitAbundCommand::splitCount() { //countfile + try { + rareNames.clear(); + abundNames.clear(); + + vector allNames = ct.getNamesOfSeqs(); + for (int i = 0; i < allNames.size(); i++) { + + if (m->control_pressed) { return 0; } + + int size = ct.getNumSeqs(allNames[i]); + nameMap[allNames[i]] = allNames[i]; + + if (size <= cutoff) { + rareNames.insert(allNames[i]); + }else{ + abundNames.insert(allNames[i]); + } + } + + //write out split count files + parseCount(""); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SplitAbundCommand", "splitCount"); + exit(1); + } +} +/**********************************************************************************************************************/ int SplitAbundCommand::splitNames() { //namefile try { @@ -658,6 +753,115 @@ int SplitAbundCommand::createNameMap(ListVector* thisList) { } } /**********************************************************************************************************************/ +int SplitAbundCommand::parseCount(string tag) { //namefile + try { + + map filehandles; + + if (Groups.size() == 0) { + string rare = outputDir + m->getRootName(m->getSimpleName(countfile)) + tag + "rare." 
+ getOutputFileNameTag("count"); + outputNames.push_back(rare); outputTypes["count"].push_back(rare); + + string abund = outputDir + m->getRootName(m->getSimpleName(countfile)) + tag + "abund." + getOutputFileNameTag("count"); + outputNames.push_back(abund); outputTypes["count"].push_back(abund); + + CountTable rareTable; + CountTable abundTable; + if (ct.hasGroupInfo()) { + vector ctGroups = ct.getNamesOfGroups(); + for (int i = 0; i < ctGroups.size(); i++) { rareTable.addGroup(ctGroups[i]); abundTable.addGroup(ctGroups[i]); } + } + + if (rareNames.size() != 0) { + for (set::iterator itRare = rareNames.begin(); itRare != rareNames.end(); itRare++) { + if (ct.hasGroupInfo()) { + vector groupCounts = ct.getGroupCounts(*itRare); + rareTable.push_back(*itRare, groupCounts); + }else { + int groupCounts = ct.getNumSeqs(*itRare); + rareTable.push_back(*itRare, groupCounts); + } + } + if (rareTable.hasGroupInfo()) { + vector ctGroups = rareTable.getNamesOfGroups(); + for (int i = 0; i < ctGroups.size(); i++) { + if (rareTable.getGroupCount(ctGroups[i]) == 0) { rareTable.removeGroup(ctGroups[i]); } + } + } + rareTable.printTable(rare); + } + + + if (abundNames.size() != 0) { + for (set::iterator itAbund = abundNames.begin(); itAbund != abundNames.end(); itAbund++) { + if (ct.hasGroupInfo()) { + vector groupCounts = ct.getGroupCounts(*itAbund); + abundTable.push_back(*itAbund, groupCounts); + }else { + int groupCounts = ct.getNumSeqs(*itAbund); + abundTable.push_back(*itAbund, groupCounts); + } + } + if (abundTable.hasGroupInfo()) { + vector ctGroups = abundTable.getNamesOfGroups(); + for (int i = 0; i < ctGroups.size(); i++) { + if (abundTable.getGroupCount(ctGroups[i]) == 0) { abundTable.removeGroup(ctGroups[i]); } + } + } + abundTable.printTable(abund); + } + + }else{ //parse names by abundance and group + map countTableMap; + map::iterator it3; + + for (int i=0; iaddGroup(Groups[i]); + countTableMap[Groups[i]+".rare"] = rareCt; + CountTable* abundCt = new CountTable(); 
+ abundCt->addGroup(Groups[i]); + countTableMap[Groups[i]+".abund"] = abundCt; + } + + vector allNames = ct.getNamesOfSeqs(); + for (int i = 0; i < allNames.size(); i++) { + string rareAbund; + if (rareNames.count(allNames[i]) != 0) { //you are a rare name + rareAbund = ".rare"; + }else{ //you are a abund name + rareAbund = ".abund"; + } + + vector thisSeqsGroups = ct.getGroups(allNames[i]); + for (int j = 0; j < thisSeqsGroups.size(); j++) { + if (m->inUsersGroups(thisSeqsGroups[j], Groups)) { //only add if this is in a group we want + int num = ct.getGroupCount(allNames[i], thisSeqsGroups[j]); + vector nums; nums.push_back(num); + countTableMap[thisSeqsGroups[j]+rareAbund]->push_back(allNames[i], nums); + } + } + } + + + for (it3 = countTableMap.begin(); it3 != countTableMap.end(); it3++) { + string fileroot = outputDir + m->getRootName(m->getSimpleName(countfile)); + string filename = fileroot + it3->first + "." + getOutputFileNameTag("count"); + outputNames.push_back(filename); outputTypes["count"].push_back(filename); + (it3->second)->printTable(filename); + delete it3->second; + } + } + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "SplitAbundCommand", "parseCount"); + exit(1); + } +} +/**********************************************************************************************************************/ int SplitAbundCommand::writeNames() { //namefile try { @@ -723,7 +927,7 @@ int SplitAbundCommand::writeNames() { //namefile map::iterator itout; for (int i = 0; i < names.size(); i++) { - string group = groupMap->getGroup(names[i]); + string group = groupMap.getGroup(names[i]); if (m->inUsersGroups(group, Groups)) { //only add if this is in a group we want itout = outputStrings.find(group+rareAbund); @@ -803,7 +1007,7 @@ int SplitAbundCommand::writeAccnos(string tag) { //write rare for (set::iterator itRare = rareNames.begin(); itRare != rareNames.end(); itRare++) { - string group = groupMap->getGroup(*itRare); + string group = 
groupMap.getGroup(*itRare); if (m->inUsersGroups(group, Groups)) { //only add if this is in a group we want *(filehandles[group+".rare"]) << *itRare << endl; @@ -812,7 +1016,7 @@ int SplitAbundCommand::writeAccnos(string tag) { //write abund for (set::iterator itAbund = abundNames.begin(); itAbund != abundNames.end(); itAbund++) { - string group = groupMap->getGroup(*itAbund); + string group = groupMap.getGroup(*itAbund); if (m->inUsersGroups(group, Groups)) { //only add if this is in a group we want *(filehandles[group+".abund"]) << *itAbund << endl; @@ -860,7 +1064,7 @@ int SplitAbundCommand::parseGroup(string tag) { //namefile for (int i = 0; i < names.size(); i++) { - string group = groupMap->getGroup(names[i]); + string group = groupMap.getGroup(names[i]); if (group == "not found") { m->mothurOut(names[i] + " is not in your groupfile, ignoring, please correct."); m->mothurOutEndLine(); @@ -907,7 +1111,7 @@ int SplitAbundCommand::parseGroup(string tag) { //namefile for (int i = 0; i < names.size(); i++) { - string group = groupMap->getGroup(names[i]); + string group = groupMap.getGroup(names[i]); if (m->inUsersGroups(group, Groups)) { //only add if this is in a group we want *(filehandles[group+rareAbund]) << names[i] << '\t' << group << endl; @@ -964,7 +1168,7 @@ int SplitAbundCommand::parseFasta(string tag) { //namefile itNames = nameMap.find(seq.getName()); if (itNames == nameMap.end()) { - m->mothurOut(seq.getName() + " is not in your namesfile, ignoring."); m->mothurOutEndLine(); + m->mothurOut(seq.getName() + " is not in your names or list file, ignoring."); m->mothurOutEndLine(); }else{ if (rareNames.count(seq.getName()) != 0) { //you are a rare name seq.printSequence(rout); @@ -1008,7 +1212,7 @@ int SplitAbundCommand::parseFasta(string tag) { //namefile map::iterator itNames = nameMap.find(seq.getName()); if (itNames == nameMap.end()) { - m->mothurOut(seq.getName() + " is not in your namesfile, ignoring."); m->mothurOutEndLine(); + 
m->mothurOut(seq.getName() + " is not in your names or list file, ignoring."); m->mothurOutEndLine(); }else{ vector names; m->splitAtComma(itNames->second, names); //parses bin into individual sequence names @@ -1019,17 +1223,25 @@ int SplitAbundCommand::parseFasta(string tag) { //namefile }else{ //you are a abund name rareAbund = ".abund"; } - - for (int i = 0; i < names.size(); i++) { - - string group = groupMap->getGroup(seq.getName()); - - if (m->inUsersGroups(group, Groups)) { //only add if this is in a group we want - seq.printSequence(*(filehandles[group+rareAbund])); - }else if(group == "not found") { - m->mothurOut(seq.getName() + " is not in your groupfile. Ignoring."); m->mothurOutEndLine(); - } - } + + if (countfile == "") { + for (int i = 0; i < names.size(); i++) { + string group = groupMap.getGroup(seq.getName()); + + if (m->inUsersGroups(group, Groups)) { //only add if this is in a group we want + seq.printSequence(*(filehandles[group+rareAbund])); + }else if(group == "not found") { + m->mothurOut(seq.getName() + " is not in your groupfile. 
Ignoring."); m->mothurOutEndLine(); + } + } + }else { + vector thisSeqsGroups = ct.getGroups(names[0]); //we only need names[0], because there is no namefile + for (int i = 0; i < thisSeqsGroups.size(); i++) { + if (m->inUsersGroups(thisSeqsGroups[i], Groups)) { //only add if this is in a group we want + seq.printSequence(*(filehandles[thisSeqsGroups[i]+rareAbund])); + } + } + } } } } diff --git a/splitabundcommand.h b/splitabundcommand.h index 232c36b..d054264 100644 --- a/splitabundcommand.h +++ b/splitabundcommand.h @@ -22,6 +22,7 @@ also allow an option where a user can give a group file with the list or names f #include "inputdata.h" #include "listvector.hpp" #include "sequence.hpp" +#include "counttable.h" /***************************************************************************************/ @@ -47,24 +48,24 @@ private: int splitList(ListVector*); int splitNames(); //namefile int writeNames(); - int writeList(ListVector*, string); + int writeList(ListVector*, string, int); int writeAccnos(string); int parseGroup(string); int parseFasta(string); + int parseCount(string); + int splitCount(); int readNamesFile(); //namefile int createNameMap(ListVector*); vector outputNames; - ListVector* list; - GroupMap* groupMap; - InputData* input; + GroupMap groupMap; + CountTable ct; - string outputDir, listfile, namefile, groupfile, label, groups, fastafile, inputFile; + string outputDir, listfile, namefile, groupfile, countfile, label, groups, fastafile, inputFile; set labels, rareNames, abundNames; vector Groups; bool abort, allLines, accnos; int cutoff; - //map wroteListFile; map nameMap; diff --git a/splitgroupscommand.cpp b/splitgroupscommand.cpp index af3ca66..f3c6cd9 100644 --- a/splitgroupscommand.cpp +++ b/splitgroupscommand.cpp @@ -10,13 +10,15 @@ #include "splitgroupscommand.h" #include "sharedutilities.h" #include "sequenceparser.h" +#include "counttable.h" 
//********************************************************************************************************************** vector SplitGroupCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "CountGroup", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "CountGroup", "none",false,false); parameters.push_back(pgroup); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -34,9 +36,9 @@ vector SplitGroupCommand::setParameters(){ string SplitGroupCommand::getHelpString(){ try { string helpString = ""; - helpString += "The split.group command reads a group file, and parses your fasta and names files by groups. \n"; - helpString += "The split.group command parameters are fasta, name, group and groups.\n"; - helpString += "The fasta and group parameters are required.\n"; + helpString += "The split.group command reads a group or count file, and parses your fasta and names or count files by groups. 
\n"; + helpString += "The split.group command parameters are fasta, name, group, count and groups.\n"; + helpString += "The fasta and group or count parameters are required.\n"; helpString += "The groups parameter allows you to select groups to create files for. \n"; helpString += "For example if you set groups=A-B-C, you will get a .A.fasta, .A.names, .B.fasta, .B.names, .C.fasta, .C.names files. \n"; helpString += "If you want .fasta and .names files for all groups, set groups=all. \n"; @@ -62,6 +64,7 @@ string SplitGroupCommand::getOutputFileNameTag(string type, string inputName="") else { if (type == "fasta") { outputFileName = "fasta"; } else if (type == "name") { outputFileName = "names"; } + else if (type == "count") { outputFileName = "count_table"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } return outputFileName; @@ -79,6 +82,7 @@ SplitGroupCommand::SplitGroupCommand(){ vector tempOutNames; outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "SplitGroupCommand", "SplitGroupCommand"); @@ -112,6 +116,7 @@ SplitGroupCommand::SplitGroupCommand(string option) { vector tempOutNames; outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -141,7 +146,14 @@ SplitGroupCommand::SplitGroupCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } - + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. 
else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -160,23 +172,56 @@ SplitGroupCommand::SplitGroupCommand(string option) { groupfile = validParameter.validFile(parameters, "group", true); if (groupfile == "not open") { groupfile = ""; abort = true; } - else if (groupfile == "not found") { - groupfile = m->getGroupFile(); - if (groupfile != "") { m->mothurOut("Using " + groupfile + " as input file for the group parameter."); m->mothurOutEndLine(); } - else { m->mothurOut("You have no current groupfile and the group parameter is required."); m->mothurOutEndLine(); abort = true; } + else if (groupfile == "not found") { groupfile = ""; }else { m->setGroupFile(groupfile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((countfile != "") && (namefile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } + + if ((countfile != "") && (groupfile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or group."); m->mothurOutEndLine(); abort = true; } + + if ((countfile == "") && (groupfile == "")) { + if (namefile == "") { //check for count then group + countfile = m->getCountTableFile(); + if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); } + else { + groupfile = m->getGroupFile(); + if (groupfile != "") { m->mothurOut("Using " + groupfile + " as input file for the group parameter."); m->mothurOutEndLine(); } + else { + m->mothurOut("You need to provide a count or group file."); m->mothurOutEndLine(); + abort = true; + } + } + }else { //check for group + groupfile = m->getGroupFile(); + if (groupfile != "") { m->mothurOut("Using " + groupfile + " as input file for the 
group parameter."); m->mothurOutEndLine(); } + else { + m->mothurOut("You need to provide a count or group file."); m->mothurOutEndLine(); + abort = true; + } + } + } groups = validParameter.validFile(parameters, "groups", false); if (groups == "not found") { groups = ""; } else { m->splitAtDash(groups, Groups); } //if the user changes the output directory command factory will send this info to us in the output parameter - outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(groupfile); } + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ + if (groupfile != "") { outputDir = m->hasPath(groupfile); } + else { outputDir = m->hasPath(countfile); } + } - if (namefile == "") { - vector files; files.push_back(fastafile); - parser.getNameFile(files); - } + if (countfile == "") { + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } } @@ -191,13 +236,48 @@ int SplitGroupCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } - SequenceParser* parser; + if (countfile == "" ) { runNameGroup(); } + else { runCount(); } + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + + string current = ""; + itTypes = outputTypes.find("fasta"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); } + } + + itTypes = outputTypes.find("name"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); } + } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } + + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); 
m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + return 0; + } + catch(exception& e) { + m->errorOut(e, "SplitGroupCommand", "execute"); + exit(1); + } +} +//********************************************************************************************************************** +int SplitGroupCommand::runNameGroup(){ + try { + SequenceParser* parser; if (namefile == "") { parser = new SequenceParser(groupfile, fastafile); } else { parser = new SequenceParser(groupfile, fastafile, namefile); } if (m->control_pressed) { delete parser; return 0; } - + vector namesGroups = parser->getNamesOfGroups(); SharedUtil util; util.setGroups(Groups, namesGroups); @@ -215,7 +295,7 @@ int SplitGroupCommand::execute(){ parser->getSeqs(Groups[i], newFasta, false); outputNames.push_back(newFasta); outputTypes["fasta"].push_back(newFasta); if (m->control_pressed) { delete parser; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } - + if (namefile != "") { parser->getNameMap(Groups[i], newName); outputNames.push_back(newName); outputTypes["name"].push_back(newName); @@ -225,29 +305,77 @@ int SplitGroupCommand::execute(){ } delete parser; - - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } - - string current = ""; - itTypes = outputTypes.find("fasta"); - if (itTypes != outputTypes.end()) { - if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); } - } - - itTypes = outputTypes.find("name"); - if (itTypes != outputTypes.end()) { - if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); } - } - - m->mothurOutEndLine(); - m->mothurOut("Output File Names: "); m->mothurOutEndLine(); - for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } - 
m->mothurOutEndLine(); - - return 0; + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "SplitGroupCommand", "runNameGroup"); + exit(1); } +} +//********************************************************************************************************************** +int SplitGroupCommand::runCount(){ + try { + + CountTable ct; + ct.readTable(countfile); + if (!ct.hasGroupInfo()) { m->mothurOut("[ERROR]: your count file does not contain group info, cannot split by group.\n"); m->control_pressed = true; } + + if (m->control_pressed) { return 0; } + + vector namesGroups = ct.getNamesOfGroups(); + SharedUtil util; util.setGroups(Groups, namesGroups); + + //fill filehandles with neccessary ofstreams + map ffiles; + map cfiles; + ofstream* temp; + for (int i=0; igetRootName(m->getSimpleName(fastafile)) + Groups[i] + "." + getOutputFileNameTag("fasta"); + outputNames.push_back(newFasta); outputTypes["fasta"].push_back(newFasta); + m->openOutputFile(newFasta, (*temp)); + temp = new ofstream; + cfiles[Groups[i]] = temp; + string newCount = outputDir + m->getRootName(m->getSimpleName(countfile)) + Groups[i] + "." 
+ getOutputFileNameTag("count"); + m->openOutputFile(newCount, (*temp)); + outputNames.push_back(newCount); outputTypes["count"].push_back(newCount); + (*temp) << "Representative_Sequence\ttotal\t" << Groups[i] << endl; + } + + ifstream in; + m->openInputFile(fastafile, in); + + while (!in.eof()) { + Sequence seq(in); m->gobble(in); + + if (m->control_pressed) { break; } + if (seq.getName() != "") { + vector thisSeqsGroups = ct.getGroups(seq.getName()); + for (int i = 0; i < thisSeqsGroups.size(); i++) { + if (m->inUsersGroups(thisSeqsGroups[i], Groups)) { //if this sequence belongs to a group we want them print + seq.printSequence(*(ffiles[thisSeqsGroups[i]])); + int numSeqs = ct.getGroupCount(seq.getName(), Groups[i]); + (*(cfiles[thisSeqsGroups[i]])) << seq.getName() << '\t' << numSeqs << '\t' << numSeqs << endl; + } + } + } + } + in.close(); + + //close and delete ofstreams + for (int i=0; ierrorOut(e, "SplitGroupCommand", "execute"); + m->errorOut(e, "SplitGroupCommand", "runCount"); exit(1); } } diff --git a/splitgroupscommand.h b/splitgroupscommand.h index a8dc9a1..62e063d 100644 --- a/splitgroupscommand.h +++ b/splitgroupscommand.h @@ -42,9 +42,12 @@ public: private: vector outputNames; - string outputDir, namefile, groupfile, groups, fastafile; + string outputDir, namefile, groupfile, countfile, groups, fastafile; vector Groups; bool abort; + + int runNameGroup(); + int runCount(); }; /***************************************************************************************/ diff --git a/splitmatrix.cpp b/splitmatrix.cpp index 384b09a..28bc5d4 100644 --- a/splitmatrix.cpp +++ b/splitmatrix.cpp @@ -14,21 +14,23 @@ /***********************************************************************/ -SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t, bool l){ +SplitMatrix::SplitMatrix(string distfile, string name, string count, string tax, float c, string t, bool l){ m = MothurOut::getInstance(); distFile = distfile; cutoff = c; namefile 
= name; method = t; taxFile = tax; + countfile = count; large = l; } /***********************************************************************/ -SplitMatrix::SplitMatrix(string ffile, string name, string tax, float c, float cu, string t, int p, bool cl, string output){ +SplitMatrix::SplitMatrix(string ffile, string name, string count, string tax, float c, float cu, string t, int p, bool cl, string output){ m = MothurOut::getInstance(); fastafile = ffile; namefile = name; + countfile = count; taxFile = tax; cutoff = c; //tax level cutoff distCutoff = cu; //for fasta method if you are creating distance matrix you need a cutoff for that @@ -50,7 +52,8 @@ int SplitMatrix::split(){ }else { m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine(); map temp; - temp[distFile] = namefile; + if (namefile != "") { temp[distFile] = namefile; } + else { temp[distFile] = countfile; } dists.push_back(temp); } @@ -159,7 +162,7 @@ int SplitMatrix::createDistanceFilesFromTax(map& seqGroup, int numG it = seqGroup.find(query.getName()); //save names in case no namefile is given - if (namefile == "") { names.insert(query.getName()); } + if ((namefile == "") && (countfile == "")) { names.insert(query.getName()); } if (it != seqGroup.end()) { //not singleton m->openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile); @@ -196,74 +199,21 @@ int SplitMatrix::createDistanceFilesFromTax(map& seqGroup, int numG m->mothurRemove((fastafile + "." + toString(i) + ".temp")); //remove old names files just in case - m->mothurRemove((namefile + "." + toString(i) + ".temp")); + if (namefile != "") { m->mothurRemove((namefile + "." + toString(i) + ".temp")); } + else { m->mothurRemove((countfile + "." 
+ toString(i) + ".temp")); } } - - singleton = namefile + ".extra.temp"; - ofstream remainingNames; - m->openOutputFile(singleton, remainingNames); - - bool wroteExtra = false; - - ifstream bigNameFile; - m->openInputFile(namefile, bigNameFile); - - string name, nameList; - while(!bigNameFile.eof()){ - bigNameFile >> name >> nameList; m->gobble(bigNameFile); - - //did this sequence get assigned a group - it = seqGroup.find(name); - - if (it != seqGroup.end()) { - m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile); - outFile << name << '\t' << nameList << endl; - outFile.close(); - }else{ - wroteExtra = true; - remainingNames << name << '\t' << nameList << endl; - } - } - bigNameFile.close(); - - for(int i=0;ihasPath(fastafile); } - string tempDistFile = ""; + + vector tempDistFiles; + for(int i=0;ihasPath(fastafile); } + string tempDistFile = ""; if (classic) { tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "phylip.dist";} else { tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." 
+ toString(i) + ".temp"))) + "dist"; } - - //if there are valid distances - ifstream fileHandle; - fileHandle.open(tempDistFile.c_str()); - if(fileHandle) { - m->gobble(fileHandle); - if (!fileHandle.eof()) { //check for blank file - this could occur if all dists in group are above cutoff - map temp; - temp[tempDistFile] = tempNameFile; - dists.push_back(temp); - }else { - ifstream in; - m->openInputFile(tempNameFile, in); - - while(!in.eof()) { - in >> name >> nameList; m->gobble(in); - wroteExtra = true; - remainingNames << name << '\t' << nameList << endl; - } - in.close(); - m->mothurRemove(tempNameFile); - } - } - fileHandle.close(); - } - - remainingNames.close(); - if (!wroteExtra) { - m->mothurRemove(singleton); - singleton = "none"; - } - + tempDistFiles.push_back(tempDistFile); + } + + splitNames(seqGroup, numGroups, tempDistFiles); + if (m->control_pressed) { for (int i = 0; i < dists.size(); i++) { m->mothurRemove((dists[i].begin()->first)); m->mothurRemove((dists[i].begin()->second)); } dists.clear(); } return 0; @@ -279,9 +229,10 @@ int SplitMatrix::splitDistanceFileByTax(map& seqGroup, int numGroup map::iterator it; map::iterator it2; + ofstream outFile; ifstream dFile; m->openInputFile(distFile, dFile); - ofstream outFile; + for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case m->mothurRemove((distFile + "." + toString(i) + ".temp")); @@ -326,9 +277,15 @@ int SplitMatrix::splitDistanceFileByTax(map& seqGroup, int numGroup } } dFile.close(); - + + string inputFile = namefile; + if (countfile != "") { inputFile = countfile; } + + vector tempDistFiles; for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case - m->mothurRemove((namefile + "." + toString(i) + ".temp")); + string tempDistFile = distFile + "." + toString(i) + ".temp"; + tempDistFiles.push_back(tempDistFile); + m->mothurRemove((inputFile + "." 
+ toString(i) + ".temp")); //write out any remaining buffers if (numOutputs[i] > 0) { @@ -341,63 +298,8 @@ int SplitMatrix::splitDistanceFileByTax(map& seqGroup, int numGroup } } - ifstream bigNameFile; - m->openInputFile(namefile, bigNameFile); - - singleton = namefile + ".extra.temp"; - ofstream remainingNames; - m->openOutputFile(singleton, remainingNames); - - bool wroteExtra = false; - - string name, nameList; - while(!bigNameFile.eof()){ - bigNameFile >> name >> nameList; m->gobble(bigNameFile); - - //did this sequence get assigned a group - it = seqGroup.find(name); - - if (it != seqGroup.end()) { - m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile); - outFile << name << '\t' << nameList << endl; - outFile.close(); - }else{ - wroteExtra = true; - remainingNames << name << '\t' << nameList << endl; - } - } - bigNameFile.close(); - - for(int i=0;i temp; - temp[tempDistFile] = tempNameFile; - dists.push_back(temp); - }else{ - ifstream in; - m->openInputFile(tempNameFile, in); - - while(!in.eof()) { - in >> name >> nameList; m->gobble(in); - wroteExtra = true; - remainingNames << name << '\t' << nameList << endl; - } - in.close(); - m->mothurRemove(tempNameFile); - } - } - - remainingNames.close(); - - if (!wroteExtra) { - m->mothurRemove(singleton); - singleton = "none"; - } - + splitNames(seqGroup, numGroups, tempDistFiles); + if (m->control_pressed) { for (int i = 0; i < dists.size(); i++) { m->mothurRemove((dists[i].begin()->first)); @@ -645,17 +547,29 @@ int SplitMatrix::splitDistanceLarge(){ m->gobble(dFile); } dFile.close(); - + + vector tempDistFiles; for (int i = 0; i < numGroups; i++) { + string fileName = distFile + "." + toString(i) + ".temp"; + tempDistFiles.push_back(fileName); + //remove old names files just in case + if (numOutputs[i] > 0) { - string fileName = distFile + "." 
+ toString(i) + ".temp"; outFile.open(fileName.c_str(), ios::app); outFile << outputs[i]; outFile.close(); } } - - splitNames(groups); + + map seqGroup; + for (int i = 0; i < groups.size(); i++) { + for (set::iterator itNames = groups[i].begin(); itNames != groups[i].end();) { + seqGroup[*itNames] = i; + groups[i].erase(itNames++); + } + } + + splitNames(seqGroup, numGroups, tempDistFiles); return 0; } @@ -665,73 +579,104 @@ int SplitMatrix::splitDistanceLarge(){ } } //******************************************************************************************************************** -int SplitMatrix::splitNames(vector >& groups){ +int SplitMatrix::splitNames(map& seqGroup, int numGroups, vector& tempDistFiles){ try { - int numGroups = groups.size(); - - ifstream bigNameFile(namefile.c_str()); - if(!bigNameFile){ - cerr << "Error: We can't open the name file\n"; - exit(1); - } - - map nameMap; - string name, nameList; - while(bigNameFile){ - bigNameFile >> name >> nameList; - nameMap[name] = nameList; - m->gobble(bigNameFile); - } - bigNameFile.close(); - - for(int i=0;i 0){ - string fileName = namefile + "." + toString(i) + ".temp"; - ofstream smallNameFile(fileName.c_str(), ios::ate); - - for(set::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){ - map::iterator nIt = nameMap.find(*gIt); - if (nIt != nameMap.end()) { - smallNameFile << nIt->first << '\t' << nIt->second << endl; - nameMap.erase(nIt); - }else{ - m->mothurOut((*gIt) + " is in your distance file and not in your namefile. 
Please correct."); m->mothurOutEndLine(); exit(1); - } - } - smallNameFile.close(); - } - } - - //names of singletons - if (nameMap.size() != 0) { - singleton = namefile + ".extra.temp"; - ofstream remainingNames(singleton.c_str(), ios::ate); - for(map::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){ - remainingNames << nIt->first << '\t' << nIt->second << endl; - } - remainingNames.close(); - }else { singleton = "none"; } - + ofstream outFile; + map::iterator it; + + string inputFile = namefile; + if (countfile != "") { inputFile = countfile; } + + for(int i=0;imothurRemove((inputFile + "." + toString(i) + ".temp")); } + + singleton = inputFile + ".extra.temp"; + ofstream remainingNames; + m->openOutputFile(singleton, remainingNames); + + bool wroteExtra = false; + + ifstream bigNameFile; + m->openInputFile(inputFile, bigNameFile); + + //grab header line + string headers = ""; + if (countfile != "") { headers = m->getline(bigNameFile); m->gobble(bigNameFile); } + + string name, nameList; + while(!bigNameFile.eof()){ + bigNameFile >> name >> nameList; + m->getline(bigNameFile); m->gobble(bigNameFile); //extra getline is for rest of countfile line if groups are given. + + //did this sequence get assigned a group + it = seqGroup.find(name); + + if (it != seqGroup.end()) { + m->openOutputFileAppend((inputFile + "." + toString(it->second) + ".temp"), outFile); + outFile << name << '\t' << nameList << endl; + outFile.close(); + }else{ + wroteExtra = true; + remainingNames << name << '\t' << nameList << endl; + } + } + bigNameFile.close(); + for(int i=0;i 0){ - string tempNameFile = namefile + "." + toString(i) + ".temp"; - string tempDistFile = distFile + "." + toString(i) + ".temp"; - + string tempNameFile = inputFile + "." 
+ toString(i) + ".temp"; + string tempDistFile = tempDistFiles[i]; + + //if there are valid distances + ifstream fileHandle; + fileHandle.open(tempDistFile.c_str()); + if(fileHandle) { + m->gobble(fileHandle); + if (!fileHandle.eof()) { //check map temp; + if (countfile != "") { + //add header + ofstream out; + string newtempNameFile = tempNameFile + "2"; + m->openOutputFile(newtempNameFile, out); + out << headers << endl; + out.close(); + m->appendFiles(tempNameFile, newtempNameFile); + m->mothurRemove(tempNameFile); + m->renameFile(newtempNameFile, tempNameFile); + } temp[tempDistFile] = tempNameFile; dists.push_back(temp); + }else{ + ifstream in; + m->openInputFile(tempNameFile, in); + + while(!in.eof()) { + in >> name >> nameList; m->gobble(in); + wroteExtra = true; + remainingNames << name << '\t' << nameList << endl; + } + in.close(); + m->mothurRemove(tempNameFile); } + } + fileHandle.close(); } - if (m->control_pressed) { - for (int i = 0; i < dists.size(); i++) { - m->mothurRemove((dists[i].begin()->first)); - m->mothurRemove((dists[i].begin()->second)); - } - dists.clear(); - } + remainingNames.close(); + + if (!wroteExtra) { + m->mothurRemove(singleton); + singleton = "none"; + }else if (countfile != "") { + //add header + ofstream out; + string newtempNameFile = singleton + "2"; + m->openOutputFile(newtempNameFile, out); + out << headers << endl; + out.close(); + m->appendFiles(singleton, newtempNameFile); + m->mothurRemove(singleton); + m->renameFile(newtempNameFile, singleton); + } return 0; } @@ -836,17 +781,27 @@ int SplitMatrix::splitDistanceRAM(){ } dFile.close(); + vector tempDistFiles; for (int i = 0; i < numGroups; i++) { + string fileName = distFile + "." + toString(i) + ".temp"; + tempDistFiles.push_back(fileName); if (outputs[i] != "") { ofstream outFile; - string fileName = distFile + "." 
+ toString(i) + ".temp"; outFile.open(fileName.c_str(), ios::ate); outFile << outputs[i]; outFile.close(); } } - - splitNames(groups); + + map seqGroup; + for (int i = 0; i < groups.size(); i++) { + for (set::iterator itNames = groups[i].begin(); itNames != groups[i].end();) { + seqGroup[*itNames] = i; + groups[i].erase(itNames++); + } + } + + splitNames(seqGroup, numGroups, tempDistFiles); return 0; } diff --git a/splitmatrix.h b/splitmatrix.h index b8aa551..7b468e9 100644 --- a/splitmatrix.h +++ b/splitmatrix.h @@ -19,8 +19,8 @@ class SplitMatrix { public: - SplitMatrix(string, string, string, float, string, bool); //column formatted distance file, namesfile, cutoff, method, large - SplitMatrix(string, string, string, float, float, string, int, bool, string); //fastafile, namefile, taxFile, taxcutoff, cutoff, method, processors, classic, outputDir + SplitMatrix(string, string, string, string, float, string, bool); //column formatted distance file, namesfile, countfile, cutoff, method, large + SplitMatrix(string, string, string, string, float, float, string, int, bool, string); //fastafile, namefile, countfile, taxFile, taxcutoff, cutoff, method, processors, classic, outputDir ~SplitMatrix(); int split(); @@ -30,7 +30,7 @@ class SplitMatrix { private: MothurOut* m; - string distFile, namefile, singleton, method, taxFile, fastafile, outputDir; + string distFile, namefile, singleton, method, taxFile, fastafile, outputDir, countfile; vector< map< string, string> > dists; float cutoff, distCutoff; bool large, classic; @@ -40,7 +40,7 @@ class SplitMatrix { int splitClassify(); int splitDistanceLarge(); int splitDistanceRAM(); - int splitNames(vector >& groups); + int splitNames(map& groups, int, vector&); int splitDistanceFileByTax(map&, int); int createDistanceFilesFromTax(map&, int); }; diff --git a/subsample.cpp b/subsample.cpp index 261297d..392f97b 100644 --- a/subsample.cpp +++ b/subsample.cpp @@ -8,62 +8,54 @@ #include "subsample.h" 
//********************************************************************************************************************** -Tree* SubSample::getSample(Tree* T, TreeMap* tmap, TreeMap* newTmap, int size, map originalNameMap) { +Tree* SubSample::getSample(Tree* T, CountTable* ct, CountTable* newCt, int size) { try { Tree* newTree = NULL; - map > newGroups; - vector subsampledSeqs = getSample(tmap, size, newGroups); + //remove seqs not in sample from counttable + vector Groups = ct->getNamesOfGroups(); + newCt->copy(ct); + newCt->addGroup("doNotIncludeMe"); - //remove seqs not in sample from treemap - for (map >::iterator it = newGroups.begin(); it != newGroups.end(); it++) { - for (int i = 0; i < (it->second).size(); i++) { - newTmap->addSeq((it->second)[i], it->first); - } - } - - newTree = new Tree(newTmap); - newTree->getCopy(T, originalNameMap); - - return newTree; - } - catch(exception& e) { - m->errorOut(e, "SubSample", "getSample-Tree"); - exit(1); - } -} -/********************************************************************************************************************** -Tree* SubSample::getSample(Tree* T, TreeMap* tmap, map whole, int size) { - try { - Tree* newTree = NULL; - - vector subsampledSeqs = getSample(tmap, size); - map sampledNameMap = deconvolute(whole, subsampledSeqs); + map doNotIncludeTotals; + vector namesSeqs = ct->getNamesOfSeqs(); + for (int i = 0; i < namesSeqs.size(); i++) { doNotIncludeTotals[namesSeqs[i]] = 0; } + + for (int i = 0; i < Groups.size(); i++) { + if (m->inUsersGroups(Groups[i], m->getGroups())) { + if (m->control_pressed) { break; } - //remove seqs not in sample from treemap - for (int i = 0; i < tmap->namesOfSeqs.size(); i++) { - //is that name in the subsample? 
- int count = 0; - for (int j = 0; j < subsampledSeqs.size(); j++) { - if (tmap->namesOfSeqs[i] == subsampledSeqs[j]) { break; } //found it - count++; + int thisSize = ct->getGroupCount(Groups[i]); + + if (thisSize >= size) { + + vector names = ct->getNamesOfSeqs(Groups[i]); + vector random; + for (int j = 0; j < names.size(); j++) { + int num = ct->getGroupCount(names[j], Groups[i]); + for (int k = 0; k < num; k++) { random.push_back(j); } + } + random_shuffle(random.begin(), random.end()); + + vector sampleRandoms; sampleRandoms.resize(names.size(), 0); + for (int j = 0; j < size; j++) { sampleRandoms[random[j]]++; } + for (int j = 0; j < sampleRandoms.size(); j++) { + newCt->setAbund(names[j], Groups[i], sampleRandoms[j]); + } + sampleRandoms.clear(); sampleRandoms.resize(names.size(), 0); + for (int j = size; j < thisSize; j++) { sampleRandoms[random[j]]++; } + for (int j = 0; j < sampleRandoms.size(); j++) { doNotIncludeTotals[names[j]] += sampleRandoms[j]; } + }else { m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; } } - if (m->control_pressed) { return newTree; } - - //if you didnt find it, remove it - if (count == subsampledSeqs.size()) { - tmap->removeSeq(tmap->namesOfSeqs[i]); - i--; //need this because removeSeq removes name from namesOfSeqs - } } - //create new tree - int numUniques = sampledNameMap.size(); - if (sampledNameMap.size() == 0) { numUniques = subsampledSeqs.size(); } + for (map::iterator it = doNotIncludeTotals.begin(); it != doNotIncludeTotals.end(); it++) { + newCt->setAbund(it->first, "doNotIncludeMe", it->second); + } - newTree = new Tree(numUniques, tmap); //numNodes, treemap - newTree->getSubTree(T, subsampledSeqs, sampledNameMap); + newTree = new Tree(newCt); + newTree->getCopy(T, true); return newTree; } @@ -71,7 +63,7 @@ Tree* SubSample::getSample(Tree* T, TreeMap* tmap, map whole, in m->errorOut(e, "SubSample", "getSample-Tree"); exit(1); } -}*/ 
+} //********************************************************************************************************************** //assumes whole maps dupName -> uniqueName map SubSample::deconvolute(map whole, vector& wanted) { @@ -112,100 +104,6 @@ map SubSample::deconvolute(map whole, vector SubSample::getSample(TreeMap* tMap, int size, map >& sample) { - try { - vector temp2; - sample["doNotIncludeMe"] = temp2; - - vector namesInSample; - - vector Groups = tMap->getNamesOfGroups(); - for (int i = 0; i < Groups.size(); i++) { - - if (m->inUsersGroups(Groups[i], m->getGroups())) { - if (m->control_pressed) { break; } - - vector thisGroup; thisGroup.push_back(Groups[i]); - vector thisGroupsSeqs = tMap->getNamesSeqs(thisGroup); - int thisSize = thisGroupsSeqs.size(); - vector temp; - sample[Groups[i]] = temp; - - if (thisSize >= size) { - - random_shuffle(thisGroupsSeqs.begin(), thisGroupsSeqs.end()); - - for (int j = 0; j < size; j++) { sample[Groups[i]].push_back(thisGroupsSeqs[j]); namesInSample.push_back(thisGroupsSeqs[j]); } - for (int j = size; j < thisSize; j++) { sample["doNotIncludeMe"].push_back(thisGroupsSeqs[j]); } - - }else { m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; } - } - } - - return namesInSample; - } - catch(exception& e) { - m->errorOut(e, "SubSample", "getSample-TreeMap"); - exit(1); - } -} - -//********************************************************************************************************************** -vector SubSample::getSample(TreeMap* tMap, int size) { - try { - vector sample; - - vector Groups = tMap->getNamesOfGroups(); - for (int i = 0; i < Groups.size(); i++) { - - if (m->inUsersGroups(Groups[i], m->getGroups())) { - if (m->control_pressed) { break; } - - vector thisGroup; thisGroup.push_back(Groups[i]); - vector thisGroupsSeqs = tMap->getNamesSeqs(thisGroup); - int thisSize = thisGroupsSeqs.size(); - - if (thisSize >= size) { - - 
random_shuffle(thisGroupsSeqs.begin(), thisGroupsSeqs.end()); - - for (int j = 0; j < size; j++) { sample.push_back(thisGroupsSeqs[j]); } - }else { m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; } - } - } - - return sample; - } - catch(exception& e) { - m->errorOut(e, "SubSample", "getSample-TreeMap"); - exit(1); - } -} -//********************************************************************************************************************** -vector SubSample::getSample(TreeMap* tMap, vector Groups) { - try { - vector sample; - - //vector Groups = tMap->getNamesOfGroups(); - for (int i = 0; i < Groups.size(); i++) { - - if (m->control_pressed) { break; } - - vector thisGroup; thisGroup.push_back(Groups[i]); - vector thisGroupsSeqs = tMap->getNamesSeqs(thisGroup); - int thisSize = thisGroupsSeqs.size(); - - for (int j = 0; j < thisSize; j++) { sample.push_back(thisGroupsSeqs[j]); } - } - - return sample; - } - catch(exception& e) { - m->errorOut(e, "SubSample", "getSample-TreeMap"); - exit(1); - } -} -//********************************************************************************************************************** vector SubSample::getSample(vector& thislookup, int size) { try { @@ -366,7 +264,164 @@ int SubSample::getSample(SAbundVector*& sabund, int size) { m->errorOut(e, "SubSampleCommand", "getSample"); exit(1); } -} +} +//********************************************************************************************************************** +CountTable SubSample::getSample(CountTable& ct, int size, vector Groups) { + try { + if (!ct.hasGroupInfo()) { m->mothurOut("[ERROR]: Cannot subsample by group because your count table doesn't have group information.\n"); m->control_pressed = true; } + + CountTable sampledCt; + map > tempCount; + for (int i = 0; i < Groups.size(); i++) { + sampledCt.addGroup(Groups[i]); + + vector names = ct.getNamesOfSeqs(Groups[i]); + vector 
allNames; + for (int j = 0; j < names.size(); j++) { + + if (m->control_pressed) { return sampledCt; } + + int num = ct. getGroupCount(names[j], Groups[i]); + for (int k = 0; k < num; k++) { allNames.push_back(names[j]); } + } + + random_shuffle(allNames.begin(), allNames.end()); + + if (allNames.size() < size) { m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; } + else{ + for (int j = 0; j < size; j++) { + + if (m->control_pressed) { return sampledCt; } + + map >::iterator it = tempCount.find(allNames[j]); + + if (it == tempCount.end()) { //we have not seen this sequence at all yet + vector tempGroups; tempGroups.resize(Groups.size(), 0); + tempGroups[i]++; + tempCount[allNames[j]] = tempGroups; + }else{ + tempCount[allNames[j]][i]++; + } + } + } + } + + //build count table + for (map >::iterator it = tempCount.begin(); it != tempCount.end();) { + sampledCt.push_back(it->first, it->second); + tempCount.erase(it++); + } + + return sampledCt; + } + catch(exception& e) { + m->errorOut(e, "SubSampleCommand", "getSample"); + exit(1); + } +} +//********************************************************************************************************************** +CountTable SubSample::getSample(CountTable& ct, int size, vector Groups, bool pickedGroups) { + try { + CountTable sampledCt; + if (!ct.hasGroupInfo() && pickedGroups) { m->mothurOut("[ERROR]: Cannot subsample with groups because your count table doesn't have group information.\n"); m->control_pressed = true; return sampledCt; } + + if (ct.hasGroupInfo()) { + map > tempCount; + vector allNames; + map groupMap; + + vector myGroups; + if (pickedGroups) { myGroups = Groups; } + else { myGroups = ct.getNamesOfGroups(); } + + for (int i = 0; i < myGroups.size(); i++) { + sampledCt.addGroup(myGroups[i]); + groupMap[myGroups[i]] = i; + + vector names = ct.getNamesOfSeqs(myGroups[i]); + for (int j = 0; j < names.size(); j++) { + + if 
(m->control_pressed) { return sampledCt; } + + int num = ct. getGroupCount(names[j], myGroups[i]); + for (int k = 0; k < num; k++) { + item temp(names[j], myGroups[i]); + allNames.push_back(temp); + } + } + } + + random_shuffle(allNames.begin(), allNames.end()); + + if (allNames.size() < size) { + if (pickedGroups) { m->mothurOut("[ERROR]: You have selected a size that is larger than the number of sequences.\n"); } + else { m->mothurOut("[ERROR]: You have selected a size that is larger than the number of sequences in the groups you chose.\n"); } + m->control_pressed = true; return sampledCt; } + else{ + for (int j = 0; j < size; j++) { + + if (m->control_pressed) { return sampledCt; } + + map >::iterator it = tempCount.find(allNames[j].name); + + if (it == tempCount.end()) { //we have not seen this sequence at all yet + vector tempGroups; tempGroups.resize(myGroups.size(), 0); + tempGroups[groupMap[allNames[j].group]]++; + tempCount[allNames[j].name] = tempGroups; + }else{ + tempCount[allNames[j].name][groupMap[allNames[j].group]]++; + } + } + } + + //build count table + for (map >::iterator it = tempCount.begin(); it != tempCount.end();) { + sampledCt.push_back(it->first, it->second); + tempCount.erase(it++); + } + + //remove empty groups + for (int i = 0; i < myGroups.size(); i++) { if (sampledCt.getGroupCount(myGroups[i]) == 0) { sampledCt.removeGroup(myGroups[i]); } } + + }else { + vector names = ct.getNamesOfSeqs(); + map nameMap; + vector allNames; + + for (int i = 0; i < names.size(); i++) { + int num = ct.getNumSeqs(names[i]); + for (int j = 0; j < num; j++) { allNames.push_back(names[i]); } + } + + if (allNames.size() < size) { m->mothurOut("[ERROR]: You have selected a size that is larger than the number of sequences.\n"); m->control_pressed = true; return sampledCt; } + else { + random_shuffle(allNames.begin(), allNames.end()); + + for (int j = 0; j < size; j++) { + if (m->control_pressed) { return sampledCt; } + + map::iterator it = 
nameMap.find(allNames[j]); + + //we have not seen this sequence at all yet + if (it == nameMap.end()) { nameMap[allNames[j]] = 1; } + else{ nameMap[allNames[j]]++; } + } + + //build count table + for (map::iterator it = nameMap.begin(); it != nameMap.end();) { + sampledCt.push_back(it->first, it->second); + nameMap.erase(it++); + } + } + } + + return sampledCt; + } + catch(exception& e) { + m->errorOut(e, "SubSampleCommand", "getSample"); + exit(1); + } +} //********************************************************************************************************************** diff --git a/subsample.h b/subsample.h index b00f1a7..fdf8576 100644 --- a/subsample.h +++ b/subsample.h @@ -13,6 +13,16 @@ #include "sharedrabundvector.h" #include "treemap.h" #include "tree.h" +#include "counttable.h" + +struct item { + string name; + string group; + + item() {} + item(string n, string g) : name(n), group(g) {} + ~item() {} +}; //subsampling overwrites the sharedRabunds. If you need to reuse the original use the getSamplePreserve function. @@ -24,20 +34,16 @@ class SubSample { ~SubSample() {} vector getSample(vector&, int); //returns the bin labels for the subsample, mothurOuts binlabels are preserved so you can run this multiple times. Overwrites original vector passed in, if you need to preserve it deep copy first. - - //Tree* getSample(Tree*, TreeMap*, map, int); //creates new subsampled tree, destroys treemap so copy if needed. - Tree* getSample(Tree*, TreeMap*, TreeMap*, int, map); //creates new subsampled tree. Uses first treemap to fill new treemap with sabsampled seqs. Sets groups of seqs not in subsample to "doNotIncludeMe". + Tree* getSample(Tree*, CountTable*, CountTable*, int); //creates new subsampled tree. Uses first counttable to fill new counttable with sabsampled seqs. Sets groups of seqs not in subsample to "doNotIncludeMe". 
int getSample(SAbundVector*&, int); //destroys sabundvector passed in, so copy it if you need it + CountTable getSample(CountTable&, int, vector); //subsample a countTable bygroup(same number sampled from each group, returns subsampled countTable + CountTable getSample(CountTable&, int, vector, bool); //subsample a countTable. If you want to only sample from specific groups, pass in groups in the vector and set bool=true, otherwise set bool=false. private: MothurOut* m; int eliminateZeroOTUS(vector&); - - vector getSample(TreeMap*, vector); - vector getSample(TreeMap*, int); //names of seqs to include in sample tree - vector getSample(TreeMap* tMap, int size, map >& sample); //sample maps group -> seqs in group. seqs not in sample are in doNotIncludeMe group - map deconvolute(map wholeSet, vector& subsampleWanted); //returns new nameMap containing only subsampled names, and removes redundants from subsampled wanted because it makes the new nameMap. + map deconvolute(map wholeSet, vector& subsampleWanted); //returns new nameMap containing only subsampled names, and removes redundants from subsampled wanted because it makes the new nameMap. 
}; diff --git a/subsamplecommand.cpp b/subsamplecommand.cpp index f9cb1e6..e1793f4 100644 --- a/subsamplecommand.cpp +++ b/subsamplecommand.cpp @@ -16,8 +16,9 @@ vector SubSampleCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FLSSR", "none",false,false); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter plist("list", "InputTypes", "", "", "none", "FLSSR", "none",false,false); parameters.push_back(plist); CommandParameter pshared("shared", "InputTypes", "", "", "none", "FLSSR", "none",false,false); parameters.push_back(pshared); CommandParameter prabund("rabund", "InputTypes", "", "", "none", "FLSSR", "none",false,false); parameters.push_back(prabund); @@ -43,7 +44,7 @@ string SubSampleCommand::getHelpString(){ try { string helpString = ""; helpString += "The sub.sample command is designed to be used as a way to normalize your data, or create a smaller set from your original set.\n"; - helpString += "The sub.sample command parameters are fasta, name, list, group, rabund, sabund, shared, groups, size, persample and label. You must provide a fasta, list, sabund, rabund or shared file as an input file.\n"; + helpString += "The sub.sample command parameters are fasta, name, list, group, count, rabund, sabund, shared, groups, size, persample and label. 
You must provide a fasta, list, sabund, rabund or shared file as an input file.\n"; helpString += "The namefile is only used with the fasta file, not with the listfile, because the list file should contain all sequences.\n"; helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like included. The group names are separated by dashes.\n"; helpString += "The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n"; @@ -76,6 +77,7 @@ string SubSampleCommand::getOutputFileNameTag(string type, string inputName=""){ if (type == "fasta") { outputFileName = "subsample" + m->getExtension(inputName); } else if (type == "sabund") { outputFileName = "subsample" + m->getExtension(inputName); } else if (type == "name") { outputFileName = "subsample" + m->getExtension(inputName); } + else if (type == "count") { outputFileName = "subsample" + m->getExtension(inputName); } else if (type == "group") { outputFileName = "subsample" + m->getExtension(inputName); } else if (type == "list") { outputFileName = "subsample" + m->getExtension(inputName); } else if (type == "rabund") { outputFileName = "subsample" + m->getExtension(inputName); } @@ -103,6 +105,7 @@ SubSampleCommand::SubSampleCommand(){ outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; outputTypes["group"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "SubSampleCommand", "GetRelAbundCommand"); @@ -142,6 +145,7 @@ SubSampleCommand::SubSampleCommand(string option) { outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; outputTypes["group"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -206,6 +210,14 @@ 
SubSampleCommand::SubSampleCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } //check for required parameters @@ -244,6 +256,22 @@ SubSampleCommand::SubSampleCommand(string option) { else if (groupfile == "not found") { groupfile = ""; } else { m->setGroupFile(groupfile); } + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { + m->setCountTableFile(countfile); + ct.readTable(countfile); + } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + //check for optional parameter and set defaults // ...at some point should added some additional type checking... 
label = validParameter.validFile(parameters, "label", false); @@ -267,26 +295,34 @@ SubSampleCommand::SubSampleCommand(string option) { temp = validParameter.validFile(parameters, "persample", false); if (temp == "not found"){ temp = "f"; } persample = m->isTrue(temp); - if (groupfile == "") { persample = false; } + if ((groupfile == "") && (countfile == "")) { persample = false; } + if (countfile != "") { + if (!ct.hasGroupInfo()) { + persample = false; + if (pickedGroups) { m->mothurOut("You cannot pick groups without group info in your count file."); m->mothurOutEndLine(); abort = true; } + } + } if ((namefile != "") && (fastafile == "")) { m->mothurOut("You may only use a namefile with a fastafile."); m->mothurOutEndLine(); abort = true; } if ((fastafile == "") && (listfile == "") && (sabundfile == "") && (rabundfile == "") && (sharedfile == "")) { m->mothurOut("You must provide a fasta, list, sabund, rabund or shared file as an input file."); m->mothurOutEndLine(); abort = true; } - if (pickedGroups && ((groupfile == "") && (sharedfile == ""))) { - m->mothurOut("You cannot pick groups without a valid group file or shared file."); m->mothurOutEndLine(); abort = true; } + if (pickedGroups && ((groupfile == "") && (sharedfile == "") && (countfile == ""))) { + m->mothurOut("You cannot pick groups without a valid group, count or shared file."); m->mothurOutEndLine(); abort = true; } - if ((groupfile != "") && ((fastafile == "") && (listfile == ""))) { - m->mothurOut("Group file only valid with listfile or fastafile."); m->mothurOutEndLine(); abort = true; } + if (((groupfile != "") || (countfile != "")) && ((fastafile == "") && (listfile == ""))) { + m->mothurOut("Group or count files are only valid with listfile or fastafile."); m->mothurOutEndLine(); abort = true; } - if ((groupfile != "") && ((fastafile != "") && (listfile != ""))) { - m->mothurOut("A new group file can only be made from the subsample of a listfile or fastafile, not both. 
Please correct."); m->mothurOutEndLine(); abort = true; } + if (((groupfile != "") || (countfile != "")) && ((fastafile != "") && (listfile != ""))) { + m->mothurOut("A new group or count file can only be made from the subsample of a listfile or fastafile, not both. Please correct."); m->mothurOutEndLine(); abort = true; } - if ((fastafile != "") && (namefile == "")) { - vector files; files.push_back(fastafile); - parser.getNameFile(files); - } + if (countfile == "") { + if ((fastafile != "") && (namefile == "")) { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } } @@ -353,6 +389,11 @@ int SubSampleCommand::execute(){ if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); } } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } m->mothurOutEndLine(); @@ -374,49 +415,67 @@ int SubSampleCommand::getSubSampleFasta() { if (namefile != "") { readNames(); } //fills names with all names in namefile. 
else { getNames(); }//no name file, so get list of names to pick from - GroupMap* groupMap; + GroupMap groupMap; if (groupfile != "") { - - groupMap = new GroupMap(groupfile); - groupMap->readMap(); + groupMap.readMap(groupfile); //takes care of user setting groupNames that are invalid or setting groups=all - SharedUtil* util = new SharedUtil(); - vector namesGroups = groupMap->getNamesOfGroups(); - util->setGroups(Groups, namesGroups); - delete util; + SharedUtil util; + vector namesGroups = groupMap.getNamesOfGroups(); + util.setGroups(Groups, namesGroups); //file mismatch quit - if (names.size() != groupMap->getNumSeqs()) { - m->mothurOut("[ERROR]: your fasta file contains " + toString(names.size()) + " sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct."); + if (names.size() != groupMap.getNumSeqs()) { + m->mothurOut("[ERROR]: your fasta file contains " + toString(names.size()) + " sequences, and your groupfile contains " + toString(groupMap.getNumSeqs()) + ", please correct."); m->mothurOutEndLine(); - delete groupMap; return 0; } - } + }else if (countfile != "") { + if (ct.hasGroupInfo()) { + SharedUtil util; + vector namesGroups = ct.getNamesOfGroups(); + util.setGroups(Groups, namesGroups); + } + + //file mismatch quit + if (names.size() != ct.getNumUniqueSeqs()) { + m->mothurOut("[ERROR]: your fasta file contains " + toString(names.size()) + " sequences, and your count file contains " + toString(ct.getNumUniqueSeqs()) + " unique sequences, please correct."); + m->mothurOutEndLine(); + return 0; + } + } if (m->control_pressed) { return 0; } - //make sure that if your picked groups size is not too big - int thisSize = names.size(); + int thisSize = 0; + if (countfile == "") { thisSize = names.size(); } + else { thisSize = ct. 
getNumSeqs(); } //all seqs not just unique + if (persample) { if (size == 0) { //user has not set size, set size = smallest samples size - size = groupMap->getNumSeqs(Groups[0]); + if (countfile == "") { size = groupMap.getNumSeqs(Groups[0]); } + else { size = ct.getGroupCount(Groups[0]); } + for (int i = 1; i < Groups.size(); i++) { - int thisSize = groupMap->getNumSeqs(Groups[i]); + int thisSize = 0; + if (countfile == "") { thisSize = groupMap.getNumSeqs(Groups[i]); } + else { thisSize = ct.getGroupCount(Groups[i]); } if (thisSize < size) { size = thisSize; } } }else { //make sure size is not too large vector newGroups; for (int i = 0; i < Groups.size(); i++) { - int thisSize = groupMap->getNumSeqs(Groups[i]); + int thisSize = 0; + if (countfile == "") { thisSize = groupMap.getNumSeqs(Groups[i]); } + else { thisSize = ct.getGroupCount(Groups[i]); } if (thisSize >= size) { newGroups.push_back(Groups[i]); } else { m->mothurOut("You have selected a size that is larger than " + Groups[i] + " number of sequences, removing " + Groups[i] + "."); m->mothurOutEndLine(); } } Groups = newGroups; + if (newGroups.size() == 0) { m->mothurOut("[ERROR]: all groups removed."); m->mothurOutEndLine(); m->control_pressed = true; } } m->mothurOut("Sampling " + toString(size) + " from each group."); m->mothurOutEndLine(); @@ -424,7 +483,8 @@ int SubSampleCommand::getSubSampleFasta() { if (pickedGroups) { int total = 0; for(int i = 0; i < Groups.size(); i++) { - total += groupMap->getNumSeqs(Groups[i]); + if (countfile == "") { total += groupMap.getNumSeqs(Groups[i]); } + else { total += ct.getGroupCount(Groups[i]); } } if (size == 0) { //user has not set size, set size = 10% samples size @@ -442,64 +502,87 @@ int SubSampleCommand::getSubSampleFasta() { } if (size == 0) { //user has not set size, set size = 10% samples size - size = int (names.size() * 0.10); - } - - if (size > thisSize) { m->mothurOut("Your fasta file only contains " + toString(thisSize) + " sequences. 
Setting size to " + toString(thisSize) + "."); m->mothurOutEndLine(); - size = thisSize; + if (countfile == "") { size = int (names.size() * 0.10); } + else { size = int (ct.getNumSeqs() * 0.10); } } - if (!pickedGroups) { m->mothurOut("Sampling " + toString(size) + " from " + toString(thisSize) + "."); m->mothurOutEndLine(); } + + if (size > thisSize) { m->mothurOut("Your fasta file only contains " + toString(thisSize) + " sequences. Setting size to " + toString(thisSize) + "."); m->mothurOutEndLine(); + size = thisSize; + } + + if (!pickedGroups) { m->mothurOut("Sampling " + toString(size) + " from " + toString(thisSize) + "."); m->mothurOutEndLine(); } } random_shuffle(names.begin(), names.end()); set subset; //dont want repeat sequence names added if (persample) { - //initialize counts - map groupCounts; - map::iterator itGroupCounts; - for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; } + if (countfile == "") { + //initialize counts + map groupCounts; + map::iterator itGroupCounts; + for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; } - for (int j = 0; j < names.size(); j++) { + for (int j = 0; j < names.size(); j++) { - if (m->control_pressed) { return 0; } + if (m->control_pressed) { return 0; } - string group = groupMap->getGroup(names[j]); - if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } - else{ - itGroupCounts = groupCounts.find(group); - if (itGroupCounts != groupCounts.end()) { - if (groupCounts[group] < size) { subset.insert(names[j]); groupCounts[group]++; } - } - } - } + string group = groupMap.getGroup(names[j]); + if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. 
please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } + else{ + itGroupCounts = groupCounts.find(group); + if (itGroupCounts != groupCounts.end()) { + if (groupCounts[group] < size) { subset.insert(names[j]); groupCounts[group]++; } + } + } + } + }else { + SubSample sample; + CountTable sampledCt = sample.getSample(ct, size, Groups); + vector sampledSeqs = sampledCt.getNamesOfSeqs(); + for (int i = 0; i < sampledSeqs.size(); i++) { subset.insert(sampledSeqs[i]); } + + string countOutputDir = outputDir; + if (outputDir == "") { countOutputDir += m->hasPath(countfile); } + string countOutputFileName = countOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile); + outputTypes["count"].push_back(countOutputFileName); outputNames.push_back(countOutputFileName); + sampledCt.printTable(countOutputFileName); + } }else { - - //randomly select a subset of those names to include in the subsample - //since names was randomly shuffled just grab the next one - for (int j = 0; j < names.size(); j++) { - - if (m->control_pressed) { return 0; } - - if (groupfile != "") { //if there is a groupfile given fill in group info - string group = groupMap->getGroup(names[j]); - if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } - - if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups - if (m->inUsersGroups(group, Groups)) { - subset.insert(names[j]); - } - }else{ - subset.insert(names[j]); - } - }else{ //save everyone, group - subset.insert(names[j]); - } - - //do we have enough?? 
- if (subset.size() == size) { break; } - } + if (countfile == "") { + //randomly select a subset of those names to include in the subsample + //since names was randomly shuffled just grab the next one + for (int j = 0; j < names.size(); j++) { + + if (m->control_pressed) { return 0; } + + if (groupfile != "") { //if there is a groupfile given fill in group info + string group = groupMap.getGroup(names[j]); + if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } + + if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups + if (m->inUsersGroups(group, Groups)) { subset.insert(names[j]); } + }else{ subset.insert(names[j]); } + }else{ //save everyone, group + subset.insert(names[j]); + } + + //do we have enough?? + if (subset.size() == size) { break; } + } + }else { + SubSample sample; + CountTable sampledCt = sample.getSample(ct, size, Groups, pickedGroups); + vector sampledSeqs = sampledCt.getNamesOfSeqs(); + for (int i = 0; i < sampledSeqs.size(); i++) { subset.insert(sampledSeqs[i]); } + + string countOutputDir = outputDir; + if (outputDir == "") { countOutputDir += m->hasPath(countfile); } + string countOutputFileName = countOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile); + outputTypes["count"].push_back(countOutputFileName); outputNames.push_back(countOutputFileName); + sampledCt.printTable(countOutputFileName); + } } if (subset.size() == 0) { m->mothurOut("The size you selected is too large, skipping fasta file."); m->mothurOutEndLine(); return 0; } @@ -808,7 +891,7 @@ int SubSampleCommand::processShared(vector& thislookup) { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(sharedfile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile)) + thislookup[0]->getLabel() + 
getOutputFileNameTag("shared", sharedfile); + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile)) + thislookup[0]->getLabel() + "." +getOutputFileNameTag("shared", sharedfile); SubSample sample; vector subsampledLabels = sample.getSample(thislookup, size); @@ -858,67 +941,76 @@ int SubSampleCommand::getSubSampleList() { //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. set processedLabels; set userLabels = labels; - + ofstream outGroup; - GroupMap* groupMap; + GroupMap groupMap; if (groupfile != "") { - - groupMap = new GroupMap(groupfile); - groupMap->readMap(); + groupMap.readMap(groupfile); //takes care of user setting groupNames that are invalid or setting groups=all - SharedUtil* util = new SharedUtil(); - vector namesGroups = groupMap->getNamesOfGroups(); - util->setGroups(Groups, namesGroups); - delete util; + SharedUtil util; vector namesGroups = groupMap.getNamesOfGroups(); util.setGroups(Groups, namesGroups); //create outputfiles string groupOutputDir = outputDir; if (outputDir == "") { groupOutputDir += m->hasPath(groupfile); } string groupOutputFileName = groupOutputDir + m->getRootName(m->getSimpleName(groupfile)) + "subsample" + m->getExtension(groupfile); - m->openOutputFile(groupOutputFileName, outGroup); outputTypes["group"].push_back(groupOutputFileName); outputNames.push_back(groupOutputFileName); //file mismatch quit - if (list->getNumSeqs() != groupMap->getNumSeqs()) { - m->mothurOut("[ERROR]: your list file contains " + toString(list->getNumSeqs()) + " sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct."); + if (list->getNumSeqs() != groupMap.getNumSeqs()) { + m->mothurOut("[ERROR]: your list file contains " + toString(list->getNumSeqs()) + " sequences, and your groupfile contains " + toString(groupMap.getNumSeqs()) + ", please correct."); + m->mothurOutEndLine(); delete list; delete input; out.close(); 
outGroup.close(); return 0; + } + }else if (countfile != "") { + if (ct.hasGroupInfo()) { + SharedUtil util; + vector namesGroups = ct.getNamesOfGroups(); + util.setGroups(Groups, namesGroups); + } + + //file mismatch quit + if (list->getNumSeqs() != ct.getNumUniqueSeqs()) { + m->mothurOut("[ERROR]: your list file contains " + toString(list->getNumSeqs()) + " sequences, and your count file contains " + toString(ct.getNumUniqueSeqs()) + " unique sequences, please correct."); m->mothurOutEndLine(); - delete groupMap; - delete list; - delete input; - out.close(); - outGroup.close(); return 0; - } - } - + } + } + //make sure that if your picked groups size is not too big if (persample) { if (size == 0) { //user has not set size, set size = smallest samples size - size = groupMap->getNumSeqs(Groups[0]); + if (countfile == "") { size = groupMap.getNumSeqs(Groups[0]); } + else { size = ct.getGroupCount(Groups[0]); } + for (int i = 1; i < Groups.size(); i++) { - int thisSize = groupMap->getNumSeqs(Groups[i]); + int thisSize = 0; + if (countfile == "") { thisSize = groupMap.getNumSeqs(Groups[i]); } + else { thisSize = ct.getGroupCount(Groups[i]); } if (thisSize < size) { size = thisSize; } } }else { //make sure size is not too large vector newGroups; for (int i = 0; i < Groups.size(); i++) { - int thisSize = groupMap->getNumSeqs(Groups[i]); + int thisSize = 0; + if (countfile == "") { thisSize = groupMap.getNumSeqs(Groups[i]); } + else { thisSize = ct.getGroupCount(Groups[i]); } if (thisSize >= size) { newGroups.push_back(Groups[i]); } else { m->mothurOut("You have selected a size that is larger than " + Groups[i] + " number of sequences, removing " + Groups[i] + "."); m->mothurOutEndLine(); } } Groups = newGroups; + if (newGroups.size() == 0) { m->mothurOut("[ERROR]: all groups removed."); m->mothurOutEndLine(); m->control_pressed = true; } } - m->mothurOut("Sampling " + toString(size) + " from each group."); m->mothurOutEndLine(); + m->mothurOut("Sampling " + 
toString(size) + " from each group."); m->mothurOutEndLine(); }else{ - if (pickedGroups) { + if (pickedGroups) { int total = 0; for(int i = 0; i < Groups.size(); i++) { - total += groupMap->getNumSeqs(Groups[i]); + if (countfile == "") { total += groupMap.getNumSeqs(Groups[i]); } + else { total += ct.getGroupCount(Groups[i]); } } if (size == 0) { //user has not set size, set size = 10% samples size @@ -926,122 +1018,110 @@ int SubSampleCommand::getSubSampleList() { } if (total < size) { - m->mothurOut("Your size is too large for the number of groups you selected. Adjusting to " + toString(int (total * 0.10)) + "."); m->mothurOutEndLine(); + if (size != 0) { + m->mothurOut("Your size is too large for the number of groups you selected. Adjusting to " + toString(int (total * 0.10)) + "."); m->mothurOutEndLine(); + } size = int (total * 0.10); } m->mothurOut("Sampling " + toString(size) + " from " + toString(total) + "."); m->mothurOutEndLine(); - }else{ - - if (size == 0) { //user has not set size, set size = 10% samples size - size = int (list->getNumSeqs() * 0.10); + }else { + if (size == 0) { //user has not set size, set size = 10% samples size + if (countfile == "") { size = int (list->getNumSeqs() * 0.10); } + else { size = int (ct.getNumSeqs() * 0.10); } } - int thisSize = list->getNumSeqs(); + int thisSize = 0; + if (countfile == "") { thisSize = list->getNumSeqs(); } + else { thisSize = ct.getNumSeqs(); } + if (size > thisSize) { m->mothurOut("Your list file only contains " + toString(thisSize) + " sequences. 
Setting size to " + toString(thisSize) + "."); m->mothurOutEndLine(); size = thisSize; } - m->mothurOut("Sampling " + toString(size) + " from " + toString(list->getNumSeqs()) + "."); m->mothurOutEndLine(); - } - } - - - //fill names - for (int i = 0; i < list->getNumBins(); i++) { - string binnames = list->get(i); - - //parse names - string individual = ""; - int length = binnames.length(); - for(int j=0;jgetGroup(individual); - if (group == "not found") { m->mothurOut("[ERROR]: " + individual + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } - - if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups - if (m->inUsersGroups(group, Groups)) { - names.push_back(individual); - } - }else{ - names.push_back(individual); - } - }else{ //save everyone, group - names.push_back(individual); - } - individual = ""; - } - else{ - individual += binnames[j]; - } - } - //save last name - if (groupfile != "") { //if there is a groupfile given fill in group info - string group = groupMap->getGroup(individual); - if (group == "not found") { m->mothurOut("[ERROR]: " + individual + " is not in your groupfile. 
please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } - - if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups - if (m->inUsersGroups(group, Groups)) { - names.push_back(individual); - } - }else{ - names.push_back(individual); - } - }else{ //save everyone, group - names.push_back(individual); - } - } - - random_shuffle(names.begin(), names.end()); - - //randomly select a subset of those names to include in the subsample - set subset; //dont want repeat sequence names added - if (persample) { - //initialize counts - map groupCounts; - map::iterator itGroupCounts; - for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; } - - for (int j = 0; j < names.size(); j++) { - - if (m->control_pressed) { return 0; } - - string group = groupMap->getGroup(names[j]); - if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } - else{ - itGroupCounts = groupCounts.find(group); - if (itGroupCounts != groupCounts.end()) { - if (groupCounts[group] < size) { subset.insert(names[j]); groupCounts[group]++; } - } - } - } - }else{ - for (int j = 0; j < size; j++) { - - if (m->control_pressed) { break; } - - subset.insert(names[j]); - } - } - - if (groupfile != "") { - //write out new groupfile - for (set::iterator it = subset.begin(); it != subset.end(); it++) { - string group = groupMap->getGroup(*it); - if (group == "not found") { group = "NOTFOUND"; } - - outGroup << *it << '\t' << group << endl; - } - outGroup.close(); delete groupMap; - } + m->mothurOut("Sampling " + toString(size) + " from " + toString(thisSize) + "."); m->mothurOutEndLine(); + } + } + set subset; //dont want repeat sequence names added + if (countfile == "") { + //fill names + for (int i = 0; i < list->getNumBins(); i++) { + string binnames = list->get(i); + vector thisBin; + m->splitAtComma(binnames, thisBin); + + for(int 
j=0;jmothurOut("[ERROR]: " + thisBin[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } + + //if hte user picked groups, we only want to keep the names of sequences from those groups + if (pickedGroups) { if (m->inUsersGroups(group, Groups)) { names.push_back(thisBin[j]); } } + else{ names.push_back(thisBin[j]); } + }//save everyone, group + else{ names.push_back(thisBin[j]); } + } + } + + random_shuffle(names.begin(), names.end()); + + //randomly select a subset of those names to include in the subsample + if (persample) { + //initialize counts + map groupCounts; + map::iterator itGroupCounts; + for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; } + + for (int j = 0; j < names.size(); j++) { + + if (m->control_pressed) { delete list; delete input; return 0; } + + string group = groupMap.getGroup(names[j]); + if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } + else{ + itGroupCounts = groupCounts.find(group); + if (itGroupCounts != groupCounts.end()) { + if (groupCounts[group] < size) { subset.insert(names[j]); groupCounts[group]++; } + } + } + } + }else{ + for (int j = 0; j < size; j++) { + if (m->control_pressed) { break; } + subset.insert(names[j]); + } + } + + if (groupfile != "") { + //write out new groupfile + for (set::iterator it = subset.begin(); it != subset.end(); it++) { + string group = groupMap.getGroup(*it); + if (group == "not found") { group = "NOTFOUND"; } + outGroup << *it << '\t' << group << endl; + } + outGroup.close(); + } + }else { + SubSample sample; CountTable sampledCt; + + if (persample) { sampledCt = sample.getSample(ct, size, Groups); } + else { sampledCt = sample.getSample(ct, size, Groups, pickedGroups); } + + vector sampledSeqs = sampledCt.getNamesOfSeqs(); + for (int i = 0; i < sampledSeqs.size(); i++) { subset.insert(sampledSeqs[i]); } + + string countOutputDir = 
outputDir; + if (outputDir == "") { countOutputDir += m->hasPath(countfile); } + string countOutputFileName = countOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile); + outputTypes["count"].push_back(countOutputFileName); outputNames.push_back(countOutputFileName); + sampledCt.printTable(countOutputFileName); + } //as long as you are not at the end of the file or done wih the lines you want while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { - if (m->control_pressed) { delete list; delete input; out.close(); return 0; } + if (m->control_pressed) { delete list; delete input; out.close(); return 0; } if(allLines == 1 || labels.count(list->getLabel()) == 1){ @@ -1132,22 +1212,12 @@ int SubSampleCommand::processList(ListVector*& list, ofstream& out, set& if (m->control_pressed) { break; } - string binnames = list->get(i); - - //parse names - string individual = ""; - string newNames = ""; - int length = binnames.length(); - for(int j=0;jget(i); + vector binnames; + m->splitAtComma(bin, binnames); + string newNames = ""; + for(int j=0;j labels; //holds labels to be used string groups, label, outputDir; vector Groups, outputNames; int size; vector names; map > nameMap; + CountTable ct; int getSubSampleShared(); int getSubSampleList(); diff --git a/summaryqualcommand.cpp b/summaryqualcommand.cpp index 5d79713..5a07367 100644 --- a/summaryqualcommand.cpp +++ b/summaryqualcommand.cpp @@ -8,13 +8,14 @@ */ #include "summaryqualcommand.h" - +#include "counttable.h" //********************************************************************************************************************** vector SummaryQualCommand::setParameters(){ try { CommandParameter pqual("qfile", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pqual); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pname("name", 
"InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -32,9 +33,10 @@ vector SummaryQualCommand::setParameters(){ string SummaryQualCommand::getHelpString(){ try { string helpString = ""; - helpString += "The summary.qual command reads a quality file and an optional name file, and summarizes the quality information.\n"; - helpString += "The summary.tax command parameters are qfile, name and processors. qfile is required, unless you have a valid current quality file.\n"; + helpString += "The summary.qual command reads a quality file and an optional name or count file, and summarizes the quality information.\n"; + helpString += "The summary.tax command parameters are qfile, name, count and processors. qfile is required, unless you have a valid current quality file.\n"; helpString += "The name parameter allows you to enter a name file associated with your quality file. \n"; + helpString += "The count parameter allows you to enter a count file associated with your quality file. \n"; helpString += "The summary.qual command should be in the following format: \n"; helpString += "summary.qual(qfile=yourQualityFile) \n"; helpString += "Note: No spaces between parameter labels (i.e. qfile), '=' and parameters (i.e.yourQualityFile).\n"; @@ -122,6 +124,14 @@ SummaryQualCommand::SummaryQualCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. 
if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } //initialize outputTypes @@ -141,6 +151,13 @@ SummaryQualCommand::SummaryQualCommand(string option) { if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; countfile = ""; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((countfile != "") && (namefile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ @@ -152,10 +169,13 @@ SummaryQualCommand::SummaryQualCommand(string option) { m->setProcessors(temp); m->mothurConvert(temp, processors); - if (namefile == "") { - vector files; files.push_back(qualfile); - parser.getNameFile(files); - } + + if (countfile == "") { + if (namefile == "") { + vector files; files.push_back(qualfile); + parser.getNameFile(files); + } + } } } catch(exception& e) { @@ -179,7 +199,12 @@ int SummaryQualCommand::execute(){ if (m->control_pressed) { return 0; } if (namefile != "") { nameMap = m->readNames(namefile); } - + else if (countfile != "") { + CountTable ct; + ct.readTable(countfile); + nameMap = ct.getNameMap(); + } + vector positions; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) positions 
= m->divideFile(qualfile, processors); @@ -257,7 +282,7 @@ int SummaryQualCommand::driverCreateSummary(vector& position, vector& if (current.getName() != "") { int num = 1; - if (namefile != "") { + if ((namefile != "") || (countfile != "")) { //make sure this sequence is in the namefile, else error map::iterator it = nameMap.find(current.getName()); @@ -400,11 +425,14 @@ int SummaryQualCommand::createProcessesCreateSummary(vector& position, vect DWORD dwThreadIdArray[processors]; HANDLE hThreadArray[processors]; + bool hasNameMap = false; + if ((namefile !="") || (countfile != "")) { hasNameMap = true; } + //Create processor worker threads. for( int i=0; i& position, vector< if (m->control_pressed) { out.close(); return 0; } - float average = averageQ[i] / (float) position[i]; + double average = averageQ[i] / (float) position[i]; out << i << '\t' << position[i] << '\t' << average << '\t'; for (int j = 0; j < 41; j++) { diff --git a/summaryqualcommand.h b/summaryqualcommand.h index 31390b4..ac65938 100644 --- a/summaryqualcommand.h +++ b/summaryqualcommand.h @@ -35,7 +35,7 @@ public: private: bool abort; - string qualfile, outputDir, namefile; + string qualfile, outputDir, namefile, countfile; vector outputNames; map nameMap; int processors; @@ -62,20 +62,21 @@ struct seqSumQualData { vector position; vector averageQ; vector< vector > scores; - string filename, namefile; + string filename; unsigned long long start; unsigned long long end; int count; MothurOut* m; + bool hasNameMap; map nameMap; ~seqSumQualData(){} - seqSumQualData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, string n, map nam) { + seqSumQualData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, bool n, map nam) { filename = f; m = mout; start = st; end = en; - namefile = n; + hasNameMap = n; nameMap = nam; count = 0; } @@ -109,7 +110,7 @@ static DWORD WINAPI MySeqSumQualThreadFunction(LPVOID lpParam){ if (current.getName() != "") { int num = 1; - 
if (pDataArray->namefile != "") { + if (pDataArray->hasNameMap) { //make sure this sequence is in the namefile, else error map::iterator it = pDataArray->nameMap.find(current.getName()); diff --git a/summarytaxcommand.cpp b/summarytaxcommand.cpp index 3e16e74..e932eee 100644 --- a/summarytaxcommand.cpp +++ b/summarytaxcommand.cpp @@ -14,8 +14,9 @@ vector SummaryTaxCommand::setParameters(){ try { CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptaxonomy); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter preftaxonomy("reftaxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(preftaxonomy); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -34,9 +35,10 @@ string SummaryTaxCommand::getHelpString(){ try { string helpString = ""; helpString += "The summary.tax command reads a taxonomy file and an optional name file, and summarizes the taxonomy information.\n"; - helpString += "The summary.tax command parameters are taxonomy, group and name. 
taxonomy is required, unless you have a valid current taxonomy file.\n"; + helpString += "The summary.tax command parameters are taxonomy, count, group and name. taxonomy is required, unless you have a valid current taxonomy file.\n"; helpString += "The name parameter allows you to enter a name file associated with your taxonomy file. \n"; helpString += "The group parameter allows you add a group file so you can have the summary totals broken up by group.\n"; + helpString += "The count parameter allows you add a count file so you can have the summary totals broken up by group.\n"; helpString += "The reftaxonomy parameter allows you give the name of the reference taxonomy file used when you classified your sequences. It is not required, but providing it will keep the rankIDs in the summary file static.\n"; helpString += "The summary.tax command should be in the following format: \n"; helpString += "summary.tax(taxonomy=yourTaxonomyFile) \n"; @@ -142,6 +144,14 @@ SummaryTaxCommand::SummaryTaxCommand(string option) { if (path == "") { parameters["reftaxonomy"] = inputDir + it->second; } } + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { parameters["count"] = inputDir + it->second; } + } + } //initialize outputTypes @@ -166,7 +176,20 @@ SummaryTaxCommand::SummaryTaxCommand(string option) { if (groupfile == "not open") { groupfile = ""; abort = true; } else if (groupfile == "not found") { groupfile = ""; } else { m->setGroupFile(groupfile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + refTaxonomy = validParameter.validFile(parameters, "reftaxonomy", true); if (refTaxonomy == "not found") { refTaxonomy = ""; m->mothurOut("reftaxonomy is not required, but if given will keep the rankIDs in the summary file static."); m->mothurOutEndLine(); } else if (refTaxonomy == "not open") { refTaxonomy = ""; abort = true; } @@ -177,11 +200,12 @@ SummaryTaxCommand::SummaryTaxCommand(string option) { outputDir += m->hasPath(taxfile); //if user entered a file with a path then preserve it } - if (namefile == "") { - vector files; files.push_back(taxfile); - parser.getNameFile(files); + if (countfile == "") { + if (namefile == "") { + vector files; files.push_back(taxfile); + parser.getNameFile(files); + } } - } } catch(exception& e) { @@ -197,23 +221,35 @@ int SummaryTaxCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } int start = time(NULL); - PhyloSummary* taxaSum; - if (refTaxonomy != "") { - taxaSum = new PhyloSummary(refTaxonomy, groupfile); - }else { - taxaSum = new PhyloSummary(groupfile); - } + GroupMap* groupMap = NULL; + 
CountTable* ct = NULL; + if (groupfile != "") { + groupMap = new GroupMap(groupfile); + groupMap->readMap(); + }else if (countfile != "") { + ct = new CountTable(); + ct->readTable(countfile); + } - if (m->control_pressed) { delete taxaSum; return 0; } + PhyloSummary* taxaSum; + if (countfile != "") { + if (refTaxonomy != "") { taxaSum = new PhyloSummary(refTaxonomy, ct); } + else { taxaSum = new PhyloSummary(ct); } + }else { + if (refTaxonomy != "") { taxaSum = new PhyloSummary(refTaxonomy, groupMap); } + else { taxaSum = new PhyloSummary(groupMap); } + } + + if (m->control_pressed) { if (groupMap != NULL) { delete groupMap; } if (ct != NULL) { delete ct; } delete taxaSum; return 0; } int numSeqs = 0; - if (namefile == "") { numSeqs = taxaSum->summarize(taxfile); } - else { + if ((namefile == "") || (countfile != "")) { numSeqs = taxaSum->summarize(taxfile); } + else if (namefile != "") { map > nameMap; map >::iterator itNames; m->readNames(namefile, nameMap); - if (m->control_pressed) { delete taxaSum; return 0; } + if (m->control_pressed) { if (groupMap != NULL) { delete groupMap; } if (ct != NULL) { delete ct; } delete taxaSum; return 0; } ifstream in; m->openInputFile(taxfile, in); @@ -222,6 +258,9 @@ int SummaryTaxCommand::execute(){ string name, taxon; while(!in.eof()){ + + if (m->control_pressed) { break; } + in >> name >> taxon; m->gobble(in); itNames = nameMap.find(name); @@ -240,7 +279,7 @@ int SummaryTaxCommand::execute(){ in.close(); } - if (m->control_pressed) { delete taxaSum; return 0; } + if (m->control_pressed) { if (groupMap != NULL) { delete groupMap; } if (ct != NULL) { delete ct; } delete taxaSum; return 0; } //print summary file ofstream outTaxTree; @@ -250,6 +289,7 @@ int SummaryTaxCommand::execute(){ outTaxTree.close(); delete taxaSum; + if (groupMap != NULL) { delete groupMap; } if (ct != NULL) { delete ct; } if (m->control_pressed) { m->mothurRemove(summaryFile); return 0; } diff --git a/summarytaxcommand.h b/summarytaxcommand.h index 
5f0630f..e8033e2 100644 --- a/summarytaxcommand.h +++ b/summarytaxcommand.h @@ -11,6 +11,7 @@ */ #include "command.hpp" +#include "counttable.h" /**************************************************************************************************/ @@ -33,7 +34,7 @@ class SummaryTaxCommand : public Command { private: bool abort; - string taxfile, outputDir, namefile, groupfile, refTaxonomy; + string taxfile, outputDir, namefile, groupfile, refTaxonomy, countfile; vector outputNames; map nameMap; }; diff --git a/taxonomynode.cpp b/taxonomynode.cpp new file mode 100755 index 0000000..b90bda1 --- /dev/null +++ b/taxonomynode.cpp @@ -0,0 +1,72 @@ +/* + * taxonomynode.cpp + * + * + * Created by Pat Schloss on 7/8/11. + * Copyright 2011 Patrick D. Schloss. All rights reserved. + * + */ + +/**************************************************************************************************/ + +#include "taxonomynode.h" + +/**************************************************************************************************/ + +TaxonomyNode::TaxonomyNode(string n, int l): name(n), level(l){ + m = MothurOut::getInstance(); + parent = -1; + numChildren = 0; + numSeqs = 0; +} + +/**************************************************************************************************/ + +void TaxonomyNode::setName(string n) { name = n; } + +/**************************************************************************************************/ + +string TaxonomyNode::getName() { return name; } + +/**************************************************************************************************/ + +void TaxonomyNode::setParent(int p) { parent = p; } + +/**************************************************************************************************/ + +int TaxonomyNode::getParent() { return parent; } + +/**************************************************************************************************/ + +void TaxonomyNode::makeChild(string c, int i) { children[c] = i; } + + 
+/**************************************************************************************************/ + +map TaxonomyNode::getChildren() { return children; } + +/**************************************************************************************************/ + +int TaxonomyNode::getChildIndex(string c){ + map::iterator it = children.find(c); + if(it != children.end()) { return it->second; } + else { return -1; } +} + +/**************************************************************************************************/ + +int TaxonomyNode::getNumKids() { return (int)children.size(); } + +/**************************************************************************************************/ + +int TaxonomyNode::getNumSeqs() { return numSeqs; } + +/**************************************************************************************************/ + +void TaxonomyNode::setTotalSeqs(int n) { totalSeqs = n; } + +/**************************************************************************************************/ + +int TaxonomyNode::getLevel() { return level; } + +/**************************************************************************************************/ diff --git a/taxonomynode.h b/taxonomynode.h new file mode 100755 index 0000000..08bad3e --- /dev/null +++ b/taxonomynode.h @@ -0,0 +1,53 @@ +#ifndef TAXONOMYNODE +#define TAXONOMYNODE + +/* + * taxonomynode.h + * + * + * Created by Pat Schloss on 7/8/11. + * Copyright 2011 Patrick D. Schloss. All rights reserved. 
+ * + */ + +/**************************************************************************************************/ + +#include "mothurout.h" +/**************************************************************************************************/ + +class TaxonomyNode { + +public: + TaxonomyNode(); + TaxonomyNode(string, int); + void setName(string); + string getName(); + + + void setParent(int); + int getParent(); + + void makeChild(string, int); + map getChildren(); + int getChildIndex(string); + int getNumKids(); + int getNumSeqs(); + void setTotalSeqs(int); + int getLevel(); + +private: + int parent; + map children; + int numChildren; + int level; + +protected: + MothurOut* m; + int numSeqs; + int totalSeqs; + string name; +}; + +/**************************************************************************************************/ + +#endif diff --git a/tree.cpp b/tree.cpp index 44ecadd..0bd98e0 100644 --- a/tree.cpp +++ b/tree.cpp @@ -10,7 +10,7 @@ #include "tree.h" /*****************************************************************/ -Tree::Tree(int num, TreeMap* t) : tmap(t) { +Tree::Tree(int num, CountTable* t) : ct(t) { try { m = MothurOut::getInstance(); @@ -36,21 +36,20 @@ Tree::Tree(string g) { //do not use tree generated by this its just to extract t } } /*****************************************************************/ -Tree::Tree(TreeMap* t) : tmap(t) { +Tree::Tree(CountTable* t) : ct(t) { try { m = MothurOut::getInstance(); if (m->runParse == true) { parseTreeFile(); m->runParse = false; } -//for(int i = 0; i < globaldata->Treenames.size(); i++) { cout << i << '\t' << globaldata->Treenames[i] << endl; } + numLeaves = m->Treenames.size(); numNodes = 2*numLeaves - 1; tree.resize(numNodes); //initialize groupNodeInfo - for (int i = 0; i < (tmap->getNamesOfGroups()).size(); i++) { - groupNodeInfo[(tmap->getNamesOfGroups())[i]].resize(0); - } + vector namesOfGroups = ct->getNamesOfGroups(); + for (int i = 0; i < namesOfGroups.size(); i++) { 
groupNodeInfo[namesOfGroups[i]].resize(0); } //initialize tree with correct number of nodes, name and group info. for (int i = 0; i < numNodes; i++) { @@ -59,19 +58,35 @@ Tree::Tree(TreeMap* t) : tmap(t) { tree[i].setName(m->Treenames[i]); //save group info - string group = tmap->getGroup(m->Treenames[i]); - - vector tempGroups; tempGroups.push_back(group); - tree[i].setGroup(tempGroups); - groupNodeInfo[group].push_back(i); - - //set pcount and pGroup for groupname to 1. - tree[i].pcount[group] = 1; - tree[i].pGroups[group] = 1; - - //Treemap knows name, group and index to speed up search - tmap->setIndex(m->Treenames[i], i); - + int maxPars = 1; + vector group; + vector counts = ct->getGroupCounts(m->Treenames[i]); + for (int j = 0; j < namesOfGroups.size(); j++) { + if (counts[j] != 0) { //you have seqs from this group + groupNodeInfo[namesOfGroups[j]].push_back(i); + group.push_back(namesOfGroups[j]); + tree[i].pGroups[namesOfGroups[j]] = counts[j]; + tree[i].pcount[namesOfGroups[j]] = counts[j]; + //keep highest group + if(counts[j] > maxPars){ maxPars = counts[j]; } + } + } + tree[i].setGroup(group); + setIndex(m->Treenames[i], i); + + if (maxPars > 1) { //then we have some more dominant groups + //erase all the groups that are less than maxPars because you found a more dominant group. 
+ for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();){ + if(it->second < maxPars){ + tree[i].pGroups.erase(it++); + }else { it++; } + } + //set one remaining groups to 1 + for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();it++){ + tree[i].pGroups[it->first] = 1; + } + }//end if + //intialize non leaf nodes }else if (i > (numLeaves-1)) { tree[i].setName(""); @@ -87,7 +102,7 @@ Tree::Tree(TreeMap* t) : tmap(t) { } } /*****************************************************************/ -Tree::Tree(TreeMap* t, vector< vector >& sims) : tmap(t) { +Tree::Tree(CountTable* t, vector< vector >& sims) : ct(t) { try { m = MothurOut::getInstance(); @@ -98,9 +113,8 @@ Tree::Tree(TreeMap* t, vector< vector >& sims) : tmap(t) { tree.resize(numNodes); //initialize groupNodeInfo - for (int i = 0; i < (tmap->getNamesOfGroups()).size(); i++) { - groupNodeInfo[(tmap->getNamesOfGroups())[i]].resize(0); - } + vector namesOfGroups = ct->getNamesOfGroups(); + for (int i = 0; i < namesOfGroups.size(); i++) { groupNodeInfo[namesOfGroups[i]].resize(0); } //initialize tree with correct number of nodes, name and group info. for (int i = 0; i < numNodes; i++) { @@ -109,18 +123,34 @@ Tree::Tree(TreeMap* t, vector< vector >& sims) : tmap(t) { tree[i].setName(m->Treenames[i]); //save group info - string group = tmap->getGroup(m->Treenames[i]); - - vector tempGroups; tempGroups.push_back(group); - tree[i].setGroup(tempGroups); - groupNodeInfo[group].push_back(i); - - //set pcount and pGroup for groupname to 1. 
- tree[i].pcount[group] = 1; - tree[i].pGroups[group] = 1; - - //Treemap knows name, group and index to speed up search - tmap->setIndex(m->Treenames[i], i); + int maxPars = 1; + vector group; + vector counts = ct->getGroupCounts(m->Treenames[i]); + for (int j = 0; j < namesOfGroups.size(); j++) { + if (counts[j] != 0) { //you have seqs from this group + groupNodeInfo[namesOfGroups[j]].push_back(i); + group.push_back(namesOfGroups[j]); + tree[i].pGroups[namesOfGroups[j]] = counts[j]; + tree[i].pcount[namesOfGroups[j]] = counts[j]; + //keep highest group + if(counts[j] > maxPars){ maxPars = counts[j]; } + } + } + tree[i].setGroup(group); + setIndex(m->Treenames[i], i); + + if (maxPars > 1) { //then we have some more dominant groups + //erase all the groups that are less than maxPars because you found a more dominant group. + for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();){ + if(it->second < maxPars){ + tree[i].pGroups.erase(it++); + }else { it++; } + } + //set one remaining groups to 1 + for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();it++){ + tree[i].pGroups[it->first] = 1; + } + }//end if //intialize non leaf nodes }else if (i > (numLeaves-1)) { @@ -129,11 +159,12 @@ Tree::Tree(TreeMap* t, vector< vector >& sims) : tmap(t) { tree[i].setGroup(tempGroups); } } + //build tree from matrix //initialize indexes - map indexes; //maps row in simMatrix to vector index in the tree - for (int g = 0; g < numLeaves; g++) { indexes[g] = g; } + map thisIndexes; //maps row in simMatrix to vector index in the tree + for (int g = 0; g < numLeaves; g++) { thisIndexes[g] = g; } //do merges and create tree structure by setting parents and children //there are numGroups - 1 merges to do @@ -152,26 +183,26 @@ Tree::Tree(TreeMap* t, vector< vector >& sims) : tmap(t) { //set non-leaf node info and update leaves to know their parents //non-leaf - tree[numLeaves + i].setChildren(indexes[row], indexes[column]); + tree[numLeaves + i].setChildren(thisIndexes[row], 
thisIndexes[column]); //parents - tree[indexes[row]].setParent(numLeaves + i); - tree[indexes[column]].setParent(numLeaves + i); + tree[thisIndexes[row]].setParent(numLeaves + i); + tree[thisIndexes[column]].setParent(numLeaves + i); //blength = distance / 2; float blength = ((1.0 - largest) / 2); //branchlengths - tree[indexes[row]].setBranchLength(blength - tree[indexes[row]].getLengthToLeaves()); - tree[indexes[column]].setBranchLength(blength - tree[indexes[column]].getLengthToLeaves()); + tree[thisIndexes[row]].setBranchLength(blength - tree[thisIndexes[row]].getLengthToLeaves()); + tree[thisIndexes[column]].setBranchLength(blength - tree[thisIndexes[column]].getLengthToLeaves()); //set your length to leaves to your childs length plus branchlength - tree[numLeaves + i].setLengthToLeaves(tree[indexes[row]].getLengthToLeaves() + tree[indexes[row]].getBranchLength()); + tree[numLeaves + i].setLengthToLeaves(tree[thisIndexes[row]].getLengthToLeaves() + tree[thisIndexes[row]].getBranchLength()); //update index - indexes[row] = numLeaves+i; - indexes[column] = numLeaves+i; + thisIndexes[row] = numLeaves+i; + thisIndexes[column] = numLeaves+i; //remove highest value that caused the merge. sims[row][column] = -1000.0; @@ -200,7 +231,7 @@ Tree::Tree(TreeMap* t, vector< vector >& sims) : tmap(t) { } /*****************************************************************/ Tree::~Tree() {} -/*****************************************************************/ +/***************************************************************** void Tree::addNamesToCounts(map nameMap) { try { //ex. seq1 seq2,seq3,se4 @@ -297,15 +328,15 @@ void Tree::addNamesToCounts(map nameMap) { m->errorOut(e, "Tree", "addNamesToCounts"); exit(1); } -} +}*/ /*****************************************************************/ int Tree::getIndex(string searchName) { try { - //Treemap knows name, group and index to speed up search - // getIndex function will return the vector index or -1 if seq is not found. 
- int index = tmap->getIndex(searchName); - return index; - + map::iterator itIndex = indexes.find(searchName); + if (itIndex != indexes.end()) { + return itIndex->second; + } + return -1; } catch(exception& e) { m->errorOut(e, "Tree", "getIndex"); @@ -316,8 +347,10 @@ int Tree::getIndex(string searchName) { void Tree::setIndex(string searchName, int index) { try { - //set index in treemap - tmap->setIndex(searchName, index); + map::iterator itIndex = indexes.find(searchName); + if (itIndex == indexes.end()) { + indexes[searchName] = index; + } } catch(exception& e) { m->errorOut(e, "Tree", "setIndex"); @@ -325,14 +358,8 @@ void Tree::setIndex(string searchName, int index) { } } /*****************************************************************/ -int Tree::assembleTree(map nameMap) { - try { - //save for later - names = nameMap; - - //if user has given a names file we want to include that info in the pgroups and pcount info. - if(nameMap.size() != 0) { addNamesToCounts(nameMap); } - +int Tree::assembleTree() { + try { //build the pGroups in non leaf nodes to be used in the parsimony calcs. for (int i = numLeaves; i < numNodes; i++) { if (m->control_pressed) { return 1; } @@ -348,66 +375,66 @@ int Tree::assembleTree(map nameMap) { exit(1); } } -/***************************************************************** -int Tree::assembleTree(string n) { - try { - - //build the pGroups in non leaf nodes to be used in the parsimony calcs. 
- for (int i = numLeaves; i < numNodes; i++) { - if (m->control_pressed) { return 1; } - - tree[i].pGroups = (mergeGroups(i)); - tree[i].pcount = (mergeGcounts(i)); - } - //float B = clock(); - //cout << "assembleTree\t" << (B-A) / CLOCKS_PER_SEC << endl; - return 0; - } - catch(exception& e) { - m->errorOut(e, "Tree", "assembleTree"); - exit(1); - } -} /*****************************************************************/ //assumes leaf node names are in groups and no names file - used by indicator command void Tree::getSubTree(Tree* Ctree, vector Groups) { try { //copy Tree since we are going to destroy it - Tree* copy = new Tree(tmap); + Tree* copy = new Tree(ct); copy->getCopy(Ctree); - map empty; - copy->assembleTree(empty); + copy->assembleTree(); //we want to select some of the leaf nodes to create the output tree //go through the input Tree starting at parents of leaves + //initialize groupNodeInfo + vector namesOfGroups = ct->getNamesOfGroups(); + for (int i = 0; i < namesOfGroups.size(); i++) { groupNodeInfo[namesOfGroups[i]].resize(0); } + + //initialize tree with correct number of nodes, name and group info. for (int i = 0; i < numNodes; i++) { - //initialize leaf nodes if (i <= (numLeaves-1)) { tree[i].setName(Groups[i]); //save group info - string group = tmap->getGroup(Groups[i]); - vector tempGroups; tempGroups.push_back(group); - tree[i].setGroup(tempGroups); - groupNodeInfo[group].push_back(i); - - //set pcount and pGroup for groupname to 1. 
- tree[i].pcount[group] = 1; - tree[i].pGroups[group] = 1; - - //Treemap knows name, group and index to speed up search - tmap->setIndex(Groups[i], i); - - //intialize non leaf nodes + int maxPars = 1; + vector group; + vector counts = ct->getGroupCounts(Groups[i]); + for (int j = 0; j < namesOfGroups.size(); j++) { + if (counts[j] != 0) { //you have seqs from this group + groupNodeInfo[namesOfGroups[j]].push_back(i); + group.push_back(namesOfGroups[j]); + tree[i].pGroups[namesOfGroups[j]] = counts[j]; + tree[i].pcount[namesOfGroups[j]] = counts[j]; + //keep highest group + if(counts[j] > maxPars){ maxPars = counts[j]; } + } + } + tree[i].setGroup(group); + setIndex(Groups[i], i); + + if (maxPars > 1) { //then we have some more dominant groups + //erase all the groups that are less than maxPars because you found a more dominant group. + for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();){ + if(it->second < maxPars){ + tree[i].pGroups.erase(it++); + }else { it++; } + } + //set one remaining groups to 1 + for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();it++){ + tree[i].pGroups[it->first] = 1; + } + }//end if + + //intialize non leaf nodes }else if (i > (numLeaves-1)) { tree[i].setName(""); vector tempGroups; tree[i].setGroup(tempGroups); } } - + set removedLeaves; for (int i = 0; i < copy->getNumLeaves(); i++) { @@ -534,7 +561,7 @@ void Tree::getSubTree(Tree* Ctree, vector Groups) { exit(1); } } -/*****************************************************************/ +/***************************************************************** //assumes nameMap contains unique names as key or is empty. //assumes numLeaves defined in tree constructor equals size of seqsToInclude and seqsToInclude only contains unique seqs. 
int Tree::getSubTree(Tree* copy, vector seqsToInclude, map nameMap) { @@ -578,7 +605,7 @@ int Tree::populateNewTree(vector& oldtree, int node, int& index) { return (index++); }else { //you are a leaf - int indexInNewTree = tmap->getIndex(oldtree[node].getName()); + int indexInNewTree = getIndex(oldtree[node].getName()); return indexInNewTree; } } @@ -588,7 +615,7 @@ int Tree::populateNewTree(vector& oldtree, int node, int& index) { } } /*****************************************************************/ -void Tree::getCopy(Tree* copy, map nameMap) { +void Tree::getCopy(Tree* copy, bool subsample) { try { //for each node in the tree copy its info @@ -602,8 +629,6 @@ void Tree::getCopy(Tree* copy, map nameMap) { //copy children tree[i].setChildren(copy->tree[i].getLChild(), copy->tree[i].getRChild()); } - - if (nameMap.size() != 0) { addNamesToCounts(nameMap); } //build the pGroups in non leaf nodes to be used in the parsimony calcs. for (int i = numLeaves; i < numNodes; i++) { @@ -640,8 +665,8 @@ void Tree::getCopy(Tree* copy) { tree[i].setChildren(copy->tree[i].getLChild(), copy->tree[i].getRChild()); //copy index in node and tmap + setIndex(copy->tree[i].getName(), getIndex(copy->tree[i].getName())); tree[i].setIndex(copy->tree[i].getIndex()); - setIndex(copy->tree[i].getName(), getIndex(copy->tree[i].getName())); //copy pGroups tree[i].pGroups = copy->tree[i].pGroups; @@ -805,8 +830,8 @@ void Tree::randomLabels(vector g) { try { //initialize groupNodeInfo - for (int i = 0; i < (tmap->getNamesOfGroups()).size(); i++) { - groupNodeInfo[(tmap->getNamesOfGroups())[i]].resize(0); + for (int i = 0; i < (ct->getNamesOfGroups()).size(); i++) { + groupNodeInfo[(ct->getNamesOfGroups())[i]].resize(0); } for(int i = 0; i < numLeaves; i++){ @@ -868,23 +893,20 @@ void Tree::randomBlengths() { /*************************************************************************************************/ void Tree::assembleRandomUnifracTree(vector g) { randomLabels(g); - map empty; - 
assembleTree(empty); + assembleTree(); } /*************************************************************************************************/ void Tree::assembleRandomUnifracTree(string groupA, string groupB) { vector temp; temp.push_back(groupA); temp.push_back(groupB); randomLabels(temp); - map empty; - assembleTree(empty); + assembleTree(); } /*************************************************************************************************/ //for now it's just random topology but may become random labels as well later that why this is such a simple function now... void Tree::assembleRandomTree() { randomTopology(); - map empty; - assembleTree(empty); + assembleTree(); } /**************************************************************************************************/ @@ -1103,16 +1125,16 @@ void Tree::printBranch(int node, ostream& out, string mode) { } } }else { //you are a leaf - string leafGroup = tmap->getGroup(tree[node].getName()); + vector leafGroup = ct->getGroups(tree[node].getName()); if (mode == "branch") { - out << leafGroup; + out << leafGroup[0]; //if there is a branch length then print it if (tree[node].getBranchLength() != -1) { out << ":" << tree[node].getBranchLength(); } }else if (mode == "boot") { - out << leafGroup; + out << leafGroup[0]; //if there is a label then print it if (tree[node].getLabel() != -1) { out << tree[node].getLabel(); @@ -1166,16 +1188,16 @@ void Tree::printBranch(int node, ostream& out, string mode, vector& theseN } } }else { //you are a leaf - string leafGroup = tmap->getGroup(theseNodes[node].getName()); + vector leafGroup = ct->getGroups(theseNodes[node].getName()); if (mode == "branch") { - out << leafGroup; + out << leafGroup[0]; //if there is a branch length then print it if (theseNodes[node].getBranchLength() != -1) { out << ":" << theseNodes[node].getBranchLength(); } }else if (mode == "boot") { - out << leafGroup; + out << leafGroup[0]; //if there is a label then print it if (theseNodes[node].getLabel() != -1) 
{ out << theseNodes[node].getLabel(); diff --git a/tree.h b/tree.h index 03da5f6..88e49c0 100644 --- a/tree.h +++ b/tree.h @@ -11,22 +11,22 @@ */ #include "treenode.h" -#include "treemap.h" +#include "counttable.h" /* This class represents the treefile. */ class Tree { public: Tree(string); //do not use tree generated by this constructor its just to extract the treenames, its a chicken before the egg thing that needs to be revisited. - Tree(int, TreeMap*); - Tree(TreeMap*); //to generate a tree from a file - Tree(TreeMap*, vector< vector >&); //create tree from sim matrix + Tree(int, CountTable*); + Tree(CountTable*); //to generate a tree from a file + Tree(CountTable*, vector< vector >&); //create tree from sim matrix ~Tree(); - TreeMap* getTreeMap() { return tmap; } + CountTable* getCountTable() { return ct; } void getCopy(Tree*); //makes tree a copy of the one passed in. - void getCopy(Tree* copy, map); //makes a copy of the tree structure passed in, (just parents, children and br). Used with the Tree(TreeMap*) constructor. Assumes the tmap already has set seqs groups you want. Used by subsample to reassign seqs you don't want included to group "doNotIncludeMe". + void getCopy(Tree* copy, bool); //makes a copy of the tree structure passed in, (just parents, children and br). Used with the Tree(TreeMap*) constructor. Assumes the tmap already has set seqs groups you want. Used by subsample to reassign seqs you don't want included to group "doNotIncludeMe". void getSubTree(Tree*, vector); //makes tree a that contains only the names passed in. - int getSubTree(Tree* originalToCopy, vector seqToInclude, map nameMap); //used with (int, TreeMap) constructor. SeqsToInclude contains subsample wanted - assumes these are unique seqs and size of vector=numLeaves passed into constructor. nameMap is unique -> redundantList can be empty if no namesfile was provided. 
+ //int getSubTree(Tree* originalToCopy, vector seqToInclude, map nameMap); //used with (int, TreeMap) constructor. SeqsToInclude contains subsample wanted - assumes these are unique seqs and size of vector=numLeaves passed into constructor. nameMap is unique -> redundantList can be empty if no namesfile was provided. void assembleRandomTree(); void assembleRandomUnifracTree(vector); @@ -45,21 +45,22 @@ public: int findRoot(); //return index of root node //this function takes the leaf info and populates the non leaf nodes - int assembleTree(map); + int assembleTree(); vector tree; //the first n nodes are the leaves, where n is the number of sequences. map< string, vector > groupNodeInfo; //maps group to indexes of leaf nodes with that group, different groups may contain same node because of names file. private: - TreeMap* tmap; + CountTable* ct; int numNodes, numLeaves; ofstream out; string filename; - map names; + //map names; map::iterator it, it2; map mergeGroups(int); //returns a map with a groupname and the number of times that group was seen in the children map mergeGcounts(int); + map indexes; //maps seqName -> index in tree vector void addNamesToCounts(map); void randomTopology(); diff --git a/treegroupscommand.cpp b/treegroupscommand.cpp index 6633e51..bba6289 100644 --- a/treegroupscommand.cpp +++ b/treegroupscommand.cpp @@ -16,8 +16,9 @@ vector TreeGroupCommand::setParameters(){ try { CommandParameter pshared("shared", "InputTypes", "", "", "PhylipColumnShared", "PhylipColumnShared", "none",false,false); parameters.push_back(pshared); CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnShared", "PhylipColumnShared", "none",false,false); parameters.push_back(pphylip); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname); - CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnShared", "PhylipColumnShared", "ColumnName",false,false); 
parameters.push_back(pcolumn); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "countcolumn",false,false); parameters.push_back(pcount); + CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnShared", "PhylipColumnShared", "ColumnName-countcolumn",false,false); parameters.push_back(pcolumn); CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters); CommandParameter psubsample("subsample", "String", "", "", "", "", "",false,false); parameters.push_back(psubsample); CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff); @@ -160,6 +161,14 @@ TreeGroupCommand::TreeGroupCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } //check for required parameters @@ -182,6 +191,11 @@ TreeGroupCommand::TreeGroupCommand(string option) { if (namefile == "not open") { abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; countfile = ""; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } if ((phylipfile == "") && (columnfile == "") && (sharedfile == "")) { //is there are current file available for either of these? 
@@ -204,15 +218,20 @@ TreeGroupCommand::TreeGroupCommand(string option) { else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When running the tree.shared command with a distance file you may not use both the column and the phylip parameters."); m->mothurOutEndLine(); abort = true; } if (columnfile != "") { - if (namefile == "") { + if ((namefile == "") && (countfile == "")){ namefile = m->getNameFile(); if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); } else { - m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); - abort = true; + countfile = m->getCountTableFile(); + if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); } + else { + m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine(); + abort = true; + } } } } + //check for optional parameter and set defaults // ...at some point should added some additional type checking... 
@@ -287,7 +306,7 @@ TreeGroupCommand::~TreeGroupCommand(){ if (abort == false) { if (format == "sharedfile") { delete input; } else { delete list; } - delete tmap; + delete ct; } } @@ -400,8 +419,16 @@ int TreeGroupCommand::execute(){ m->runParse = false; //create treemap class from groupmap for tree class to use - tmap = new TreeMap(); - tmap->makeSim(m->getAllGroups()); + ct = new CountTable(); + set nameMap; + map groupMap; + set gps; + for (int i = 0; i < m->getAllGroups().size(); i++) { + nameMap.insert(m->getAllGroups()[i]); + gps.insert(m->getAllGroups()[i]); + groupMap[m->getAllGroups()[i]] = m->getAllGroups()[i]; + } + ct->createTable(nameMap, groupMap, gps); //clear globaldatas old tree names if any m->Treenames.clear(); @@ -425,31 +452,40 @@ int TreeGroupCommand::execute(){ readMatrix->setCutoff(cutoff); - if(namefile != ""){ - nameMap = new NameAssignment(namefile); - nameMap->readMap(); - } - else{ - nameMap = NULL; - } - - readMatrix->read(nameMap); + ct = NULL; + if(namefile != ""){ + nameMap = new NameAssignment(namefile); + nameMap->readMap(); + readMatrix->read(nameMap); + }else if (countfile != "") { + ct = new CountTable(); + ct->readTable(countfile); + readMatrix->read(ct); + } + list = readMatrix->getListVector(); SparseDistanceMatrix* dMatrix = readMatrix->getDMatrix(); //make treemap - tmap = new TreeMap(); - - if (m->control_pressed) { return 0; } - - tmap->makeSim(list); + if (ct != NULL) { delete ct; } + ct = new CountTable(); + set nameMap; + map groupMap; + set gps; + for (int i = 0; i < list->getNumBins(); i++) { + string bin = list->get(i); + nameMap.insert(bin); + gps.insert(bin); + groupMap[bin] = bin; + } + ct->createTable(nameMap, groupMap, gps); - vector namesGroups = tmap->getNamesOfGroups(); + vector namesGroups = ct->getNamesOfGroups(); m->setGroups(namesGroups); //clear globaldatas old tree names if any m->Treenames.clear(); - + //fills globaldatas tree names m->Treenames = m->getGroups(); @@ -505,13 +541,12 @@ int 
TreeGroupCommand::execute(){ Tree* TreeGroupCommand::createTree(vector< vector >& simMatrix){ try { //create tree - t = new Tree(tmap, simMatrix); + t = new Tree(ct, simMatrix); if (m->control_pressed) { delete t; t = NULL; return t; } //assemble tree - map empty; - t->assembleTree(empty); + t->assembleTree(); return t; } diff --git a/treegroupscommand.h b/treegroupscommand.h index b0ae730..b29670a 100644 --- a/treegroupscommand.h +++ b/treegroupscommand.h @@ -15,11 +15,10 @@ #include "groupmap.h" #include "validcalculator.h" #include "tree.h" -#include "treemap.h" +#include "counttable.h" #include "readmatrix.hpp" #include "readcolumn.h" #include "readphylip.h" -#include "sparsematrix.hpp" #include "sharedsobscollectsummary.h" #include "sharedchao1.h" #include "sharedace.h" @@ -69,8 +68,6 @@ They can also use as many or as few calculators as they wish. */ -typedef list::iterator MatData; - class TreeGroupCommand : public Command { public: @@ -107,13 +104,13 @@ private: NameAssignment* nameMap; ListVector* list; - TreeMap* tmap; + CountTable* ct; Tree* t; InputData* input; vector treeCalculators; vector lookup; string lastLabel; - string format, groupNames, filename, sharedfile, inputfile; + string format, groupNames, filename, sharedfile, countfile, inputfile; int numGroups, subsampleSize, iters, processors; ofstream out; float precision, cutoff; diff --git a/treemap.cpp b/treemap.cpp index 42ec336..47b7cf3 100644 --- a/treemap.cpp +++ b/treemap.cpp @@ -13,6 +13,9 @@ TreeMap::TreeMap(string filename) { m = MothurOut::getInstance(); + ofstream out2; + m->openOutputFileAppend(filename, out2); + out2 << endl; out2.close(); groupFileName = filename; m->openInputFile(filename, fileHandle); } @@ -22,6 +25,10 @@ /************************************************************/ int TreeMap::readMap(string gf) { try { + ofstream out2; + m->openOutputFileAppend(gf, out2); + out2 << endl; out2.close(); + groupFileName = gf; m->openInputFile(gf, fileHandle); @@ -65,6 +72,34 @@ 
int TreeMap::readMap(string gf) { } fileHandle.close(); + if (rest != "") { + vector pieces = m->splitWhiteSpace(rest); + + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { seqName = pieces[i]; columnOne=false; } + else { seqGroup = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { + setNamesOfGroups(seqGroup); + + map::iterator itCheck = treemap.find(seqName); + if (itCheck != treemap.end()) { error = 1; m->mothurOut("[WARNING]: Your groupfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); } + else { + namesOfSeqs.push_back(seqName); + treemap[seqName].groupname = seqGroup; //store data in map + + it2 = seqsPerGroup.find(seqGroup); + if (it2 == seqsPerGroup.end()) { //if it's a new group + seqsPerGroup[seqGroup] = 1; + }else {//it's a group we already have + seqsPerGroup[seqGroup]++; + } + } + pairDone = false; + } + } + } + return error; } catch(exception& e) { @@ -116,6 +151,34 @@ int TreeMap::readMap() { } fileHandle.close(); + if (rest != "") { + vector pieces = m->splitWhiteSpace(rest); + + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { seqName = pieces[i]; columnOne=false; } + else { seqGroup = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { + setNamesOfGroups(seqGroup); + + map::iterator itCheck = treemap.find(seqName); + if (itCheck != treemap.end()) { error = 1; m->mothurOut("[WARNING]: Your groupfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. 
Please correct."); m->mothurOutEndLine(); } + else { + namesOfSeqs.push_back(seqName); + treemap[seqName].groupname = seqGroup; //store data in map + + it2 = seqsPerGroup.find(seqGroup); + if (it2 == seqsPerGroup.end()) { //if it's a new group + seqsPerGroup[seqGroup] = 1; + }else {//it's a group we already have + seqsPerGroup[seqGroup]++; + } + } + pairDone = false; + } + } + } + return error; } catch(exception& e) { @@ -183,26 +246,6 @@ string TreeMap::getGroup(string sequenceName) { return "not found"; } -} -/************************************************************/ -void TreeMap::setIndex(string seq, int index) { - it = treemap.find(seq); - if (it != treemap.end()) { //sequence name was in group file - treemap[seq].vectorIndex = index; - }else { - treemap[seq].vectorIndex = index; - treemap[seq].groupname = "not found"; - } -} -/************************************************************/ -int TreeMap::getIndex(string seq) { - - it = treemap.find(seq); - // if it is a valid sequence name then return index - if (it != treemap.end()) { return treemap[seq].vectorIndex; } - // if not return error code - else { return -1; } - } /************************************************************/ diff --git a/treemap.h b/treemap.h index 57822e0..7ffd1e7 100644 --- a/treemap.h +++ b/treemap.h @@ -29,8 +29,8 @@ public: int readMap(string); int getNumGroups(); int getNumSeqs(); - void setIndex(string, int); //sequencename, index - int getIndex(string); //returns vector index of sequence + //void setIndex(string, int); //sequencename, index + //int getIndex(string); //returns vector index of sequence bool isValidGroup(string); //return true if string is a valid group void removeSeq(string); //removes a sequence, this is to accomadate trees that do not contain all the seqs in your groupfile string getGroup(string); diff --git a/treereader.cpp b/treereader.cpp index b385d21..0e25f12 100644 --- a/treereader.cpp +++ b/treereader.cpp @@ -8,12 +8,23 @@ #include "treereader.h" 
#include "readtree.h" +#include "groupmap.h" /***********************************************************************/ - -TreeReader::TreeReader(string tf) : treefile(tf) { +TreeReader::TreeReader(string tf, string cf) : treefile(tf), countfile(cf) { try { m = MothurOut::getInstance(); + ct = new CountTable(); + ct->readTable(cf); + + //if no groupinfo in count file we need to add it + if (!ct->hasGroupInfo()) { + ct->addGroup("Group1"); + vector namesOfSeqs = ct->getNamesOfSeqs(); + for (int i = 0; i < namesOfSeqs.size(); i++) { + ct->setAbund(namesOfSeqs[i], "Group1", ct->getNumSeqs(namesOfSeqs[i])); + } + } namefile = ""; groupfile = ""; readTrees(); @@ -24,22 +35,32 @@ TreeReader::TreeReader(string tf) : treefile(tf) { } } /***********************************************************************/ - -TreeReader::TreeReader(string tf, string gf) : treefile(tf), groupfile(gf) { - try { - m = MothurOut::getInstance(); - namefile = ""; - readTrees(); - } - catch(exception& e) { - m->errorOut(e, "TreeReader", "TreeReader"); - exit(1); - } -} -/***********************************************************************/ TreeReader::TreeReader(string tf, string gf, string nf) : treefile(tf), groupfile(gf), namefile(nf) { try { m = MothurOut::getInstance(); + countfile = ""; + ct = new CountTable(); + if (namefile != "") { ct->createTable(namefile, groupfile, true); } + else { + Tree* tree = new Tree(treefile); delete tree; //extracts names from tree to make faked out groupmap + set nameMap; + map groupMap; + set gps; + for (int i = 0; i < m->Treenames.size(); i++) { nameMap.insert(m->Treenames[i]); } + if (groupfile == "") { gps.insert("Group1"); for (int i = 0; i < m->Treenames.size(); i++) { groupMap[m->Treenames[i]] = "Group1"; } } + else { + GroupMap g(groupfile); + g.readMap(); + vector seqs = g.getNamesSeqs(); + for (int i = 0; i < seqs.size(); i++) { + string group = g.getGroup(seqs[i]); + groupMap[seqs[i]] = group; + gps.insert(group); + } + } + 
ct->createTable(nameMap, groupMap, gps); + } + readTrees(); } catch(exception& e) { @@ -51,22 +72,15 @@ TreeReader::TreeReader(string tf, string gf, string nf) : treefile(tf), groupfi bool TreeReader::readTrees() { try { - tmap = new TreeMap(); - if (groupfile != "") { tmap->readMap(groupfile); } - else{ //fake out by putting everyone in one group - Tree* tree = new Tree(treefile); delete tree; //extracts names from tree to make faked out groupmap - for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); } - } - - int numUniquesInName = 0; - if (namefile != "") { numUniquesInName = readNamesFile(); } + int numUniquesInName = ct->getNumUniqueSeqs(); + //if (namefile != "") { numUniquesInName = readNamesFile(); } ReadTree* read = new ReadNewickTree(treefile); - int readOk = read->read(tmap); + int readOk = read->read(ct); if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete read; m->control_pressed=true; return 0; } - read->AssembleTrees(names); + read->AssembleTrees(); trees = read->getTrees(); delete read; @@ -74,18 +88,19 @@ bool TreeReader::readTrees() { //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size. int numNamesInTree; if (namefile != "") { - if (numUniquesInName == m->Treenames.size()) { numNamesInTree = nameMap.size(); } + if (numUniquesInName == m->Treenames.size()) { numNamesInTree = ct->getNumSeqs(); } else { numNamesInTree = m->Treenames.size(); } }else { numNamesInTree = m->Treenames.size(); } //output any names that are in group file but not in tree - if (numNamesInTree < tmap->getNumSeqs()) { - for (int i = 0; i < tmap->namesOfSeqs.size(); i++) { + if (numNamesInTree < ct->getNumSeqs()) { + vector namesSeqsCt = ct->getNamesOfSeqs(); + for (int i = 0; i < namesSeqsCt.size(); i++) { //is that name in the tree? 
int count = 0; for (int j = 0; j < m->Treenames.size(); j++) { - if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it + if (namesSeqsCt[i] == m->Treenames[j]) { break; } //found it count++; } @@ -93,14 +108,8 @@ bool TreeReader::readTrees() { //then you did not find it so report it if (count == m->Treenames.size()) { - //if it is in your namefile then don't remove - map::iterator it = nameMap.find(tmap->namesOfSeqs[i]); - - if (it == nameMap.end()) { - m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine(); - tmap->removeSeq(tmap->namesOfSeqs[i]); - i--; //need this because removeSeq removes name from namesOfSeqs - } + m->mothurOut(namesSeqsCt[i] + " is in your name or group file and not in your tree. It will be disregarded."); m->mothurOutEndLine(); + ct->remove(namesSeqsCt[i]); } } } @@ -112,47 +121,6 @@ bool TreeReader::readTrees() { exit(1); } } -/*****************************************************************/ -int TreeReader::readNamesFile() { - try { - nameMap.clear(); - names.clear(); - int numUniquesInName = 0; - - ifstream in; - m->openInputFile(namefile, in); - - string first, second; - map::iterator itNames; - - while(!in.eof()) { - in >> first >> second; m->gobble(in); - - numUniquesInName++; - - itNames = nameMap.find(first); - if (itNames == nameMap.end()) { - names[first] = second; - - //we need a list of names in your namefile to use above when removing extra seqs above so we don't remove them - vector dupNames; - m->splitAtComma(second, dupNames); - - for (int i = 0; i < dupNames.size(); i++) { - nameMap[dupNames[i]] = first; - if ((groupfile == "") && (i != 0)) { tmap->addSeq(dupNames[i], "Group1"); } - } - }else { m->mothurOut(first + " has already been seen in namefile, disregarding names file."); m->mothurOutEndLine(); in.close(); nameMap.clear(); names.clear(); namefile = ""; return 1; } - } - in.close(); - - return numUniquesInName; - } - 
catch(exception& e) { - m->errorOut(e, "TreeReader", "readNamesFile"); - exit(1); - } -} /***********************************************************************/ diff --git a/treereader.h b/treereader.h index fb9c791..ac24eb0 100644 --- a/treereader.h +++ b/treereader.h @@ -11,29 +11,26 @@ #include "mothurout.h" #include "tree.h" +#include "counttable.h" class TreeReader { public: - TreeReader(string tf); - TreeReader(string tf, string gf); + TreeReader(string tf, string cf); TreeReader(string tf, string gf, string nf); ~TreeReader() {} vector getTrees() { return trees; } - map getNames() { return nameMap; } //dups -> unique - map getNameMap() { return names; } //unique -> dups list - private: MothurOut* m; vector trees; - TreeMap* tmap; - map nameMap; //dupName -> uniqueName - map names; + CountTable* ct; + //map nameMap; //dupName -> uniqueName + // map names; - string treefile, groupfile, namefile; + string treefile, groupfile, namefile, countfile; bool readTrees(); int readNamesFile(); diff --git a/trimflowscommand.cpp b/trimflowscommand.cpp index d45f20c..296a6fe 100644 --- a/trimflowscommand.cpp +++ b/trimflowscommand.cpp @@ -28,7 +28,7 @@ vector TrimFlowsCommand::setParameters(){ CommandParameter psignal("signal", "Number", "", "0.50", "", "", "",false,false); parameters.push_back(psignal); CommandParameter pnoise("noise", "Number", "", "0.70", "", "", "",false,false); parameters.push_back(pnoise); CommandParameter pallfiles("allfiles", "Boolean", "", "t", "", "", "",false,false); parameters.push_back(pallfiles); - CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder); + CommandParameter porder("order", "String", "", "TACG", "", "", "",false,false); parameters.push_back(porder); CommandParameter pfasta("fasta", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pfasta); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); 
CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -347,7 +347,7 @@ int TrimFlowsCommand::execute(){ output.close(); } - outputTypes["flow.files"].push_back(flowFilesFileName); + outputTypes["file"].push_back(flowFilesFileName); outputNames.push_back(flowFilesFileName); // set fasta file as new current fastafile @@ -423,11 +423,9 @@ int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileN string trashCode = ""; flowData.getNext(flowFile); - //cout << "driver good bit " << flowFile.good() << endl; flowData.capFlows(maxFlows); Sequence currSeq = flowData.getSequence(); - if(!flowData.hasMinFlows(minFlows)){ //screen to see if sequence is of a minimum number of flows success = 0; trashCode += 'l'; @@ -443,6 +441,8 @@ int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileN } + if (m->debug) { m->mothurOut("[DEBUG]: " + currSeq.getName() + " " + currSeq.getUnaligned() + "\n"); } + if(barcodes.size() != 0){ success = trimOligos.stripBarcode(currSeq, barcodeIndex); if(success > bdiffs) { trashCode += 'b'; } diff --git a/trimoligos.cpp b/trimoligos.cpp index 2f92cc8..8f4cbe9 100644 --- a/trimoligos.cpp +++ b/trimoligos.cpp @@ -14,7 +14,7 @@ /********************************************************************/ //strip, pdiffs, bdiffs, primers, barcodes, revPrimers -TrimOligos::TrimOligos(int p, int b, int l, int s, map pr, map br, map rbr, vector r, vector lk, vector sp){ +TrimOligos::TrimOligos(int p, int b, int l, int s, map pr, map br, vector r, vector lk, vector sp){ try { m = MothurOut::getInstance(); @@ -24,7 +24,6 @@ TrimOligos::TrimOligos(int p, int b, int l, int s, map pr, map pr, map pr, map br, vector r, vector lk, vector sp){ +TrimOligos::TrimOligos(int p, int b, int l, int s, map pr, map br, vector lk, vector sp){ try { m = MothurOut::getInstance(); @@ -46,9 +45,8 @@ TrimOligos::TrimOligos(int p, int b, int l, int s, map pr, maperrorOut(e, 
"TrimOligos", "stripBarcode"); exit(1); } +} +//*******************************************************************/ +int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, QualityScores& forwardQual, QualityScores& reverseQual, int& group){ + try { + //look for forward barcode + string rawFSequence = forwardSeq.getUnaligned(); + string rawRSequence = reverseSeq.getUnaligned(); + int success = bdiffs + 1; //guilty until proven innocent + + //can you find the forward barcode + for(map::iterator it=ibarcodes.begin();it!=ibarcodes.end();it++){ + string foligo = it->second.forward; + string roligo = it->second.reverse; + + if(rawFSequence.length() < foligo.length()){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } + if(rawRSequence.length() < roligo.length()){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } + + if((compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length())))) { + group = it->first; + forwardSeq.setUnaligned(rawFSequence.substr(foligo.length())); + reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-roligo.length()))); + forwardQual.trimQScores(foligo.length(), -1); + reverseQual.trimQScores(-1, rawRSequence.length()-roligo.length()); + success = 0; + break; + } + } + + //if you found the barcode or if you don't want to allow for diffs + if ((bdiffs == 0) || (success == 0)) { return success; } + else { //try aligning and see if you can find it + + //look for forward + int maxLength = 0; + + Alignment* alignment; + if (ibarcodes.size() > 0) { + for(map::iterator it=ibarcodes.begin();it!=ibarcodes.end();it++){ + if(it->second.forward.length() > maxLength){ maxLength = it->second.forward.length(); } + } + 
alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1)); + }else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minFGroup = -1; + int minFPos = 0; + + for(map::iterator it=ibarcodes.begin();it!=ibarcodes.end();it++){ + string oligo = it->second.forward; + + if(rawFSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->align(oligo, rawFSequence.substr(0,oligo.length()+bdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + int numDiff = countDiffs(oligo, temp); + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minFGroup = it->first; + minFPos = 0; + for(int i=0;i bdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes + else{ + //check for reverse match + if (alignment != NULL) { delete alignment; } + maxLength = 0; + + if (ibarcodes.size() > 0) { + for(map::iterator it=ibarcodes.begin();it!=ibarcodes.end();it++){ + if(it->second.reverse.length() > maxLength){ maxLength = it->second.reverse.length(); } + } + alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1)); + }else{ alignment = NULL; } + + //can you find the barcode + minDiff = 1e6; + minCount = 1; + int minRGroup = -1; + int minRPos = 0; + + for(map::iterator it=ibarcodes.begin();it!=ibarcodes.end();it++){ + string oligo = it->second.reverse; + + if(rawRSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; + break; + } + + //use needleman to align first 
barcode.length()+numdiffs of sequence to each barcode + alignment->align(oligo, rawRSequence.substr((rawRSequence.length()-(oligo.length()+bdiffs)),oligo.length()+bdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + for(int i=0;ifirst; + minRPos = 0; + for(int i=0;i bdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes + else{ + //we have an acceptable match for the forward and reverse, but do they match? + if (minFGroup == minRGroup) { + group = minFGroup; + forwardSeq.setUnaligned(rawFSequence.substr(minFPos)); + reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-minRPos))); + forwardQual.trimQScores(minFPos, -1); + reverseQual.trimQScores(-1, rawRSequence.length()-minRPos); + success = minDiff; + }else { success = bdiffs + 100; } + } + } + + if (alignment != NULL) { delete alignment; } + } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripIBarcode"); + exit(1); + } } //*******************************************************************/ @@ -308,7 +477,7 @@ int TrimOligos::stripBarcode(Sequence& seq, int& group){ } } -//*******************************************************************/ +/******************************************************************* int TrimOligos::stripRBarcode(Sequence& seq, QualityScores& qual, int& group){ try { @@ -428,7 +597,7 @@ int TrimOligos::stripRBarcode(Sequence& seq, QualityScores& qual, int& group){ } } -//*******************************************************************/ +/******************************************************************* int TrimOligos::stripRBarcode(Sequence& seq, int& group){ try { diff --git a/trimoligos.h b/trimoligos.h index a32b3d8..fb8f74d 100644 --- a/trimoligos.h +++ b/trimoligos.h @@ -15,23 +15,30 @@ #include "sequence.hpp" #include "qualityscores.h" +struct oligosPair { 
+ string forward; + string reverse; + + oligosPair() { forward = ""; reverse = ""; } + oligosPair(string f, string r) : forward(f), reverse(r) {} + ~oligosPair() {} +}; class TrimOligos { public: TrimOligos(int,int, map, map, vector); //pdiffs, bdiffs, primers, barcodes, revPrimers - TrimOligos(int,int, int, int, map, map, map, vector, vector, vector); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimers, linker, spacer - TrimOligos(int,int, int, int, map, map, vector, vector, vector); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimers, linker, spacer + TrimOligos(int,int, int, int, map, map, vector, vector, vector); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimers, linker, spacer + TrimOligos(int,int, int, int, map, map, vector, vector); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, linker, spacer ~TrimOligos(); int stripBarcode(Sequence&, int&); int stripBarcode(Sequence&, QualityScores&, int&); - - int stripRBarcode(Sequence&, int&); - int stripRBarcode(Sequence&, QualityScores&, int&); - + int stripBarcode(Sequence&, Sequence&, QualityScores&, QualityScores&, int&); + int stripForward(Sequence&, int&); int stripForward(Sequence&, QualityScores&, int&, bool); + int stripForward(Sequence&, Sequence&, QualityScores&, QualityScores&, int&); bool stripReverse(Sequence&); bool stripReverse(Sequence&, QualityScores&); @@ -47,11 +54,12 @@ class TrimOligos { int pdiffs, bdiffs, ldiffs, sdiffs; map barcodes; - map rbarcodes; map primers; vector revPrimer; vector linker; vector spacer; + map ibarcodes; + map iprimers; MothurOut* m; diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index 0036769..0c21c89 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -11,13 +11,15 @@ #include "needlemanoverlap.hpp" #include "trimoligos.h" + //********************************************************************************************************************** vector TrimSeqsCommand::setParameters(){ try 
{ CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos); CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount); CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip); CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig); CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop); @@ -58,11 +60,12 @@ string TrimSeqsCommand::getHelpString(){ string helpString = ""; helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n"; helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n"; - helpString += "The trim.seqs command parameters are fasta, name, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n"; + helpString += "The trim.seqs command parameters are fasta, name, count, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n"; helpString += "The fasta parameter is required.\n"; helpString += "The flip parameter will output the 
reverse compliment of your trimmed sequence. The default is false.\n"; helpString += "The oligos parameter allows you to provide an oligos file.\n"; helpString += "The name parameter allows you to provide a names file with your fasta file.\n"; + helpString += "The count parameter allows you to provide a count file with your fasta file.\n"; helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n"; helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n"; helpString += "The minlength parameter allows you to set and minimum sequence length. \n"; @@ -111,6 +114,7 @@ string TrimSeqsCommand::getOutputFileNameTag(string type, string inputName=""){ else if (type == "fasta") { outputFileName = "fasta"; } else if (type == "group") { outputFileName = "groups"; } else if (type == "name") { outputFileName = "names"; } + else if (type == "count") { outputFileName = "count_table"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } return outputFileName; @@ -133,6 +137,7 @@ TrimSeqsCommand::TrimSeqsCommand(){ outputTypes["qfile"] = tempOutNames; outputTypes["group"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand"); @@ -171,6 +176,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { outputTypes["qfile"] = tempOutNames; outputTypes["group"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -208,6 +214,14 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. 
if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -279,6 +293,13 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { if (temp == "not found") { nameFile = ""; } else if(temp == "not open") { nameFile = ""; abort = true; } else { nameFile = temp; m->setNameFile(nameFile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; countfile = ""; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((countfile != "") && (nameFile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; } m->mothurConvert(temp, qThreshold); @@ -331,10 +352,12 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { abort = true; } - if (nameFile == "") { - vector files; files.push_back(fastaFile); - parser.getNameFile(files); - } + if (countfile == "") { + if (nameFile == "") { + vector files; files.push_back(fastaFile); + parser.getNameFile(files); + } + } } } @@ -385,13 +408,27 @@ int TrimSeqsCommand::execute(){ outputTypes["name"].push_back(trimNameFile); outputTypes["name"].push_back(scrapNameFile); } + + string trimCountFile = outputDir + m->getRootName(m->getSimpleName(countfile)) + "trim." + getOutputFileNameTag("count"); + string scrapCountFile = outputDir + m->getRootName(m->getSimpleName(countfile)) + "scrap." 
+ getOutputFileNameTag("count"); + + if (countfile != "") { + CountTable ct; + ct.readTable(countfile); + nameCount = ct.getNameMap(); + outputNames.push_back(trimCountFile); + outputNames.push_back(scrapCountFile); + outputTypes["count"].push_back(trimCountFile); + outputTypes["count"].push_back(scrapCountFile); + } + if (m->control_pressed) { return 0; } string outputGroupFileName; if(oligoFile != ""){ createGroup = getOligos(fastaFileNames, qualFileNames, nameFileNames); - if (createGroup) { + if ((createGroup) && (countfile == "")){ outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + getOutputFileNameTag("group"); outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName); } @@ -401,9 +438,9 @@ int TrimSeqsCommand::execute(){ setLines(fastaFile, qFileName); if(processors == 1){ - driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]); + driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, trimCountFile, scrapCountFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]); }else{ - createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames); + createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, trimCountFile, scrapCountFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames); } @@ -446,35 +483,62 @@ int TrimSeqsCommand::execute(){ for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } } outputNames = outputNames2; - for (it = 
uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) { - ifstream in; - m->openInputFile(it->first, in); - - ofstream out; - string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + getOutputFileNameTag("group"); - outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName); - m->openOutputFile(thisGroupName, out); - - while (!in.eof()){ - if (m->control_pressed) { break; } - - Sequence currSeq(in); m->gobble(in); - out << currSeq.getName() << '\t' << it->second << endl; + for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) { + ifstream in; + m->openInputFile(it->first, in); + + ofstream out; + string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)); + if (countfile == "") { thisGroupName += getOutputFileNameTag("group"); outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName); } + else { thisGroupName += getOutputFileNameTag("count"); outputNames.push_back(thisGroupName); outputTypes["count"].push_back(thisGroupName); } + m->openOutputFile(thisGroupName, out); + + if (countfile != "") { out << "Representative_Sequence\ttotal\t" << it->second << endl; } + + while (!in.eof()){ + if (m->control_pressed) { break; } - if (nameFile != "") { - map::iterator itName = nameMap.find(currSeq.getName()); - if (itName != nameMap.end()) { - vector thisSeqsNames; - m->splitAtChar(itName->second, thisSeqsNames, ','); - for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self - out << thisSeqsNames[k] << '\t' << it->second << endl; - } - }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); } + Sequence currSeq(in); m->gobble(in); + if (countfile == "") { + out << currSeq.getName() << '\t' << it->second << endl; + + if (nameFile != "") { + map::iterator itName = nameMap.find(currSeq.getName()); + if (itName != nameMap.end()) { + vector thisSeqsNames; + 
m->splitAtChar(itName->second, thisSeqsNames, ','); + for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self + out << thisSeqsNames[k] << '\t' << it->second << endl; + } + }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); } + } + }else { + map::iterator itTotalReps = nameCount.find(currSeq.getName()); + if (itTotalReps != nameCount.end()) { out << currSeq.getName() << '\t' << itTotalReps->second << '\t' << itTotalReps->second << endl; } + else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); } } - } - in.close(); - out.close(); - } + } + in.close(); + out.close(); + } + + if (countfile != "") { //create countfile with group info included + CountTable* ct = new CountTable(); + ct->readTable(trimCountFile); + map justTrimmedNames = ct->getNameMap(); + delete ct; + + CountTable newCt; + for (map::iterator itCount = groupCounts.begin(); itCount != groupCounts.end(); itCount++) { newCt.addGroup(itCount->first); } + vector tempCounts; tempCounts.resize(groupCounts.size(), 0); + for (map::iterator itNames = justTrimmedNames.begin(); itNames != justTrimmedNames.end(); itNames++) { + newCt.push_back(itNames->first, tempCounts); //add it to the table with no abundance so we can set the groups abundance + map::iterator it2 = groupMap.find(itNames->first); + if (it2 != groupMap.end()) { newCt.setAbund(itNames->first, it2->second, itNames->second); } + else { m->mothurOut("[ERROR]: missing group info for " + itNames->first + "."); m->mothurOutEndLine(); m->control_pressed = true; } + } + newCt.printTable(trimCountFile); + } } if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } @@ -511,6 +575,11 @@ int TrimSeqsCommand::execute(){ if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; 
m->setGroupFile(current); } } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); @@ -527,8 +596,7 @@ int TrimSeqsCommand::execute(){ } /**************************************************************************************/ - -int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector > fastaFileNames, vector > qualFileNames, vector > nameFileNames, linePair line, linePair qline) { +int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string trimCFileName, string scrapCFileName, string groupFileName, vector > fastaFileNames, vector > qualFileNames, vector > nameFileNames, linePair line, linePair qline) { try { @@ -552,9 +620,16 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string m->openOutputFile(scrapNFileName, scrapNameFile); } + ofstream trimCountFile; + ofstream scrapCountFile; + if(countfile != ""){ + m->openOutputFile(trimCFileName, trimCountFile); + m->openOutputFile(scrapCFileName, scrapCountFile); + if (line.start == 0) { trimCountFile << "Representative_Sequence\ttotal" << endl; scrapCountFile << "Representative_Sequence\ttotal" << endl; } + } ofstream outGroupsFile; - if (createGroup){ m->openOutputFile(groupFileName, outGroupsFile); } + if ((createGroup) && (countfile == "")){ m->openOutputFile(groupFileName, outGroupsFile); } if(allFiles){ for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file @@ 
-585,20 +660,17 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string int count = 0; bool moreSeqs = 1; - TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer); + TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer); while (moreSeqs) { if (m->control_pressed) { inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close(); - if (createGroup) { outGroupsFile.close(); } - - if(qFileName != ""){ - qFile.close(); - } - for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } - - return 0; + if ((createGroup) && (countfile == "")) { outGroupsFile.close(); } + if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); } + if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); } + if(countfile != "") { scrapCountFile.close(); trimCountFile.close(); } + for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } int success = 1; @@ -611,7 +683,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string QualityScores currQual; if(qFileName != ""){ currQual = QualityScores(qFile); m->gobble(qFile); - if ((m->debug)&&(count>15800)) { m->mothurOut("[DEBUG]: " + toString(count) + " fasta = " + currSeq.getName() + '\n'); m->mothurOut("[DEBUG]: " + toString(getpid()) + '\n'); } + //cout << currQual.getName() << endl; } string origSeq = currSeq.getUnaligned(); @@ -632,12 +704,6 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if(success > bdiffs) { trashCode += 'b'; } else{ currentSeqsDiffs += success; } } - - if(rbarcodes.size() != 0){ - success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex); - if(success > bdiffs) { trashCode += 'b'; } - else{ currentSeqsDiffs += success; } - } if(numSpacers != 0){ success = trimOligos.stripSpacer(currSeq, currQual); @@ -704,6 +770,8 @@ int 
TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string } } + if (m->debug) { m->mothurOut("[DEBUG]: " + currSeq.getName() + ", trashcode= " + trashCode); if (trashCode.length() != 0) { m->mothurOutEndLine(); } } + if(trashCode.length() == 0){ currSeq.setAligned(currSeq.getUnaligned()); currSeq.printSequence(trimFASTAFile); @@ -718,6 +786,15 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; } else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); } } + + int numRedundants = 0; + if (countfile != "") { + map::iterator itCount = nameCount.find(currSeq.getName()); + if (itCount != nameCount.end()) { + trimCountFile << itCount->first << '\t' << itCount->second << endl; + numRedundants = itCount->second-1; + }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); } + } if (createGroup) { if(barcodes.size() != 0){ @@ -732,9 +809,11 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string } } - outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; + if (m->debug) { m->mothurOut(", group= " + thisGroup + "\n"); } + + if (countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; } + else { groupMap[currSeq.getName()] = thisGroup; } - int numRedundants = 0; if (nameFile != "") { map::iterator itName = nameMap.find(currSeq.getName()); if (itName != nameMap.end()) { @@ -782,6 +861,13 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; } else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); } } + if (countfile != "") { + map::iterator 
itCount = nameCount.find(currSeq.getName()); + if (itCount != nameCount.end()) { + trimCountFile << itCount->first << '\t' << itCount->second << endl; + }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); } + } + currSeq.setName(currSeq.getName() + '|' + trashCode); currSeq.setUnaligned(origSeq); currSeq.setAligned(origSeq); @@ -815,6 +901,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if (createGroup) { outGroupsFile.close(); } if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); } if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); } + if(countfile != "") { scrapCountFile.close(); trimCountFile.close(); } return count; } @@ -826,7 +913,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string /**************************************************************************************************/ -int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string groupFile, vector > fastaFileNames, vector > qualFileNames, vector > nameFileNames) { +int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string trimCountFileName, string scrapCountFileName, string groupFile, vector > fastaFileNames, vector > qualFileNames, vector > nameFileNames) { try { int process = 1; @@ -877,6 +964,8 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName (scrapQualFileName + toString(getpid()) + ".temp"), (trimNameFileName + toString(getpid()) + ".temp"), (scrapNameFileName + toString(getpid()) + ".temp"), + 
(trimCountFileName + toString(getpid()) + ".temp"), + (scrapCountFileName + toString(getpid()) + ".temp"), (groupFile + toString(getpid()) + ".temp"), tempFASTAFileNames, tempPrimerQualFileNames, @@ -897,6 +986,11 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName for (map::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) { out << it->first << '\t' << it->second << endl; } + + out << groupMap.size() << endl; + for (map::iterator it = groupMap.begin(); it != groupMap.end(); it++) { + out << it->first << '\t' << it->second << endl; + } out.close(); } exit(0); @@ -919,8 +1013,12 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName m->openOutputFile(trimNameFileName, temp); temp.close(); m->openOutputFile(scrapNameFileName, temp); temp.close(); } + if (countfile != "") { + m->openOutputFile(trimCountFileName, temp); temp.close(); + m->openOutputFile(scrapCountFileName, temp); temp.close(); + } - driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]); + driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, trimCountFileName, scrapCountFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]); //force parent to wait until all the processes are done for (int i=0;iopenOutputFile(scrapNameFileName, temp); temp.close(); } - driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (groupFile 
+ toString(processors-1) + ".temp"), fastaFileNames, qualFileNames, nameFileNames, lines[processors-1], qLines[processors-1]); + driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (trimCountFileName + toString(processors-1) + ".temp"), (scrapCountFileName + toString(processors-1) + ".temp"), (groupFile + toString(processors-1) + ".temp"), fastaFileNames, qualFileNames, nameFileNames, lines[processors-1], qLines[processors-1]); processIDS.push_back(processors-1); @@ -1018,6 +1118,11 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName if (it2 == groupCounts.end()) { groupCounts[it->first] = it->second; } else { groupCounts[it->first] += it->second; } } + for (map::iterator it = pDataArray[i]->groupMap.begin(); it != pDataArray[i]->groupMap.end(); it++) { + map::iterator it2 = groupMap.find(it->first); + if (it2 == groupMap.end()) { groupMap[it->first] = it->second; } + else { m->mothurOut("[ERROR]: " + it->first + " is in your fasta file more than once. Sequence names must be unique. 
please correct.\n"); } + } CloseHandle(hThreadArray[i]); delete pDataArray[i]; } @@ -1048,8 +1153,15 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName); m->mothurRemove((scrapNameFileName + toString(processIDS[i]) + ".temp")); } + + if(countfile != ""){ + m->appendFiles((trimCountFileName + toString(processIDS[i]) + ".temp"), trimCountFileName); + m->mothurRemove((trimCountFileName + toString(processIDS[i]) + ".temp")); + m->appendFiles((scrapCountFileName + toString(processIDS[i]) + ".temp"), scrapCountFileName); + m->mothurRemove((scrapCountFileName + toString(processIDS[i]) + ".temp")); + } - if(createGroup){ + if((createGroup)&&(countfile == "")){ m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile); m->mothurRemove((groupFile + toString(processIDS[i]) + ".temp")); } @@ -1087,14 +1199,27 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName in >> tempNum; m->gobble(in); if (tempNum != 0) { - while (!in.eof()) { - in >> group >> tempNum; m->gobble(in); + for (int i = 0; i < tempNum; i++) { + int groupNum; + in >> group >> groupNum; m->gobble(in); map::iterator it = groupCounts.find(group); - if (it == groupCounts.end()) { groupCounts[group] = tempNum; } - else { groupCounts[it->first] += tempNum; } + if (it == groupCounts.end()) { groupCounts[group] = groupNum; } + else { groupCounts[it->first] += groupNum; } } } + in >> tempNum; m->gobble(in); + if (tempNum != 0) { + for (int i = 0; i < tempNum; i++) { + string group, seqName; + in >> seqName >> group; m->gobble(in); + + map::iterator it = groupMap.find(seqName); + if (it == groupMap.end()) { groupMap[seqName] = group; } + else { m->mothurOut("[ERROR]: " + seqName + " is in your fasta file more than once. Sequence names must be unique. 
please correct.\n"); } + } + } + in.close(); m->mothurRemove(tempFile); } #endif @@ -1255,7 +1380,9 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< while(!inOligos.eof()){ inOligos >> type; - + + if (m->debug) { m->mothurOut("[DEBUG]: reading type - " + type + ".\n"); } + if(type[0] == '#'){ while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there m->gobble(inOligos); @@ -1266,6 +1393,8 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< for(int i=0;i> oligo; + + if (m->debug) { m->mothurOut("[DEBUG]: reading - " + oligo + ".\n"); } for(int i=0;i >& fastaFileNames, vector< map::iterator itPrime = primers.find(oligo); if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } + if (m->debug) { if (group != "") { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); }else{ m->mothurOut("[DEBUG]: no group for primer " + oligo + ".\n"); } } + primers[oligo]=indexPrimer; indexPrimer++; primerNameVector.push_back(group); } @@ -1298,33 +1429,11 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< } else if(type == "BARCODE"){ inOligos >> group; - - //barcode lines can look like BARCODE atgcatgc groupName - for 454 seqs - //or BARCODE atgcatgc atgcatgc groupName - for illumina data that has forward and reverse info - string temp = ""; - while (!inOligos.eof()) { - char c = inOligos.get(); - if (c == 10 || c == 13){ break; } - else if (c == 32 || c == 9){;} //space or tab - else { temp += c; } - } - //then this is illumina data with 4 columns - if (temp != "") { - string reverseBarcode = reverseOligo(group); //reverse barcode - group = temp; - - //check for repeat barcodes - map::iterator itBar = rbarcodes.find(reverseBarcode); - if (itBar != rbarcodes.end()) { m->mothurOut("barcode " + reverseBarcode + " is in your oligos file already."); m->mothurOutEndLine(); } - - 
rbarcodes[reverseBarcode]=indexBarcode; - } - //check for repeat barcodes map::iterator itBar = barcodes.find(oligo); if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } - + barcodes[oligo]=indexBarcode; indexBarcode++; barcodeNameVector.push_back(group); }else if(type == "LINKER"){ @@ -1332,7 +1441,7 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< }else if(type == "SPACER"){ spacer.push_back(oligo); } - else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); } + else{ m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); } } m->gobble(inOligos); } @@ -1370,6 +1479,7 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< string fastaFileName = ""; string qualFileName = ""; string nameFileName = ""; + string countFileName = ""; if(primerName == ""){ comboGroupName = barcodeNameVector[itBar->second]; @@ -1416,7 +1526,6 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< nameFileNames[itBar->second][itPrimer->second] = nameFileName; m->openOutputFile(nameFileName, temp); temp.close(); } - } } } @@ -1438,7 +1547,7 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< break; } } - + if (allBlank) { m->mothurOut("[WARNING]: your oligos file does not contain any group names. 
mothur will not create a groupfile."); m->mothurOutEndLine(); allFiles = false; diff --git a/trimseqscommand.h b/trimseqscommand.h index 957f37a..1ffad21 100644 --- a/trimseqscommand.h +++ b/trimseqscommand.h @@ -14,8 +14,8 @@ #include "command.hpp" #include "sequence.hpp" #include "qualityscores.h" -#include "groupmap.h" #include "trimoligos.h" +#include "counttable.h" class TrimSeqsCommand : public Command { @@ -36,16 +36,13 @@ public: void help() { m->mothurOut(getHelpString()); } private: - - GroupMap* groupMap; - struct linePair { unsigned long long start; unsigned long long end; linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {} linePair() {} }; - + bool getOligos(vector >&, vector >&, vector >&); bool keepFirstTrim(Sequence&, QualityScores&); bool removeLastTrim(Sequence&, QualityScores&); @@ -55,7 +52,7 @@ private: string reverseOligo(string); bool abort, createGroup; - string fastaFile, oligoFile, qFileName, groupfile, nameFile, outputDir; + string fastaFile, oligoFile, qFileName, groupfile, nameFile, countfile, outputDir; bool flip, allFiles, qtrim, keepforward; int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts; @@ -64,7 +61,6 @@ private: vector revPrimer, outputNames; set filesToRemove; map barcodes; - map rbarcodes; vector groupVector; map primers; vector linker; @@ -75,13 +71,15 @@ private: vector barcodeNameVector; //needed here? 
map groupCounts; map nameMap; + map nameCount; //for countfile name -> repCount + map groupMap; //for countfile name -> group vector processIDS; //processid vector lines; vector qLines; - int driverCreateTrim(string, string, string, string, string, string, string, string, string, vector >, vector >, vector >, linePair, linePair); - int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, vector >, vector >, vector >); + int driverCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector >, vector >, vector >, linePair, linePair); + int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector >, vector >, vector >); int setLines(string, string); }; @@ -92,7 +90,7 @@ private: struct trimData { unsigned long long start, end; MothurOut* m; - string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, groupFileName, nameFile; + string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, trimCFileName, scrapCFileName, groupFileName, nameFile, countfile; vector > fastaFileNames; vector > qualFileNames; vector > nameFileNames; @@ -103,8 +101,8 @@ struct trimData { double qRollAverage, qThreshold, qWindowAverage, qAverage; vector revPrimer; map barcodes; - map rbarcodes; map primers; + map nameCount; vector linker; vector spacer; map combos; @@ -112,22 +110,26 @@ struct trimData { vector barcodeNameVector; map groupCounts; map nameMap; + map groupMap; trimData(){} - trimData(string fn, string qn, string nf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string gn, vector > ffn, vector > qfn, vector > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout, - int pd, int bd, int ld, int sd, int td, map pri, map 
bar, map rbar, vector revP, vector li, vector spa, + trimData(string fn, string qn, string nf, string cf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string tcn, string scn,string gn, vector > ffn, vector > qfn, vector > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout, + int pd, int bd, int ld, int sd, int td, map pri, map bar, vector revP, vector li, vector spa, vector priNameVector, vector barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL, int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage, - int minL, int maxA, int maxH, int maxL, bool fli, map nm) { + int minL, int maxA, int maxH, int maxL, bool fli, map nm, map ncount) { filename = fn; qFileName = qn; nameFile = nf; + countfile = cf; trimFileName = tn; scrapFileName = sn; trimQFileName = tqn; scrapQFileName = sqn; trimNFileName = tnn; scrapNFileName = snn; + trimCFileName = tcn; + scrapCFileName = scn; groupFileName = gn; fastaFileNames = ffn; qualFileNames = qfn; @@ -137,6 +139,7 @@ struct trimData { qlineStart = qstart; qlineEnd = qend; m = mout; + nameCount = ncount; pdiffs = pd; bdiffs = bd; @@ -144,7 +147,6 @@ struct trimData { sdiffs = sd; tdiffs = td; barcodes = bar; - rbarcodes = rbar; primers = pri; numFPrimers = primers.size(); revPrimer = revP; numRPrimers = revPrimer.size(); linker = li; numLinkers = linker.size(); @@ -203,7 +205,7 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ ofstream outGroupsFile; - if (pDataArray->createGroup){ pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile); } + if ((pDataArray->createGroup) && (pDataArray->countfile == "")){ pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile); } if(pDataArray->allFiles){ for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file for (int j = 0; j < 
pDataArray->fastaFileNames[i].size(); j++) { //clears old file @@ -222,6 +224,14 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ } } + ofstream trimCountFile; + ofstream scrapCountFile; + if(pDataArray->countfile != ""){ + pDataArray->m->openOutputFile(pDataArray->trimCFileName, trimCountFile); + pDataArray->m->openOutputFile(pDataArray->scrapCFileName, scrapCountFile); + if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) { trimCountFile << "Representative_Sequence\ttotal" << endl; scrapCountFile << "Representative_Sequence\ttotal" << endl; } + } + ifstream inFASTA; pDataArray->m->openInputFile(pDataArray->filename, inFASTA); if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) { @@ -241,14 +251,18 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ } - TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->rbarcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer); + TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer); pDataArray->count = pDataArray->lineEnd; for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process if (pDataArray->m->control_pressed) { inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close(); - if (pDataArray->createGroup) { outGroupsFile.close(); } + if ((pDataArray->createGroup) && (pDataArray->countfile == "")) { outGroupsFile.close(); } + if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); } + if(pDataArray->nameFile != "") { scrapNameFile.close(); trimNameFile.close(); } + if(pDataArray->countfile != "") { scrapCountFile.close(); trimCountFile.close(); } + if(pDataArray->qFileName != ""){ qFile.close(); } return 0; } @@ -282,12 +296,6 @@ static 
DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ else{ currentSeqsDiffs += success; } } - if(pDataArray->rbarcodes.size() != 0){ - success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex); - if(success > pDataArray->bdiffs) { trashCode += 'b'; } - else{ currentSeqsDiffs += success; } - } - if(pDataArray->numSpacers != 0){ success = trimOligos.stripSpacer(currSeq, currQual); if(success > pDataArray->sdiffs) { trashCode += 's'; } @@ -399,6 +407,15 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } } + int numRedundants = 0; + if (pDataArray->countfile != "") { + map::iterator itCount = pDataArray->nameCount.find(currSeq.getName()); + if (itCount != pDataArray->nameCount.end()) { + trimCountFile << itCount->first << '\t' << itCount->second << endl; + numRedundants = itCount->second-1; + }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); } + } + if (pDataArray->createGroup) { if(pDataArray->barcodes.size() != 0){ string thisGroup = pDataArray->barcodeNameVector[barcodeIndex]; @@ -412,9 +429,9 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ } } - outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; + if (pDataArray->countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; } + else { pDataArray->groupMap[currSeq.getName()] = thisGroup; } - int numRedundants = 0; if (pDataArray->nameFile != "") { map::iterator itName = pDataArray->nameMap.find(currSeq.getName()); if (itName != pDataArray->nameMap.end()) { @@ -462,6 +479,12 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ if (itName != pDataArray->nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; } else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() 
+ " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } } + if (pDataArray->countfile != "") { + map::iterator itCount = pDataArray->nameCount.find(currSeq.getName()); + if (itCount != pDataArray->nameCount.end()) { + trimCountFile << itCount->first << '\t' << itCount->second << endl; + }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); } + } currSeq.setName(currSeq.getName() + '|' + trashCode); currSeq.setUnaligned(origSeq); currSeq.setAligned(origSeq); diff --git a/unifracunweightedcommand.cpp b/unifracunweightedcommand.cpp index 0749cb7..edc4bbc 100644 --- a/unifracunweightedcommand.cpp +++ b/unifracunweightedcommand.cpp @@ -16,8 +16,9 @@ vector UnifracUnweightedCommand::setParameters(){ try { CommandParameter ptree("tree", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptree); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); @@ -42,7 +43,7 @@ vector 
UnifracUnweightedCommand::setParameters(){ string UnifracUnweightedCommand::getHelpString(){ try { string helpString = ""; - helpString += "The unifrac.unweighted command parameters are tree, group, name, groups, iters, distance, processors, root and random. tree parameter is required unless you have valid current tree file.\n"; + helpString += "The unifrac.unweighted command parameters are tree, group, name, count, groups, iters, distance, processors, root and random. tree parameter is required unless you have valid current tree file.\n"; helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 1 valid group.\n"; helpString += "The group names are separated by dashes. The iters parameter allows you to specify how many random trees you would like compared to your tree.\n"; helpString += "The distance parameter allows you to create a distance file from the results. The default is false. You may set distance to lt, square or column.\n"; @@ -165,6 +166,14 @@ UnifracUnweightedCommand::UnifracUnweightedCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { parameters["count"] = inputDir + it->second; } + } } //check for required parameters @@ -186,6 +195,19 @@ UnifracUnweightedCommand::UnifracUnweightedCommand(string option) { if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(treefile); } @@ -233,7 +255,13 @@ UnifracUnweightedCommand::UnifracUnweightedCommand(string option) { consensus = m->isTrue(temp); if (subsample && random) { m->mothurOut("[ERROR]: random must be false, if subsample=t.\n"); abort=true; } - if (subsample && (groupfile == "")) { m->mothurOut("[ERROR]: if subsample=t, a group file must be provided.\n"); abort=true; } + if (countfile == "") { if (subsample && (groupfile == "")) { m->mothurOut("[ERROR]: if subsample=t, a group file must be provided.\n"); abort=true; } } + else { + CountTable testCt; + if ((!testCt.testGroups(countfile)) && (subsample)) { + m->mothurOut("[ERROR]: if subsample=t, a count file with group info must be provided.\n"); abort=true; + } + } if (subsample && (!phylip)) { phylip=true; outputForm = "lt"; } if (consensus && (!subsample)) { m->mothurOut("[ERROR]: you cannot use consensus without subsample.\n"); abort=true; } @@ -246,10 +274,12 @@ 
UnifracUnweightedCommand::UnifracUnweightedCommand(string option) { m->setGroups(Groups); } - if (namefile == "") { - vector files; files.push_back(treefile); - parser.getNameFile(files); - } + if (countfile=="") { + if (namefile == "") { + vector files; files.push_back(treefile); + parser.getNameFile(files); + } + } } } @@ -267,12 +297,12 @@ int UnifracUnweightedCommand::execute() { m->setTreeFile(treefile); - TreeReader* reader = new TreeReader(treefile, groupfile, namefile); + TreeReader* reader; + if (countfile == "") { reader = new TreeReader(treefile, groupfile, namefile); } + else { reader = new TreeReader(treefile, countfile); } T = reader->getTrees(); - tmap = T[0]->getTreeMap(); - map nameMap = reader->getNames(); - map unique2Dup = reader->getNameMap(); - delete reader; + ct = T[0]->getCountTable(); + delete reader; sumFile = outputDir + m->getRootName(m->getSimpleName(treefile)) + getOutputFileNameTag("uwsummary"); outputNames.push_back(sumFile); outputTypes["uwsummary"].push_back(sumFile); @@ -280,7 +310,7 @@ int UnifracUnweightedCommand::execute() { SharedUtil util; Groups = m->getGroups(); - vector namesGroups = tmap->getNamesOfGroups(); + vector namesGroups = ct->getNamesOfGroups(); util.setGroups(Groups, namesGroups, allGroups, numGroups, "unweighted"); //sets the groups the user wants to analyze Unweighted unweighted(includeRoot); @@ -292,10 +322,9 @@ int UnifracUnweightedCommand::execute() { //user has not set size, set size = smallest samples size if (subsampleSize == -1) { vector temp; temp.push_back(Groups[0]); - subsampleSize = (tmap->getNamesSeqs(temp)).size(); //num in first group + subsampleSize = ct->getGroupCount(Groups[0]); //num in first group for (int i = 1; i < Groups.size(); i++) { - temp.clear(); temp.push_back(Groups[i]); - int thisSize = (tmap->getNamesSeqs(temp)).size(); + int thisSize = ct->getGroupCount(Groups[i]); if (thisSize < subsampleSize) { subsampleSize = thisSize; } } m->mothurOut("\nSetting subsample size to " + 
toString(subsampleSize) + ".\n\n"); @@ -303,9 +332,7 @@ int UnifracUnweightedCommand::execute() { vector newGroups = Groups; Groups.clear(); for (int i = 0; i < newGroups.size(); i++) { - vector thisGroup; thisGroup.push_back(newGroups[i]); - vector thisGroupsSeqs = tmap->getNamesSeqs(thisGroup); - int thisSize = thisGroupsSeqs.size(); + int thisSize = ct->getGroupCount(newGroups[i]); if (thisSize >= subsampleSize) { Groups.push_back(newGroups[i]); } else { m->mothurOut("You have selected a size that is larger than "+newGroups[i]+" number of sequences, removing "+newGroups[i]+".\n"); } @@ -330,7 +357,7 @@ int UnifracUnweightedCommand::execute() { //get pscores for users trees for (int i = 0; i < T.size(); i++) { - if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; }outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } counter = 0; @@ -351,7 +378,7 @@ int UnifracUnweightedCommand::execute() { userData = unweighted.getValues(T[i], processors, outputDir); //userData[0] = unweightedscore - if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); }return 0; } + if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); }return 0; } //output scores for each combination for(int k = 0; k < numComp; k++) { @@ -366,7 +393,7 @@ int UnifracUnweightedCommand::execute() { if (random) { runRandomCalcs(T[i], userData); } - if (m->control_pressed) { delete tmap; for (int i = 0; i < 
T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } int startSubsample = time(NULL); @@ -376,32 +403,28 @@ int UnifracUnweightedCommand::execute() { if (m->control_pressed) { break; } //copy to preserve old one - would do this in subsample but memory cleanup becomes messy. - TreeMap* newTmap = new TreeMap(); - //newTmap->getCopy(*tmap); - - //SubSample sample; - //Tree* subSampleTree = sample.getSample(T[i], newTmap, nameMap, subsampleSize); - + CountTable* newCt = new CountTable(); + //uses method of setting groups to doNotIncludeMe SubSample sample; - Tree* subSampleTree = sample.getSample(T[i], tmap, newTmap, subsampleSize, unique2Dup); + Tree* subSampleTree = sample.getSample(T[i], ct, newCt, subsampleSize); //call new weighted function vector iterData; iterData.resize(numComp,0); Unweighted thisUnweighted(includeRoot); iterData = thisUnweighted.getValues(subSampleTree, processors, outputDir); //userData[0] = weightedscore - + //save data to make ave dist, std dist calcDistsTotals.push_back(iterData); - delete newTmap; + delete newCt; delete subSampleTree; if((thisIter+1) % 100 == 0){ m->mothurOut(toString(thisIter+1)); m->mothurOutEndLine(); } } - m->mothurOut("It took " + toString(time(NULL) - startSubsample) + " secs to run the subsampling."); m->mothurOutEndLine(); + if (subsample) { m->mothurOut("It took " + toString(time(NULL) - startSubsample) + " secs to run the subsampling."); m->mothurOutEndLine(); } - if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { 
m->mothurRemove(outputNames[i]); } return 0; } + if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } if (subsample) { getAverageSTDMatrices(calcDistsTotals, i); } if (consensus) { getConsensusTrees(calcDistsTotals, i); } @@ -420,7 +443,7 @@ int UnifracUnweightedCommand::execute() { outSum.close(); - delete tmap; + delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } @@ -472,7 +495,7 @@ int UnifracUnweightedCommand::getAverageSTDMatrices(vector< vector >& di //find standard deviation vector stdDev; stdDev.resize(numComp, 0); - for (int thisIter = 0; thisIter < iters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each + for (int thisIter = 0; thisIter < subsampleIters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each for (int j = 0; j < dists[thisIter].size(); j++) { stdDev[j] += ((dists[thisIter][j] - averages[j]) * (dists[thisIter][j] - averages[j])); } @@ -578,8 +601,16 @@ int UnifracUnweightedCommand::getConsensusTrees(vector< vector >& dists, m->runParse = false; //create treemap class from groupmap for tree class to use - TreeMap newTmap; - newTmap.makeSim(m->getGroups()); + CountTable newCt; + set nameMap; + map groupMap; + set gps; + for (int i = 0; i < m->getGroups().size(); i++) { + nameMap.insert(m->getGroups()[i]); + gps.insert(m->getGroups()[i]); + groupMap[m->getGroups()[i]] = m->getGroups()[i]; + } + newCt.createTable(nameMap, groupMap, gps); //clear old tree names if any m->Treenames.clear(); @@ -587,7 +618,7 @@ int UnifracUnweightedCommand::getConsensusTrees(vector< vector >& dists, //fills globaldatas tree names m->Treenames = m->getGroups(); - vector 
newTrees = buildTrees(dists, treeNum, newTmap); //also creates .all.tre file containing the trees created + vector newTrees = buildTrees(dists, treeNum, newCt); //also creates .all.tre file containing the trees created if (m->control_pressed) { return 0; } @@ -613,7 +644,7 @@ int UnifracUnweightedCommand::getConsensusTrees(vector< vector >& dists, } /**************************************************************************************************/ -vector UnifracUnweightedCommand::buildTrees(vector< vector >& dists, int treeNum, TreeMap& mytmap) { +vector UnifracUnweightedCommand::buildTrees(vector< vector >& dists, int treeNum, CountTable& myct) { try { vector trees; @@ -647,9 +678,8 @@ vector UnifracUnweightedCommand::buildTrees(vector< vector >& dis } //create tree - Tree* tempTree = new Tree(&mytmap, sims); - map empty; - tempTree->assembleTree(empty); + Tree* tempTree = new Tree(&myct, sims); + tempTree->assembleTree(); trees.push_back(tempTree); diff --git a/unifracunweightedcommand.h b/unifracunweightedcommand.h index 15c3b96..107083f 100644 --- a/unifracunweightedcommand.h +++ b/unifracunweightedcommand.h @@ -12,7 +12,7 @@ #include "command.hpp" #include "unweighted.h" -#include "treemap.h" +#include "counttable.h" #include "sharedutilities.h" #include "fileoutput.h" #include "readtree.h" @@ -39,7 +39,7 @@ class UnifracUnweightedCommand : public Command { private: FileOutput* output; vector T; //user trees - TreeMap* tmap; + CountTable* ct; string sumFile, allGroups; vector groupComb; // AB. AC, BC... int iters, numGroups, numComp, counter, processors, subsampleSize, subsampleIters; @@ -50,7 +50,7 @@ class UnifracUnweightedCommand : public Command { vector< map > rCumul; //map -vector entry for each combination. 
bool abort, phylip, random, includeRoot, consensus, subsample; - string groups, itersString, outputDir, outputForm, treefile, groupfile, namefile; + string groups, itersString, outputDir, outputForm, treefile, groupfile, namefile, countfile; vector Groups, outputNames; //holds groups to be used ofstream outSum, out; @@ -60,7 +60,7 @@ class UnifracUnweightedCommand : public Command { void printUWSummaryFile(int); void printUnweightedFile(); void createPhylipFile(int); - vector buildTrees(vector< vector >&, int, TreeMap&); + vector buildTrees(vector< vector >&, int, CountTable&); int getConsensusTrees(vector< vector >&, int); int getAverageSTDMatrices(vector< vector >&, int); diff --git a/unifracweightedcommand.cpp b/unifracweightedcommand.cpp index d1e8833..cbec749 100644 --- a/unifracweightedcommand.cpp +++ b/unifracweightedcommand.cpp @@ -16,8 +16,9 @@ vector UnifracWeightedCommand::setParameters(){ try { CommandParameter ptree("tree", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptree); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); 
parameters.push_back(pprocessors); @@ -42,7 +43,7 @@ vector UnifracWeightedCommand::setParameters(){ string UnifracWeightedCommand::getHelpString(){ try { string helpString = ""; - helpString += "The unifrac.weighted command parameters are tree, group, name, groups, iters, distance, processors, root, subsample, consensus and random. tree parameter is required unless you have valid current tree file.\n"; + helpString += "The unifrac.weighted command parameters are tree, group, name, count, groups, iters, distance, processors, root, subsample, consensus and random. tree parameter is required unless you have valid current tree file.\n"; helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 2 valid groups.\n"; helpString += "The group names are separated by dashes. The iters parameter allows you to specify how many random trees you would like compared to your tree.\n"; helpString += "The distance parameter allows you to create a distance file from the results. The default is false.\n"; @@ -164,6 +165,14 @@ UnifracWeightedCommand::UnifracWeightedCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { parameters["count"] = inputDir + it->second; } + } } //check for required parameters @@ -186,6 +195,19 @@ UnifracWeightedCommand::UnifracWeightedCommand(string option) { else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(treefile); } @@ -233,14 +255,22 @@ UnifracWeightedCommand::UnifracWeightedCommand(string option) { consensus = m->isTrue(temp); if (subsample && random) { m->mothurOut("[ERROR]: random must be false, if subsample=t.\n"); abort=true; } - if (subsample && (groupfile == "")) { m->mothurOut("[ERROR]: if subsample=t, a group file must be provided.\n"); abort=true; } + if (countfile == "") { if (subsample && (groupfile == "")) { m->mothurOut("[ERROR]: if subsample=t, a group file must be provided.\n"); abort=true; } } + else { + CountTable testCt; + if ((!testCt.testGroups(countfile)) && (subsample)) { + m->mothurOut("[ERROR]: if subsample=t, a count file with group info must be provided.\n"); abort=true; + } + } if (subsample && (!phylip)) { phylip=true; outputForm = "lt"; } if (consensus && (!subsample)) { m->mothurOut("[ERROR]: you cannot use consensus without subsample.\n"); abort=true; } - if (namefile == "") { - vector files; files.push_back(treefile); - parser.getNameFile(files); - } + if 
(countfile=="") { + if (namefile == "") { + vector files; files.push_back(treefile); + parser.getNameFile(files); + } + } } @@ -258,14 +288,14 @@ int UnifracWeightedCommand::execute() { m->setTreeFile(treefile); - TreeReader* reader = new TreeReader(treefile, groupfile, namefile); + TreeReader* reader; + if (countfile == "") { reader = new TreeReader(treefile, groupfile, namefile); } + else { reader = new TreeReader(treefile, countfile); } T = reader->getTrees(); - tmap = T[0]->getTreeMap(); - map nameMap = reader->getNames(); - map unique2Dup = reader->getNameMap(); + ct = T[0]->getCountTable(); delete reader; - - if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } return 0; } + + if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } return 0; } sumFile = outputDir + m->getSimpleName(treefile) + getOutputFileNameTag("wsummary"); m->openOutputFile(sumFile, outSum); @@ -274,11 +304,11 @@ int UnifracWeightedCommand::execute() { SharedUtil util; string s; //to make work with setgroups Groups = m->getGroups(); - vector nameGroups = tmap->getNamesOfGroups(); + vector nameGroups = ct->getNamesOfGroups(); util.setGroups(Groups, nameGroups, s, numGroups, "weighted"); //sets the groups the user wants to analyze m->setGroups(Groups); - if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } return 0; } + if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } return 0; } Weighted weighted(includeRoot); @@ -289,10 +319,9 @@ int UnifracWeightedCommand::execute() { //user has not set size, set size = smallest samples size if (subsampleSize == -1) { vector temp; temp.push_back(Groups[0]); - subsampleSize = (tmap->getNamesSeqs(temp)).size(); //num in first group + subsampleSize = ct->getGroupCount(Groups[0]); //num in first group for (int i = 1; i < Groups.size(); i++) { - temp.clear(); temp.push_back(Groups[i]); - int thisSize = 
(tmap->getNamesSeqs(temp)).size(); + int thisSize = ct->getGroupCount(Groups[i]); if (thisSize < subsampleSize) { subsampleSize = thisSize; } } m->mothurOut("\nSetting subsample size to " + toString(subsampleSize) + ".\n\n"); @@ -300,12 +329,10 @@ int UnifracWeightedCommand::execute() { vector newGroups = Groups; Groups.clear(); for (int i = 0; i < newGroups.size(); i++) { - vector thisGroup; thisGroup.push_back(newGroups[i]); - vector thisGroupsSeqs = tmap->getNamesSeqs(thisGroup); - int thisSize = thisGroupsSeqs.size(); + int thisSize = ct->getGroupCount(newGroups[i]); if (thisSize >= subsampleSize) { Groups.push_back(newGroups[i]); } - else { m->mothurOut("You have selected a size that is larger than "+newGroups[i]+" number of sequences, removing "+newGroups[i]+".\n"); } + else { m->mothurOut("You have selected a size that is larger than "+newGroups[i]+" number of sequences, removing "+newGroups[i]+".\n"); } } m->setGroups(Groups); } @@ -321,7 +348,7 @@ int UnifracWeightedCommand::execute() { //get weighted scores for users trees for (int i = 0; i < T.size(); i++) { - if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } counter = 0; rScores.resize(numComp); //data[0] = weightedscore AB, data[1] = weightedscore AC... 
@@ -337,7 +364,7 @@ int UnifracWeightedCommand::execute() { } userData = weighted.getValues(T[i], processors, outputDir); //userData[0] = weightedscore - if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } //save users score for (int s=0; scontrol_pressed) { break; } //copy to preserve old one - would do this in subsample but memory cleanup becomes messy. - TreeMap* newTmap = new TreeMap(); - //newTmap->getCopy(*tmap); - - //SubSample sample; - //Tree* subSampleTree = sample.getSample(T[i], newTmap, nameMap, subsampleSize); + CountTable* newCt = new CountTable(); //uses method of setting groups to doNotIncludeMe SubSample sample; - Tree* subSampleTree = sample.getSample(T[i], tmap, newTmap, subsampleSize, unique2Dup); - + Tree* subSampleTree = sample.getSample(T[i], ct, newCt, subsampleSize); + //call new weighted function vector iterData; iterData.resize(numComp,0); Weighted thisWeighted(includeRoot); @@ -379,20 +402,20 @@ int UnifracWeightedCommand::execute() { //save data to make ave dist, std dist calcDistsTotals.push_back(iterData); - delete newTmap; + delete newCt; delete subSampleTree; if((thisIter+1) % 100 == 0){ m->mothurOut(toString(thisIter+1)); m->mothurOutEndLine(); } } - if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (random) { delete output; } outSum.close(); for (int i = 
0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } if (subsample) { getAverageSTDMatrices(calcDistsTotals, i); } if (consensus) { getConsensusTrees(calcDistsTotals, i); } } - if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } if (phylip) { createPhylipFile(); } @@ -400,7 +423,7 @@ int UnifracWeightedCommand::execute() { //clear out users groups m->clearGroups(); - delete tmap; + delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } @@ -557,9 +580,17 @@ int UnifracWeightedCommand::getConsensusTrees(vector< vector >& dists, i //used in tree constructor m->runParse = false; - //create treemap class from groupmap for tree class to use - TreeMap newTmap; - newTmap.makeSim(m->getGroups()); + ///create treemap class from groupmap for tree class to use + CountTable newCt; + set nameMap; + map groupMap; + set gps; + for (int i = 0; i < m->getGroups().size(); i++) { + nameMap.insert(m->getGroups()[i]); + gps.insert(m->getGroups()[i]); + groupMap[m->getGroups()[i]] = m->getGroups()[i]; + } + newCt.createTable(nameMap, groupMap, gps); //clear old tree names if any m->Treenames.clear(); @@ -567,7 +598,7 @@ int UnifracWeightedCommand::getConsensusTrees(vector< vector >& dists, i //fills globaldatas tree names m->Treenames = m->getGroups(); - vector newTrees = buildTrees(dists, treeNum, newTmap); //also creates .all.tre file containing the trees created + vector newTrees = buildTrees(dists, treeNum, newCt); //also creates .all.tre file containing the trees created if 
(m->control_pressed) { return 0; } @@ -593,7 +624,7 @@ int UnifracWeightedCommand::getConsensusTrees(vector< vector >& dists, i } /**************************************************************************************************/ -vector UnifracWeightedCommand::buildTrees(vector< vector >& dists, int treeNum, TreeMap& mytmap) { +vector UnifracWeightedCommand::buildTrees(vector< vector >& dists, int treeNum, CountTable& myct) { try { vector trees; @@ -627,9 +658,8 @@ vector UnifracWeightedCommand::buildTrees(vector< vector >& dists } //create tree - Tree* tempTree = new Tree(&mytmap, sims); - map empty; - tempTree->assembleTree(empty); + Tree* tempTree = new Tree(&myct, sims); + tempTree->assembleTree(); trees.push_back(tempTree); @@ -682,7 +712,7 @@ int UnifracWeightedCommand::runRandomCalcs(Tree* thisTree, vector usersS //get scores for random trees for (int j = 0; j < iters; j++) { - + cout << j << endl; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) if(processors == 1){ driver(thisTree, namesOfGroupCombos, 0, namesOfGroupCombos.size(), rScores); @@ -693,7 +723,7 @@ int UnifracWeightedCommand::runRandomCalcs(Tree* thisTree, vector usersS driver(thisTree, namesOfGroupCombos, 0, namesOfGroupCombos.size(), rScores); #endif - if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } delete output; outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } delete output; outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } //report progress // m->mothurOut("Iter: " + toString(j+1)); m->mothurOutEndLine(); @@ -796,7 +826,7 @@ int UnifracWeightedCommand::createProcesses(Tree* t, vector< vector > na 
/**************************************************************************************************/ int UnifracWeightedCommand::driver(Tree* t, vector< vector > namesOfGroupCombos, int start, int num, vector< vector >& scores) { try { - Tree* randT = new Tree(tmap); + Tree* randT = new Tree(ct); Weighted weighted(includeRoot); diff --git a/unifracweightedcommand.h b/unifracweightedcommand.h index 06354ce..fead41b 100644 --- a/unifracweightedcommand.h +++ b/unifracweightedcommand.h @@ -12,7 +12,7 @@ #include "command.hpp" #include "weighted.h" -#include "treemap.h" +#include "counttable.h" #include "progress.hpp" #include "sharedutilities.h" #include "fileoutput.h" @@ -43,7 +43,7 @@ class UnifracWeightedCommand : public Command { linePair(int i, int j) : start(i), num(j) {} }; vector lines; - TreeMap* tmap; + CountTable* ct; FileOutput* output; vector T; //user trees vector utreeScores; //user tree unweighted scores @@ -58,7 +58,7 @@ class UnifracWeightedCommand : public Command { map validScores; //map contains scores from random bool abort, phylip, random, includeRoot, subsample, consensus; - string groups, itersString, outputForm, treefile, groupfile, namefile; + string groups, itersString, outputForm, treefile, groupfile, namefile, countfile; vector Groups, outputNames; //holds groups to be used int processors, subsampleSize, subsampleIters; ofstream outSum; @@ -73,7 +73,7 @@ class UnifracWeightedCommand : public Command { int createProcesses(Tree*, vector< vector >, vector< vector >&); int driver(Tree*, vector< vector >, int, int, vector< vector >&); int runRandomCalcs(Tree*, vector); - vector buildTrees(vector< vector >&, int, TreeMap&); + vector buildTrees(vector< vector >&, int, CountTable&); int getConsensusTrees(vector< vector >&, int); int getAverageSTDMatrices(vector< vector >&, int); diff --git a/unweighted.cpp b/unweighted.cpp index 864a9f8..e95834f 100644 --- a/unweighted.cpp +++ b/unweighted.cpp @@ -16,7 +16,7 @@ EstOutput 
Unweighted::getValues(Tree* t, int p, string o) { processors = p; outputDir = o; - TreeMap* tmap = t->getTreeMap(); + CountTable* ct = t->getCountTable(); //if the users enters no groups then give them the score of all groups int numGroups = m->getNumGroups(); @@ -36,9 +36,9 @@ EstOutput Unweighted::getValues(Tree* t, int p, string o) { vector groups; if (numGroups == 0) { //get score for all users groups - for (int i = 0; i < (tmap->getNamesOfGroups()).size(); i++) { - if ((tmap->getNamesOfGroups())[i] != "xxx") { - groups.push_back((tmap->getNamesOfGroups())[i]); + for (int i = 0; i < (ct->getNamesOfGroups()).size(); i++) { + if ((ct->getNamesOfGroups())[i] != "xxx") { + groups.push_back((ct->getNamesOfGroups())[i]); } } namesOfGroupCombos.push_back(groups); @@ -52,7 +52,7 @@ EstOutput Unweighted::getValues(Tree* t, int p, string o) { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) if(processors == 1){ - data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap); + data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), ct); }else{ int numPairs = namesOfGroupCombos.size(); @@ -67,11 +67,11 @@ EstOutput Unweighted::getValues(Tree* t, int p, string o) { lines.push_back(linePair(startPos, numPairsPerProcessor)); } - data = createProcesses(t, namesOfGroupCombos, tmap); + data = createProcesses(t, namesOfGroupCombos, ct); lines.clear(); } #else - data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap); + data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), ct); #endif return data; @@ -83,7 +83,7 @@ EstOutput Unweighted::getValues(Tree* t, int p, string o) { } /**************************************************************************************************/ -EstOutput Unweighted::createProcesses(Tree* t, vector< vector > namesOfGroupCombos, TreeMap* tmap) { +EstOutput Unweighted::createProcesses(Tree* t, vector< vector > namesOfGroupCombos, CountTable* ct) { 
try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) int process = 1; @@ -100,7 +100,7 @@ EstOutput Unweighted::createProcesses(Tree* t, vector< vector > namesOfG process++; }else if (pid == 0){ EstOutput myresults; - myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, tmap); + myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, ct); if (m->control_pressed) { exit(0); } @@ -122,7 +122,7 @@ EstOutput Unweighted::createProcesses(Tree* t, vector< vector > namesOfG } } - results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, tmap); + results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, ct); //force parent to wait until all the processes are done for (int i=0;i<(processors-1);i++) { @@ -167,7 +167,7 @@ EstOutput Unweighted::createProcesses(Tree* t, vector< vector > namesOfG } } /**************************************************************************************************/ -EstOutput Unweighted::driver(Tree* t, vector< vector > namesOfGroupCombos, int start, int num, TreeMap* tmap) { +EstOutput Unweighted::driver(Tree* t, vector< vector > namesOfGroupCombos, int start, int num, CountTable* ct) { try { @@ -261,7 +261,7 @@ EstOutput Unweighted::getValues(Tree* t, string groupA, string groupB, int p, st processors = p; outputDir = o; - TreeMap* tmap = t->getTreeMap(); + CountTable* ct = t->getCountTable(); //if the users enters no groups then give them the score of all groups int numGroups = m->getNumGroups(); @@ -281,9 +281,9 @@ EstOutput Unweighted::getValues(Tree* t, string groupA, string groupB, int p, st vector groups; if (numGroups == 0) { //get score for all users groups - for (int i = 0; i < (tmap->getNamesOfGroups()).size(); i++) { - if ((tmap->getNamesOfGroups())[i] != "xxx") { - groups.push_back((tmap->getNamesOfGroups())[i]); + for (int i = 0; i < (ct->getNamesOfGroups()).size(); i++) { + if 
((ct->getNamesOfGroups())[i] != "xxx") { + groups.push_back((ct->getNamesOfGroups())[i]); } } namesOfGroupCombos.push_back(groups); @@ -297,7 +297,7 @@ EstOutput Unweighted::getValues(Tree* t, string groupA, string groupB, int p, st #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) if(processors == 1){ - data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), true, tmap); + data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), true, ct); }else{ int numPairs = namesOfGroupCombos.size(); @@ -311,12 +311,12 @@ EstOutput Unweighted::getValues(Tree* t, string groupA, string groupB, int p, st lines.push_back(linePair(startPos, numPairsPerProcessor)); } - data = createProcesses(t, namesOfGroupCombos, true, tmap); + data = createProcesses(t, namesOfGroupCombos, true, ct); lines.clear(); } #else - data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), true, tmap); + data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), true, ct); #endif return data; @@ -328,7 +328,7 @@ EstOutput Unweighted::getValues(Tree* t, string groupA, string groupB, int p, st } /**************************************************************************************************/ -EstOutput Unweighted::createProcesses(Tree* t, vector< vector > namesOfGroupCombos, bool usingGroups, TreeMap* tmap) { +EstOutput Unweighted::createProcesses(Tree* t, vector< vector > namesOfGroupCombos, bool usingGroups, CountTable* ct) { try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) int process = 1; @@ -345,7 +345,7 @@ EstOutput Unweighted::createProcesses(Tree* t, vector< vector > namesOfG process++; }else if (pid == 0){ EstOutput myresults; - myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, usingGroups, tmap); + myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, usingGroups, ct); if 
(m->control_pressed) { exit(0); } @@ -365,7 +365,7 @@ EstOutput Unweighted::createProcesses(Tree* t, vector< vector > namesOfG } } - results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, usingGroups, tmap); + results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, usingGroups, ct); //force parent to wait until all the processes are done for (int i=0;i<(processors-1);i++) { @@ -409,14 +409,14 @@ EstOutput Unweighted::createProcesses(Tree* t, vector< vector > namesOfG } } /**************************************************************************************************/ -EstOutput Unweighted::driver(Tree* t, vector< vector > namesOfGroupCombos, int start, int num, bool usingGroups, TreeMap* tmap) { +EstOutput Unweighted::driver(Tree* t, vector< vector > namesOfGroupCombos, int start, int num, bool usingGroups, CountTable* ct) { try { EstOutput results; results.resize(num); int count = 0; - Tree* copyTree = new Tree(tmap); + Tree* copyTree = new Tree(ct); for (int h = start; h < (start+num); h++) { diff --git a/unweighted.h b/unweighted.h index c6c13bb..b136b00 100644 --- a/unweighted.h +++ b/unweighted.h @@ -12,7 +12,7 @@ */ #include "treecalculator.h" -#include "treemap.h" +#include "counttable.h" /***********************************************************************/ @@ -38,10 +38,10 @@ class Unweighted : public TreeCalculator { map< vector, set > rootForGrouping; //maps a grouping combo to the roots for that combo bool includeRoot; - EstOutput driver(Tree*, vector< vector >, int, int, TreeMap*); - EstOutput createProcesses(Tree*, vector< vector >, TreeMap*); - EstOutput driver(Tree*, vector< vector >, int, int, bool, TreeMap*); - EstOutput createProcesses(Tree*, vector< vector >, bool, TreeMap*); + EstOutput driver(Tree*, vector< vector >, int, int, CountTable*); + EstOutput createProcesses(Tree*, vector< vector >, CountTable*); + EstOutput driver(Tree*, vector< vector >, int, int, bool, CountTable*); + EstOutput 
createProcesses(Tree*, vector< vector >, bool, CountTable*); int getRoot(Tree*, int, vector); }; diff --git a/validparameter.cpp b/validparameter.cpp index 3e1f349..7d1af25 100644 --- a/validparameter.cpp +++ b/validparameter.cpp @@ -307,6 +307,14 @@ string ValidParameters::validFile(map& container, string paramet if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ". I suspect you entered a column formatted file as a phylip file, aborting."); m->mothurOutEndLine(); return "not found"; } } + + //check for blank file + if (ableToOpen != 1) { + if (m->isBlank(container[parameter])) { + m->mothurOut("[ERROR]: " + container[parameter] + " is blank, aborting."); m->mothurOutEndLine(); return "not found"; + } + } + } }else { return "not found"; } diff --git a/weighted.cpp b/weighted.cpp index 85eed52..cf1291d 100644 --- a/weighted.cpp +++ b/weighted.cpp @@ -19,7 +19,7 @@ EstOutput Weighted::getValues(Tree* t, int p, string o) { processors = p; outputDir = o; - TreeMap* tmap = t->getTreeMap(); + CountTable* ct = t->getCountTable(); numGroups = m->getNumGroups(); @@ -38,7 +38,7 @@ EstOutput Weighted::getValues(Tree* t, int p, string o) { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) if(processors == 1){ - data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap); + data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), ct); }else{ int numPairs = namesOfGroupCombos.size(); @@ -52,12 +52,12 @@ EstOutput Weighted::getValues(Tree* t, int p, string o) { lines.push_back(linePair(startPos, numPairsPerProcessor)); } - data = createProcesses(t, namesOfGroupCombos, tmap); + data = createProcesses(t, namesOfGroupCombos, ct); lines.clear(); } #else - data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap); + data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), ct); #endif return data; @@ -69,7 +69,7 @@ EstOutput 
Weighted::getValues(Tree* t, int p, string o) { } /**************************************************************************************************/ -EstOutput Weighted::createProcesses(Tree* t, vector< vector > namesOfGroupCombos, TreeMap* tmap) { +EstOutput Weighted::createProcesses(Tree* t, vector< vector > namesOfGroupCombos, CountTable* ct) { try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) int process = 1; @@ -87,7 +87,7 @@ EstOutput Weighted::createProcesses(Tree* t, vector< vector > namesOfGro }else if (pid == 0){ EstOutput Myresults; - Myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, tmap); + Myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, ct); //m->mothurOut("Merging results."); m->mothurOutEndLine(); @@ -110,7 +110,7 @@ EstOutput Weighted::createProcesses(Tree* t, vector< vector > namesOfGro } } - results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, tmap); + results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, ct); //force parent to wait until all the processes are done for (int i=0;i<(processors-1);i++) { @@ -155,7 +155,7 @@ EstOutput Weighted::createProcesses(Tree* t, vector< vector > namesOfGro } } /**************************************************************************************************/ -EstOutput Weighted::driver(Tree* t, vector< vector > namesOfGroupCombos, int start, int num, TreeMap* tmap) { +EstOutput Weighted::driver(Tree* t, vector< vector > namesOfGroupCombos, int start, int num, CountTable* ct) { try { EstOutput results; vector D; @@ -179,7 +179,7 @@ EstOutput Weighted::driver(Tree* t, vector< vector > namesOfGroupCombos, int numSeqsInGroupI = it->second; double sum = getLengthToRoot(t, t->groupNodeInfo[groupA][j], groupA, groupB); - double weightedSum = ((numSeqsInGroupI * sum) / (double)tmap->seqsPerGroup[groupA]); + double weightedSum = ((numSeqsInGroupI * sum) / 
(double)ct->getGroupCount(groupA)); D[count] += weightedSum; } @@ -190,7 +190,7 @@ EstOutput Weighted::driver(Tree* t, vector< vector > namesOfGroupCombos, int numSeqsInGroupL = it->second; double sum = getLengthToRoot(t, t->groupNodeInfo[groupB][j], groupA, groupB); - double weightedSum = ((numSeqsInGroupL * sum) / (double)tmap->seqsPerGroup[groupB]); + double weightedSum = ((numSeqsInGroupL * sum) / (double)ct->getGroupCount(groupB)); D[count] += weightedSum; } @@ -216,7 +216,7 @@ EstOutput Weighted::driver(Tree* t, vector< vector > namesOfGroupCombos, it = t->tree[i].pcount.find(groupA); //if it does u = # of its descendants with a certain group / total number in tree with a certain group if (it != t->tree[i].pcount.end()) { - u = (double) t->tree[i].pcount[groupA] / (double) tmap->seqsPerGroup[groupA]; + u = (double) t->tree[i].pcount[groupA] / (double) ct->getGroupCount(groupA); }else { u = 0.00; } @@ -225,7 +225,7 @@ EstOutput Weighted::driver(Tree* t, vector< vector > namesOfGroupCombos, //if it does subtract their percentage from u if (it != t->tree[i].pcount.end()) { - u -= (double) t->tree[i].pcount[groupB] / (double) tmap->seqsPerGroup[groupB]; + u -= (double) t->tree[i].pcount[groupB] / (double) ct->getGroupCount(groupB); } if (includeRoot) { @@ -270,7 +270,7 @@ EstOutput Weighted::getValues(Tree* t, string groupA, string groupB) { data.clear(); //clear out old values - TreeMap* tmap = t->getTreeMap(); + CountTable* ct = t->getCountTable(); if (m->control_pressed) { return data; } @@ -287,7 +287,7 @@ EstOutput Weighted::getValues(Tree* t, string groupA, string groupB) { int numSeqsInGroupI = it->second; double sum = getLengthToRoot(t, t->groupNodeInfo[groups[0]][j], groups[0], groups[1]); - double weightedSum = ((numSeqsInGroupI * sum) / (double)tmap->seqsPerGroup[groups[0]]); + double weightedSum = ((numSeqsInGroupI * sum) / (double)ct->getGroupCount(groups[0])); D += weightedSum; } @@ -298,7 +298,7 @@ EstOutput Weighted::getValues(Tree* t, string 
groupA, string groupB) { int numSeqsInGroupL = it->second; double sum = getLengthToRoot(t, t->groupNodeInfo[groups[1]][j], groups[0], groups[1]); - double weightedSum = ((numSeqsInGroupL * sum) / (double)tmap->seqsPerGroup[groups[1]]); + double weightedSum = ((numSeqsInGroupL * sum) / (double)ct->getGroupCount(groups[1])); D += weightedSum; } @@ -314,7 +314,7 @@ EstOutput Weighted::getValues(Tree* t, string groupA, string groupB) { it = t->tree[i].pcount.find(groupA); //if it does u = # of its descendants with a certain group / total number in tree with a certain group if (it != t->tree[i].pcount.end()) { - u = (double) t->tree[i].pcount[groupA] / (double) tmap->seqsPerGroup[groupA]; + u = (double) t->tree[i].pcount[groupA] / (double) ct->getGroupCount(groupA); }else { u = 0.00; } @@ -322,7 +322,7 @@ EstOutput Weighted::getValues(Tree* t, string groupA, string groupB) { it = t->tree[i].pcount.find(groupB); //if it does subtract their percentage from u if (it != t->tree[i].pcount.end()) { - u -= (double) t->tree[i].pcount[groupB] / (double) tmap->seqsPerGroup[groupB]; + u -= (double) t->tree[i].pcount[groupB] / (double) ct->getGroupCount(groupB); } if (includeRoot) { diff --git a/weighted.h b/weighted.h index 180409c..d4082fe 100644 --- a/weighted.h +++ b/weighted.h @@ -12,7 +12,7 @@ */ #include "treecalculator.h" -#include "treemap.h" +#include "counttable.h" /***********************************************************************/ @@ -41,8 +41,8 @@ class Weighted : public TreeCalculator { map< vector, set > rootForGrouping; //maps a grouping combo to the root for that combo bool includeRoot; - EstOutput driver(Tree*, vector< vector >, int, int, TreeMap*); - EstOutput createProcesses(Tree*, vector< vector >, TreeMap*); + EstOutput driver(Tree*, vector< vector >, int, int, CountTable*); + EstOutput createProcesses(Tree*, vector< vector >, CountTable*); double getLengthToRoot(Tree*, int, string, string); }; diff --git a/weightedlinkage.cpp b/weightedlinkage.cpp index 
19c41ce..c1e4d51 100644 --- a/weightedlinkage.cpp +++ b/weightedlinkage.cpp @@ -5,7 +5,6 @@ #include "mothur.h" #include "cluster.hpp" #include "rabundvector.hpp" -#include "sparsematrix.hpp" /* This class implements the WPGMA, weighted average neighbor clustering algorithm */