Merge remote-tracking branch 'mothur/master'

author Pat Schloss <pschloss@umich.edu>

Mon, 22 Oct 2012 13:18:06 +0000 (09:18 -0400)

committer Pat Schloss <pschloss@umich.edu>

Mon, 22 Oct 2012 13:18:06 +0000 (09:18 -0400)
author Pat Schloss <pschloss@umich.edu>
Mon, 22 Oct 2012 13:18:06 +0000 (09:18 -0400)
committer Pat Schloss <pschloss@umich.edu>
Mon, 22 Oct 2012 13:18:06 +0000 (09:18 -0400)
diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj

index 979b1e69ca8c3c9938e4ab873601b96a5c9e1ecc..ecb0619a39d993201f9bd102813c9b10b7cc3f0b 100644 (file)
--- a/Mothur.xcodeproj/project.pbxproj
+++ b/Mothur.xcodeproj/project.pbxproj
@@ -19,11 +19,21 @@
                 A71CB160130B04A2001E7287 /* anosimcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A71CB15E130B04A2001E7287 /* anosimcommand.cpp */; };
                 A71FE12C12EDF72400963CA7 /* mergegroupscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A71FE12B12EDF72400963CA7 /* mergegroupscommand.cpp */; };
                 A721765713BB9F7D0014DAAE /* referencedb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721765613BB9F7D0014DAAE /* referencedb.cpp */; };
+               A721AB6A161C570F009860A1 /* alignnode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB66161C570F009860A1 /* alignnode.cpp */; };
+               A721AB6B161C570F009860A1 /* aligntree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB68161C570F009860A1 /* aligntree.cpp */; };
+               A721AB71161C572A009860A1 /* kmernode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB6D161C572A009860A1 /* kmernode.cpp */; };
+               A721AB72161C572A009860A1 /* kmertree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB6F161C572A009860A1 /* kmertree.cpp */; };
+               A721AB77161C573B009860A1 /* taxonomynode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB73161C573B009860A1 /* taxonomynode.cpp */; };
                 A724D2B7153C8628000A826F /* makebiomcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A724D2B6153C8628000A826F /* makebiomcommand.cpp */; };
                 A727864412E9E28C00F86ABA /* removerarecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A727864312E9E28C00F86ABA /* removerarecommand.cpp */; };
+               A7386C231619CCE600651424 /* classifysharedcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7386C211619CCE600651424 /* classifysharedcommand.cpp */; };
+               A7386C251619E52300651424 /* abstractdecisiontree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7386C241619E52200651424 /* abstractdecisiontree.cpp */; };
+               A7386C27161A0F9D00651424 /* abstractrandomforest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7386C26161A0F9C00651424 /* abstractrandomforest.cpp */; };
+               A7386C29161A110800651424 /* decisiontree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7386C28161A110700651424 /* decisiontree.cpp */; };
                 A73901081588C40900ED2ED6 /* loadlogfilecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73901071588C40900ED2ED6 /* loadlogfilecommand.cpp */; };
                 A73DDBBA13C4A0D1006AAE38 /* clearmemorycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73DDBB913C4A0D1006AAE38 /* clearmemorycommand.cpp */; };
                 A73DDC3813C4BF64006AAE38 /* mothurmetastats.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73DDC3713C4BF64006AAE38 /* mothurmetastats.cpp */; };
+               A741FAD215D1688E0067BCC5 /* sequencecountparser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A741FAD115D1688E0067BCC5 /* sequencecountparser.cpp */; };
                 A74A9A9F148E881E00AB5E3E /* spline.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74A9A9E148E881E00AB5E3E /* spline.cpp */; };
                 A74D36B8137DAFAA00332B0C /* chimerauchimecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D36B7137DAFAA00332B0C /* chimerauchimecommand.cpp */; };
                 A74D59A4159A1E2000043046 /* counttable.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D59A3159A1E2000043046 /* counttable.cpp */; };
@@ -36,6 +46,8 @@
                 A77410F614697C300098E6AC /* seqnoise.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77410F414697C300098E6AC /* seqnoise.cpp */; };
                 A778FE6B134CA6CA00C0BA33 /* getcommandinfocommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A778FE6A134CA6CA00C0BA33 /* getcommandinfocommand.cpp */; };
                 A77A221F139001B600B0BE70 /* deuniquetreecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77A221E139001B600B0BE70 /* deuniquetreecommand.cpp */; };
+               A77E1938161B201E00DB1A2A /* randomforest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77E1937161B201E00DB1A2A /* randomforest.cpp */; };
+               A77E193B161B289600DB1A2A /* rftreenode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77E193A161B289600DB1A2A /* rftreenode.cpp */; };
                 A77EBD2F1523709100ED407C /* createdatabasecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77EBD2E1523709100ED407C /* createdatabasecommand.cpp */; };
                 A7876A26152A017C00A0AE86 /* subsample.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7876A25152A017C00A0AE86 /* subsample.cpp */; };
                 A79234D713C74BF6002B08E2 /* mothurfisher.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A79234D613C74BF6002B08E2 /* mothurfisher.cpp */; };
@@ -50,6 +62,7 @@
                 A7BF2232145879B2000AD524 /* chimeraperseuscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7BF2231145879B2000AD524 /* chimeraperseuscommand.cpp */; };
                 A7C3DC0B14FE457500FE1924 /* cooccurrencecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7C3DC0914FE457500FE1924 /* cooccurrencecommand.cpp */; };
                 A7C3DC0F14FE469500FE1924 /* trialSwap2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7C3DC0D14FE469500FE1924 /* trialSwap2.cpp */; };
+               A7C7DAB915DA758B0059B0CF /* sffmultiplecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7C7DAB815DA758B0059B0CF /* sffmultiplecommand.cpp */; };
                 A7D755DA1535F679009BF21A /* treereader.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7D755D91535F679009BF21A /* treereader.cpp */; };
                 A7E0243D15B4520A00A5F046 /* sparsedistancematrix.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E0243C15B4520A00A5F046 /* sparsedistancematrix.cpp */; };
                 A7E9B88112D37EC400DA6239 /* ace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B64F12D37EC300DA6239 /* ace.cpp */; };
@@ -387,16 +400,39 @@
                 A71FE12B12EDF72400963CA7 /* mergegroupscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mergegroupscommand.cpp; sourceTree = "<group>"; };
                 A721765513BB9F7D0014DAAE /* referencedb.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = referencedb.h; sourceTree = "<group>"; };
                 A721765613BB9F7D0014DAAE /* referencedb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = referencedb.cpp; sourceTree = "<group>"; };
+               A721AB66161C570F009860A1 /* alignnode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignnode.cpp; sourceTree = "<group>"; };
+               A721AB67161C570F009860A1 /* alignnode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = alignnode.h; sourceTree = "<group>"; };
+               A721AB68161C570F009860A1 /* aligntree.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = aligntree.cpp; sourceTree = "<group>"; };
+               A721AB69161C570F009860A1 /* aligntree.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = aligntree.h; sourceTree = "<group>"; };
+               A721AB6D161C572A009860A1 /* kmernode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = kmernode.cpp; sourceTree = "<group>"; };
+               A721AB6E161C572A009860A1 /* kmernode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kmernode.h; sourceTree = "<group>"; };
+               A721AB6F161C572A009860A1 /* kmertree.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = kmertree.cpp; sourceTree = "<group>"; };
+               A721AB70161C572A009860A1 /* kmertree.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kmertree.h; sourceTree = "<group>"; };
+               A721AB73161C573B009860A1 /* taxonomynode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = taxonomynode.cpp; sourceTree = "<group>"; };
+               A721AB74161C573B009860A1 /* taxonomynode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = taxonomynode.h; sourceTree = "<group>"; };
                 A724D2B4153C8600000A826F /* makebiomcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = makebiomcommand.h; sourceTree = "<group>"; };
                 A724D2B6153C8628000A826F /* makebiomcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = makebiomcommand.cpp; sourceTree = "<group>"; };
                 A727864212E9E28C00F86ABA /* removerarecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = removerarecommand.h; sourceTree = "<group>"; };
                 A727864312E9E28C00F86ABA /* removerarecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = removerarecommand.cpp; sourceTree = "<group>"; };
+               A7386C1B1619CACB00651424 /* abstractdecisiontree.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = abstractdecisiontree.hpp; sourceTree = "<group>"; };
+               A7386C1C1619CACB00651424 /* abstractrandomforest.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = abstractrandomforest.hpp; sourceTree = "<group>"; };
+               A7386C1D1619CACB00651424 /* decisiontree.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = decisiontree.hpp; sourceTree = "<group>"; };
+               A7386C1E1619CACB00651424 /* macros.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = macros.h; sourceTree = "<group>"; };
+               A7386C1F1619CACB00651424 /* randomforest.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = randomforest.hpp; sourceTree = "<group>"; };
+               A7386C201619CACB00651424 /* rftreenode.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = rftreenode.hpp; sourceTree = "<group>"; };
+               A7386C211619CCE600651424 /* classifysharedcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = classifysharedcommand.cpp; sourceTree = "<group>"; };
+               A7386C221619CCE600651424 /* classifysharedcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = classifysharedcommand.h; sourceTree = "<group>"; };
+               A7386C241619E52200651424 /* abstractdecisiontree.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = abstractdecisiontree.cpp; sourceTree = "<group>"; };
+               A7386C26161A0F9C00651424 /* abstractrandomforest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = abstractrandomforest.cpp; sourceTree = "<group>"; };
+               A7386C28161A110700651424 /* decisiontree.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decisiontree.cpp; sourceTree = "<group>"; };
                 A73901051588C3EF00ED2ED6 /* loadlogfilecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = loadlogfilecommand.h; sourceTree = "<group>"; };
                 A73901071588C40900ED2ED6 /* loadlogfilecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = loadlogfilecommand.cpp; sourceTree = "<group>"; };
                 A73DDBB813C4A0D1006AAE38 /* clearmemorycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = clearmemorycommand.h; sourceTree = "<group>"; };
                 A73DDBB913C4A0D1006AAE38 /* clearmemorycommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = clearmemorycommand.cpp; sourceTree = "<group>"; };
                 A73DDC3613C4BF64006AAE38 /* mothurmetastats.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mothurmetastats.h; sourceTree = "<group>"; };
                 A73DDC3713C4BF64006AAE38 /* mothurmetastats.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mothurmetastats.cpp; sourceTree = "<group>"; };
+               A741FAD115D1688E0067BCC5 /* sequencecountparser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sequencecountparser.cpp; sourceTree = "<group>"; };
+               A741FAD415D168A00067BCC5 /* sequencecountparser.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sequencecountparser.h; sourceTree = "<group>"; };
                 A74A9A9D148E881E00AB5E3E /* spline.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = spline.h; sourceTree = "<group>"; };
                 A74A9A9E148E881E00AB5E3E /* spline.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = spline.cpp; sourceTree = "<group>"; };
                 A74D36B6137DAFAA00332B0C /* chimerauchimecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerauchimecommand.h; sourceTree = "<group>"; };
@@ -421,6 +457,8 @@
                 A778FE6A134CA6CA00C0BA33 /* getcommandinfocommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getcommandinfocommand.cpp; sourceTree = "<group>"; };
                 A77A221D139001B600B0BE70 /* deuniquetreecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = deuniquetreecommand.h; sourceTree = "<group>"; };
                 A77A221E139001B600B0BE70 /* deuniquetreecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = deuniquetreecommand.cpp; sourceTree = "<group>"; };
+               A77E1937161B201E00DB1A2A /* randomforest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = randomforest.cpp; sourceTree = "<group>"; };
+               A77E193A161B289600DB1A2A /* rftreenode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = rftreenode.cpp; sourceTree = "<group>"; };
                 A77EBD2C1523707F00ED407C /* createdatabasecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = createdatabasecommand.h; sourceTree = "<group>"; };
                 A77EBD2E1523709100ED407C /* createdatabasecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = createdatabasecommand.cpp; sourceTree = "<group>"; };
                 A7876A25152A017C00A0AE86 /* subsample.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = subsample.cpp; sourceTree = "<group>"; };
@@ -451,6 +489,8 @@
                 A7C3DC0A14FE457500FE1924 /* cooccurrencecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = cooccurrencecommand.h; sourceTree = "<group>"; };
                 A7C3DC0D14FE469500FE1924 /* trialSwap2.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = trialSwap2.cpp; sourceTree = "<group>"; };
                 A7C3DC0E14FE469500FE1924 /* trialswap2.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = trialswap2.h; sourceTree = "<group>"; };
+               A7C7DAB615DA75760059B0CF /* sffmultiplecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sffmultiplecommand.h; sourceTree = "<group>"; };
+               A7C7DAB815DA758B0059B0CF /* sffmultiplecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sffmultiplecommand.cpp; sourceTree = "<group>"; };
                 A7D755D71535F665009BF21A /* treereader.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = treereader.h; sourceTree = "<group>"; };
                 A7D755D91535F679009BF21A /* treereader.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = treereader.cpp; sourceTree = "<group>"; };
                 A7DAAFA3133A254E003956EB /* commandparameter.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = commandparameter.h; sourceTree = "<group>"; };
@@ -1126,6 +1166,7 @@
                                 A7E9B79B12D37EC400DA6239 /* progress.cpp */,
                                 A7E9B79C12D37EC400DA6239 /* progress.hpp */,
                                 A7E9B7A512D37EC400DA6239 /* rarecalc.cpp */,
+                               A7386C191619C9FB00651424 /* randomforest */,
                                 A7E9B7A612D37EC400DA6239 /* rarecalc.h */,
                                 A7E9B7A712D37EC400DA6239 /* raredisplay.cpp */,
                                 A7E9B7A812D37EC400DA6239 /* raredisplay.h */,
@@ -1167,6 +1208,24 @@
                         name = Products;
                         sourceTree = "<group>";
                 };
+               A7386C191619C9FB00651424 /* randomforest */ = {
+                       isa = PBXGroup;
+                       children = (
+                               A7386C1B1619CACB00651424 /* abstractdecisiontree.hpp */,
+                               A7386C241619E52200651424 /* abstractdecisiontree.cpp */,
+                               A7386C1C1619CACB00651424 /* abstractrandomforest.hpp */,
+                               A7386C26161A0F9C00651424 /* abstractrandomforest.cpp */,
+                               A7386C1D1619CACB00651424 /* decisiontree.hpp */,
+                               A7386C28161A110700651424 /* decisiontree.cpp */,
+                               A7386C1E1619CACB00651424 /* macros.h */,
+                               A7386C1F1619CACB00651424 /* randomforest.hpp */,
+                               A77E1937161B201E00DB1A2A /* randomforest.cpp */,
+                               A7386C201619CACB00651424 /* rftreenode.hpp */,
+                               A77E193A161B289600DB1A2A /* rftreenode.cpp */,
+                       );
+                       name = randomforest;
+                       sourceTree = "<group>";
+               };
                 A7D161E7149F7F50000523E8 /* fortran */ = {
                         isa = PBXGroup;
                         children = (
@@ -1223,6 +1282,8 @@
                                 A7E9B69012D37EC400DA6239 /* classifyotucommand.cpp */,
                                 A7E9B69312D37EC400DA6239 /* classifyseqscommand.h */,
                                 A7E9B69212D37EC400DA6239 /* classifyseqscommand.cpp */,
+                               A7386C221619CCE600651424 /* classifysharedcommand.h */,
+                               A7386C211619CCE600651424 /* classifysharedcommand.cpp */,
                                 A7EEB0F714F29C1B00344B83 /* classifytreecommand.h */,
                                 A7EEB0F414F29BFD00344B83 /* classifytreecommand.cpp */,
                                 A7E9B69712D37EC400DA6239 /* clearcutcommand.h */,
@@ -1407,6 +1468,8 @@
                                 A7E9B7E112D37EC400DA6239 /* setlogfilecommand.cpp */,
                                 A7E9B7E412D37EC400DA6239 /* sffinfocommand.h */,
                                 A7E9B7E312D37EC400DA6239 /* sffinfocommand.cpp */,
+                               A7C7DAB615DA75760059B0CF /* sffmultiplecommand.h */,
+                               A7C7DAB815DA758B0059B0CF /* sffmultiplecommand.cpp */,
                                 A7E9B7F312D37EC400DA6239 /* sharedcommand.h */,
                                 A7E9B7F212D37EC400DA6239 /* sharedcommand.cpp */,
                                 A7E9B82812D37EC400DA6239 /* shhhercommand.h */,
@@ -1659,6 +1722,8 @@
                                 A7E9B7D012D37EC400DA6239 /* sabundvector.hpp */,
                                 A7E9B7DB12D37EC400DA6239 /* sequence.cpp */,
                                 A7E9B7DC12D37EC400DA6239 /* sequence.hpp */,
+                               A741FAD415D168A00067BCC5 /* sequencecountparser.h */,
+                               A741FAD115D1688E0067BCC5 /* sequencecountparser.cpp */,
                                 A7E9B7DD12D37EC400DA6239 /* sequencedb.cpp */,
                                 A7E9B7DE12D37EC400DA6239 /* sequencedb.h */,
                                 A7F9F5CD141A5E500032F693 /* sequenceparser.h */,
@@ -1725,10 +1790,18 @@
                 A7E9BA4B12D3966900DA6239 /* classifier */ = {
                         isa = PBXGroup;
                         children = (
-                               A7E9B65A12D37EC300DA6239 /* bayesian.cpp */,
+                               A721AB67161C570F009860A1 /* alignnode.h */,
+                               A721AB66161C570F009860A1 /* alignnode.cpp */,
+                               A721AB69161C570F009860A1 /* aligntree.h */,
+                               A721AB68161C570F009860A1 /* aligntree.cpp */,
                                 A7E9B65B12D37EC300DA6239 /* bayesian.h */,
+                               A7E9B65A12D37EC300DA6239 /* bayesian.cpp */,
                                 A7E9B68E12D37EC400DA6239 /* classify.cpp */,
                                 A7E9B68F12D37EC400DA6239 /* classify.h */,
+                               A721AB6E161C572A009860A1 /* kmernode.h */,
+                               A721AB6D161C572A009860A1 /* kmernode.cpp */,
+                               A721AB70161C572A009860A1 /* kmertree.h */,
+                               A721AB6F161C572A009860A1 /* kmertree.cpp */,
                                 A7E9B73812D37EC400DA6239 /* knn.h */,
                                 A7E9B73712D37EC400DA6239 /* knn.cpp */,
                                 A7E9B78D12D37EC400DA6239 /* phylosummary.cpp */,
@@ -1737,6 +1810,8 @@
                                 A7E9B79012D37EC400DA6239 /* phylotree.h */,
                                 A7E9B85D12D37EC400DA6239 /* taxonomyequalizer.cpp */,
                                 A7E9B85E12D37EC400DA6239 /* taxonomyequalizer.h */,
+                               A721AB74161C573B009860A1 /* taxonomynode.h */,
+                               A721AB73161C573B009860A1 /* taxonomynode.cpp */,
                         );
                         name = classifier;
                         sourceTree = "<group>";
@@ -2192,6 +2267,19 @@
                                 A73901081588C40900ED2ED6 /* loadlogfilecommand.cpp in Sources */,
                                 A74D59A4159A1E2000043046 /* counttable.cpp in Sources */,
                                 A7E0243D15B4520A00A5F046 /* sparsedistancematrix.cpp in Sources */,
+                               A741FAD215D1688E0067BCC5 /* sequencecountparser.cpp in Sources */,
+                               A7C7DAB915DA758B0059B0CF /* sffmultiplecommand.cpp in Sources */,
+                               A7386C231619CCE600651424 /* classifysharedcommand.cpp in Sources */,
+                               A7386C251619E52300651424 /* abstractdecisiontree.cpp in Sources */,
+                               A7386C27161A0F9D00651424 /* abstractrandomforest.cpp in Sources */,
+                               A7386C29161A110800651424 /* decisiontree.cpp in Sources */,
+                               A77E1938161B201E00DB1A2A /* randomforest.cpp in Sources */,
+                               A77E193B161B289600DB1A2A /* rftreenode.cpp in Sources */,
+                               A721AB6A161C570F009860A1 /* alignnode.cpp in Sources */,
+                               A721AB6B161C570F009860A1 /* aligntree.cpp in Sources */,
+                               A721AB71161C572A009860A1 /* kmernode.cpp in Sources */,
+                               A721AB72161C572A009860A1 /* kmertree.cpp in Sources */,
+                               A721AB77161C573B009860A1 /* taxonomynode.cpp in Sources */,
                         );
                         runOnlyForDeploymentPostprocessing = 0;
                 };
@@ -2275,8 +2363,8 @@
                                 GCC_MODEL_TUNING = "";
                                 GCC_OPTIMIZATION_LEVEL = 3;
                                 GCC_PREPROCESSOR_DEFINITIONS = (
-                                       "VERSION=\"\\\"1.26.0\\\"\"",
-                                       "RELEASE_DATE=\"\\\"7/9/2012\\\"\"",
+                                       "VERSION=\"\\\"1.27.0\\\"\"",
+                                       "RELEASE_DATE=\"\\\"8/8/2012\\\"\"",
                                 );
                                 GCC_WARN_ABOUT_MISSING_NEWLINE = YES;
                                 GCC_WARN_ABOUT_RETURN_TYPE = YES;
diff --git a/abstractdecisiontree.cpp b/abstractdecisiontree.cpp

new file mode 100644 (file)

index 0000000..085cd31
--- /dev/null
+++ b/abstractdecisiontree.cpp
@@ -0,0 +1,285 @@
+//
+//  abstractdecisiontree.cpp
+//  Mothur
+//
+//  Created by Sarah Westcott on 10/1/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "abstractdecisiontree.hpp"
+
+/**************************************************************************************************/
+
+AbstractDecisionTree::AbstractDecisionTree(vector<vector<int> >baseDataSet, 
+                     vector<int> globalDiscardedFeatureIndices, 
+                     OptimumFeatureSubsetSelector optimumFeatureSubsetSelector, 
+                     string treeSplitCriterion) : baseDataSet(baseDataSet),
+numSamples((int)baseDataSet.size()),
+numFeatures((int)(baseDataSet[0].size() - 1)),
+numOutputClasses(0),
+rootNode(NULL),
+globalDiscardedFeatureIndices(globalDiscardedFeatureIndices),
+optimumFeatureSubsetSize(optimumFeatureSubsetSelector.getOptimumFeatureSubsetSize(numFeatures)),
+treeSplitCriterion(treeSplitCriterion) {
+
+    try {
+    // TODO: istead of calculating this for every DecisionTree
+    // clacualte this once in the RandomForest class and pass the values
+    m = MothurOut::getInstance();
+    for (int i = 0;  i < numSamples; i++) {
+        if (m->control_pressed) { break; }
+        int outcome = baseDataSet[i][numFeatures];
+        vector<int>::iterator it = find(outputClasses.begin(), outputClasses.end(), outcome);
+        if (it == outputClasses.end()){       // find() will return classes.end() if the element is not found
+            outputClasses.push_back(outcome);
+            numOutputClasses++;
+        }
+    }
+    
+    if (m->debug) {
+        //m->mothurOut("outputClasses = " + toStringVectorInt(outputClasses));
+        m->mothurOut("numOutputClasses = " + toString(numOutputClasses) + '\n');
+    }
+
+    }
+       catch(exception& e) {
+               m->errorOut(e, "AbstractDecisionTree", "AbstractDecisionTree");
+               exit(1);
+       } 
+}
+/**************************************************************************************************/
+int AbstractDecisionTree::createBootStrappedSamples(){
+    try {    
+    vector<bool> isInTrainingSamples(numSamples, false);
+    
+    for (int i = 0; i < numSamples; i++) {
+        if (m->control_pressed) { return 0; }
+        // TODO: optimize the rand() function call + double check if it's working properly
+        int randomIndex = rand() % numSamples;
+        bootstrappedTrainingSamples.push_back(baseDataSet[randomIndex]);
+        isInTrainingSamples[randomIndex] = true;
+    }
+    
+    for (int i = 0; i < numSamples; i++) {
+        if (m->control_pressed) { return 0; }
+        if (isInTrainingSamples[i]){ bootstrappedTrainingSampleIndices.push_back(i); }
+        else{
+            bootstrappedTestSamples.push_back(baseDataSet[i]);
+            bootstrappedTestSampleIndices.push_back(i);
+        }
+    }
+    
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "AbstractDecisionTree", "createBootStrappedSamples");
+               exit(1);
+       } 
+}
+/**************************************************************************************************/
+int AbstractDecisionTree::getMinEntropyOfFeature(vector<int> featureVector, vector<int> outputVector, double& minEntropy, int& featureSplitValue, double& intrinsicValue){
+    try {
+
+        vector< vector<int> > featureOutputPair(featureVector.size(), vector<int>(2, 0));
+        for (int i = 0; i < featureVector.size(); i++) { 
+            if (m->control_pressed) { return 0; }
+            featureOutputPair[i][0] = featureVector[i];
+            featureOutputPair[i][1] = outputVector[i];
+        }
+        // TODO: using default behavior to sort(), need to specify the comparator for added safety and compiler portability
+        sort(featureOutputPair.begin(), featureOutputPair.end());
+        
+        
+        vector<int> splitPoints;
+        vector<int> uniqueFeatureValues(1, featureOutputPair[0][0]);
+        
+        for (int i = 0; i < featureOutputPair.size(); i++) {
+            if (m->control_pressed) { return 0; }
+            int featureValue = featureOutputPair[i][0];
+            vector<int>::iterator it = find(uniqueFeatureValues.begin(), uniqueFeatureValues.end(), featureValue);
+            if (it == uniqueFeatureValues.end()){                 // NOT FOUND
+                uniqueFeatureValues.push_back(featureValue);
+                splitPoints.push_back(i);
+            }
+        }
+        
+
+        
+        int bestSplitIndex = -1;
+        if (splitPoints.size() == 0){
+            // TODO: trying out C++'s infitinity, don't know if this will work properly
+            // TODO: check the caller function of this function, there check the value if minEntropy and comapre to inf
+            // so that no wrong calculation is done
+            minEntropy = numeric_limits<double>::infinity();                          // OUTPUT
+            intrinsicValue = numeric_limits<double>::infinity();                      // OUTPUT
+            featureSplitValue = -1;                                                   // OUTPUT
+        }else{
+            getBestSplitAndMinEntropy(featureOutputPair, splitPoints, minEntropy, bestSplitIndex, intrinsicValue);  // OUTPUT
+            featureSplitValue = featureOutputPair[splitPoints[bestSplitIndex]][0];    // OUTPUT
+        }
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "AbstractDecisionTree", "getMinEntropyOfFeature");
+               exit(1);
+       } 
+}
+/**************************************************************************************************/
+double AbstractDecisionTree::calcIntrinsicValue(int numLessThanValueAtSplitPoint, int numGreaterThanValueAtSplitPoint, int numSamples) {
+    try {
+        double upperSplitEntropy = 0.0, lowerSplitEntropy = 0.0;
+        if (numLessThanValueAtSplitPoint > 0) {
+            upperSplitEntropy = numLessThanValueAtSplitPoint * log2((double) numLessThanValueAtSplitPoint / (double) numSamples);
+        }
+        
+        if (numGreaterThanValueAtSplitPoint > 0) {
+            lowerSplitEntropy = numGreaterThanValueAtSplitPoint * log2((double) numGreaterThanValueAtSplitPoint / (double) numSamples);
+        }
+        
+        double intrinsicValue = - ((double)(upperSplitEntropy + lowerSplitEntropy) / (double)numSamples);
+        return intrinsicValue;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "AbstractDecisionTree", "calcIntrinsicValue");
+               exit(1);
+       } 
+}
+/**************************************************************************************************/
+int AbstractDecisionTree::getBestSplitAndMinEntropy(vector< vector<int> > featureOutputPairs, vector<int> splitPoints,
+                               double& minEntropy, int& minEntropyIndex, double& relatedIntrinsicValue){
+    try {
+        
+        int numSamples = (int)featureOutputPairs.size();
+        vector<double> entropies;
+        vector<double> intrinsicValues;
+        
+        for (int i = 0; i < splitPoints.size(); i++) {
+             if (m->control_pressed) { return 0; }
+            int index = splitPoints[i];
+            int valueAtSplitPoint = featureOutputPairs[index][0];
+            int numLessThanValueAtSplitPoint = 0;
+            int numGreaterThanValueAtSplitPoint = 0;
+            
+            for (int j = 0; j < featureOutputPairs.size(); j++) {
+                 if (m->control_pressed) { return 0; }
+                vector<int> record = featureOutputPairs[j];
+                if (record[0] < valueAtSplitPoint){ numLessThanValueAtSplitPoint++; }
+                else{ numGreaterThanValueAtSplitPoint++; }
+            }
+            
+            double upperEntropyOfSplit = calcSplitEntropy(featureOutputPairs, index, numOutputClasses, true);
+            double lowerEntropyOfSplit = calcSplitEntropy(featureOutputPairs, index, numOutputClasses, false);
+            
+            double totalEntropy = (numLessThanValueAtSplitPoint * upperEntropyOfSplit + numGreaterThanValueAtSplitPoint * lowerEntropyOfSplit) / (double)numSamples;
+            double intrinsicValue = calcIntrinsicValue(numLessThanValueAtSplitPoint, numGreaterThanValueAtSplitPoint, numSamples);
+            entropies.push_back(totalEntropy);
+            intrinsicValues.push_back(intrinsicValue);
+            
+        }
+                
+        // set output values
+        vector<double>::iterator it = min_element(entropies.begin(), entropies.end());
+        minEntropy = *it;                                                         // OUTPUT
+        minEntropyIndex = (int)(it - entropies.begin());                          // OUTPUT
+        relatedIntrinsicValue = intrinsicValues[minEntropyIndex];                 // OUTPUT
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "AbstractDecisionTree", "getBestSplitAndMinEntropy");
+               exit(1);
+       } 
+}
+/**************************************************************************************************/
+
+double AbstractDecisionTree::calcSplitEntropy(vector< vector<int> > featureOutputPairs, int splitIndex, int numOutputClasses, bool isUpperSplit = true) {
+    try {
+        vector<int> classCounts(numOutputClasses, 0);
+        
+        if (isUpperSplit) { 
+            for (int i = 0; i < splitIndex; i++) { 
+                if (m->control_pressed) { return 0; }
+                classCounts[featureOutputPairs[i][1]]++; 
+            }
+        } else {
+            for (int i = splitIndex; i < featureOutputPairs.size(); i++) { 
+                if (m->control_pressed) { return 0; }
+                classCounts[featureOutputPairs[i][1]]++; 
+            }
+        }
+        
+        int totalClassCounts = accumulate(classCounts.begin(), classCounts.end(), 0);
+        
+        double splitEntropy = 0.0;
+        
+        for (int i = 0; i < classCounts.size(); i++) {
+            if (m->control_pressed) { return 0; }
+            if (classCounts[i] == 0) { continue; }
+            double probability = (double) classCounts[i] / (double) totalClassCounts;
+            splitEntropy += -(probability * log2(probability));
+        }
+        
+        return splitEntropy;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "AbstractDecisionTree", "calcSplitEntropy");
+               exit(1);
+       } 
+}
+
+/**************************************************************************************************/
+
+int AbstractDecisionTree::getSplitPopulation(RFTreeNode* node, vector< vector<int> >& leftChildSamples, vector< vector<int> >& rightChildSamples){    
+    try {
+        // TODO: there is a possibility of optimization if we can recycle the samples in each nodes
+        // we just need to pointers to the samples i.e. vector<int> and use it everywhere and not create the sample 
+        // sample over and over again
+        // we need to make this const so that it is not modified by all the function calling
+        // currently purgeTreeNodesDataRecursively() is used for the same purpose, but this can be avoided altogher
+        // if re-using the same data over the classes
+        
+        int splitFeatureGlobalIndex = node->getSplitFeatureIndex();
+        
+        for (int i = 0; i < node->getBootstrappedTrainingSamples().size(); i++) {
+            if (m->control_pressed) { return 0; }
+            vector<int> sample =  node->getBootstrappedTrainingSamples()[i];
+            if (m->control_pressed) { return 0; }
+            if (sample[splitFeatureGlobalIndex] < node->getSplitFeatureValue()){ leftChildSamples.push_back(sample); }
+            else{ rightChildSamples.push_back(sample); }
+        }
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "AbstractDecisionTree", "getSplitPopulation");
+               exit(1);
+       } 
+}
+/**************************************************************************************************/
+// TODO: checkIfAlreadyClassified() verify code
+// TODO: use bootstrappedOutputVector for easier calculation instead of using getBootstrappedTrainingSamples()
+bool AbstractDecisionTree::checkIfAlreadyClassified(RFTreeNode* treeNode, int& outputClass) {
+    try {
+
+        vector<int> tempOutputClasses;
+        for (int i = 0; i < treeNode->getBootstrappedTrainingSamples().size(); i++) {
+            if (m->control_pressed) { return 0; }
+            int sampleOutputClass = treeNode->getBootstrappedTrainingSamples()[i][numFeatures];
+            vector<int>::iterator it = find(tempOutputClasses.begin(), tempOutputClasses.end(), sampleOutputClass);
+            if (it == tempOutputClasses.end()) {               // NOT FOUND
+                tempOutputClasses.push_back(sampleOutputClass);
+            }
+        }
+        
+        if (tempOutputClasses.size() < 2) { outputClass = tempOutputClasses[0]; return true; }
+        else { outputClass = -1; return false; }
+        
+    }
+       catch(exception& e) {
+               m->errorOut(e, "AbstractDecisionTree", "checkIfAlreadyClassified");
+               exit(1);
+       } 
+}
+
+/**************************************************************************************************/
diff --git a/abstractdecisiontree.hpp b/abstractdecisiontree.hpp

new file mode 100755 (executable)

index 0000000..3445db4
--- /dev/null
+++ b/abstractdecisiontree.hpp
@@ -0,0 +1,63 @@
+//
+//  abstractdecisiontree.hpp
+//  rrf-fs-prototype
+//
+//  Created by Abu Zaher Faridee on 7/22/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#ifndef rrf_fs_prototype_abstractdecisiontree_hpp
+#define rrf_fs_prototype_abstractdecisiontree_hpp
+
+#include "mothurout.h"
+#include "macros.h"
+#include "rftreenode.hpp"
+
+#define DEBUG_MODE
+
+/**************************************************************************************************/
+
+class AbstractDecisionTree{
+    // Base class holding the data members and the entropy / split helper functions of a decision tree.
+public:
+  
+    AbstractDecisionTree(vector<vector<int> >baseDataSet, 
+                       vector<int> globalDiscardedFeatureIndices, 
+                       OptimumFeatureSubsetSelector optimumFeatureSubsetSelector, 
+                       string treeSplitCriterion);    
+    virtual ~AbstractDecisionTree(){}
+    
+  
+protected:
+    // All helpers below are virtual.
+    virtual int createBootStrappedSamples();
+    virtual int getMinEntropyOfFeature(vector<int> featureVector, vector<int> outputVector, double& minEntropy, int& featureSplitValue, double& intrinsicValue);   // results come back through the three reference parameters
+    virtual int getBestSplitAndMinEntropy(vector< vector<int> > featureOutputPairs, vector<int> splitPoints, double& minEntropy, int& minEntropyIndex, double& relatedIntrinsicValue);   // picks the split point with the lowest weighted entropy
+    virtual double calcIntrinsicValue(int numLessThanValueAtSplitPoint, int numGreaterThanValueAtSplitPoint, int numSamples);
+    virtual double calcSplitEntropy(vector< vector<int> > featureOutputPairs, int splitIndex, int numOutputClasses, bool);   // bool: true = rows before splitIndex, false = rows from splitIndex onwards
+    virtual int getSplitPopulation(RFTreeNode* node, vector< vector<int> >& leftChildSamples, vector< vector<int> >& rightChildSamples);   // splits a node's samples by its split feature value
+    virtual bool checkIfAlreadyClassified(RFTreeNode* treeNode, int& outputClass);   // true when all samples of the node share one output class
+
+    vector< vector<int> > baseDataSet;
+    int numSamples;
+    int numFeatures;          // also used as the column index of the output class within a sample row
+    int numOutputClasses;     // size of the per-class counter table built in calcSplitEntropy()
+    vector<int> outputClasses;
+    vector< vector<int> > bootstrappedTrainingSamples;
+    vector<int> bootstrappedTrainingSampleIndices;
+    vector< vector<int> > bootstrappedTestSamples;
+    vector<int> bootstrappedTestSampleIndices;
+    
+    RFTreeNode* rootNode;
+    vector<int> globalDiscardedFeatureIndices;
+    int optimumFeatureSubsetSize;
+    string treeSplitCriterion;
+    MothurOut* m;             // used for the control_pressed checks and for errorOut() reporting
+  
+private:
+    
+  
+};
+/**************************************************************************************************/
+
+#endif
diff --git a/abstractrandomforest.cpp b/abstractrandomforest.cpp

new file mode 100644 (file)

index 0000000..ae60b77
--- /dev/null
+++ b/abstractrandomforest.cpp
@@ -0,0 +1,58 @@
+//
+//  abstractrandomforest.cpp
+//  Mothur
+//
+//  Created by Sarah Westcott on 10/1/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "abstractrandomforest.hpp"
+
+/***********************************************************************/
+// Sets up the forest-wide state for a data set in which every row is one sample.
+//   dataSet            : the samples; the last column of a row is not counted as a feature
+//   numDecisionTrees   : stored as given
+//   treeSplitCriterion : stored as given, defaults to "informationGain"
+// An empty data set (or an empty first row) yields numFeatures == 0 instead of reading dataSet[0] out of range.
+AbstractRandomForest::AbstractRandomForest(const std::vector < std::vector<int> > dataSet, 
+                     const int numDecisionTrees, 
+                     const string treeSplitCriterion = "informationGain")
+// Initializers are listed in member declaration order, which is the order they actually run in.
+: numDecisionTrees(numDecisionTrees),
+numSamples((int)dataSet.size()),
+numFeatures((dataSet.empty() || dataSet[0].empty()) ? 0 : (int)(dataSet[0].size() - 1)),
+dataSet(dataSet), 
+globalVariableImportanceList(numFeatures, 0),
+treeSplitCriterion(treeSplitCriterion) {
+    m = MothurOut::getInstance();
+    // getGlobalDiscardedFeatureIndices() dereferences m, so it must run only after m has been assigned;
+    // calling it from the initializer list would use m while it is still uninitialized
+    globalDiscardedFeatureIndices = getGlobalDiscardedFeatureIndices();
+    // TODO: double check if the implementation of 'globalOutOfBagEstimates' is correct
+}
+
+/***********************************************************************/
+
+vector<int> AbstractRandomForest::getGlobalDiscardedFeatureIndices() {
+    try {
+        vector<int> globalDiscardedFeatureIndices;
+        
+        // calculate feature vectors
+        vector< vector<int> > featureVectors(numFeatures, vector<int>(numSamples, 0));
+        for (int i = 0; i < numSamples; i++) {
+            if (m->control_pressed) { return globalDiscardedFeatureIndices; }
+            for (int j = 0; j < numFeatures; j++) { featureVectors[j][i] = dataSet[i][j]; }
+        }
+        
+        for (int i = 0; i < featureVectors.size(); i++) {
+            if (m->control_pressed) { return globalDiscardedFeatureIndices; }
+            double standardDeviation = m->getStandardDeviation(featureVectors[i]);
+            if (standardDeviation <= 0){ globalDiscardedFeatureIndices.push_back(i); }
+        }
+        
+        if (m->debug) {
+            m->mothurOut("number of global discarded features:  " + toString(globalDiscardedFeatureIndices.size())+ "\n");
+            m->mothurOut("total features: " + toString(featureVectors.size())+ "\n");
+        }
+        
+        return globalDiscardedFeatureIndices;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "AbstractRandomForest", "getGlobalDiscardedFeatureIndices");
+               exit(1);
+       } 
+}
+
+/***********************************************************************/
+\ No newline at end of file
diff --git a/abstractrandomforest.hpp b/abstractrandomforest.hpp

new file mode 100755 (executable)

index 0000000..3be91b9
--- /dev/null
+++ b/abstractrandomforest.hpp
@@ -0,0 +1,67 @@
+//
+//  abstractrandomforest.hpp
+//  rrf-fs-prototype
+//
+//  Created by Abu Zaher Faridee on 7/20/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#ifndef rrf_fs_prototype_abstractrandomforest_hpp
+#define rrf_fs_prototype_abstractrandomforest_hpp
+
+#include "mothurout.h"
+#include "macros.h"
+#include "abstractdecisiontree.hpp"
+
+#define DEBUG_MODE
+
+/***********************************************************************/
+
+class AbstractRandomForest{
+public:
+    // initialization with vectors; the trailing string is the tree split criterion
+    AbstractRandomForest(const std::vector < std::vector<int> > dataSet, 
+                       const int numDecisionTrees, 
+                       const string);
+    virtual ~AbstractRandomForest(){ }
+    virtual int populateDecisionTrees() = 0;
+    virtual int calcForrestErrorRate() = 0;
+    virtual int calcForrestVariableImportance(string) = 0;
+ 
+/***********************************************************************/
+  
+protected:
+  
+    // TODO: create a better way of discarding features.
+    // Currently we just set FEATURE_DISCARD_SD_THRESHOLD to 0 to solve this;
+    // it can be tuned for better selection.
+    // Also, there might be other factors such as the mean.
+    // The same would apply to createLocalDiscardedFeatureList in the TreeNode class.
+  
+    // TODO: Another idea is to build an aggregated list of discarded feature indices after the run by combining
+    // the local discarded feature indices.
+    // This would penalize a feature even if it looks quite good in the global space;
+    // the penalization would be averaged, so this would be unlikely to create a local optimum.
+    
+    vector<int> getGlobalDiscardedFeatureIndices();   // indices of features whose standard deviation is <= 0; dereferences m
+    
+    int numDecisionTrees;
+    int numSamples;      // number of rows in dataSet
+    int numFeatures;     // number of columns of a dataSet row minus one
+    vector< vector<int> > dataSet;
+    vector<int> globalDiscardedFeatureIndices;
+    vector<double> globalVariableImportanceList;
+    string treeSplitCriterion;
+    // This is a map from each feature to the outcome count of each class,
+    // e.g. 1 => [2 7] means feature 1 has 2 outcomes of class 0 and 7 outcomes of class 1
+    map<int, vector<int> > globalOutOfBagEstimates;
+    
+    // TODO: fix this, do we use pointers?
+    vector<AbstractDecisionTree*> decisionTrees;
+    
+    MothurOut* m;        // declared last, so it is the last member to be initialized during construction
+  
+private:
+
+};
+#endif
diff --git a/aligncommand.cpp b/aligncommand.cpp

index a68fbfcba365207d7847c7eeaff12d9e4a4914e2..efc8ce489e79ac5e1d17084f5e952b39590894ba 100644 (file)
--- a/aligncommand.cpp
+++ b/aligncommand.cpp
@@ -572,7 +572,6 @@ int AlignCommand::driver(linePair* filePos, string alignFName, string reportFNam
                                 if (candidateSeq->getUnaligned().length() > alignment->getnRows()) {
                                         alignment->resize(candidateSeq->getUnaligned().length()+1);
                                 }
-                                                               
                                 Sequence temp = templateDB->findClosestSequence(candidateSeq);
                                 Sequence* templateSeq = &temp;
                                 
diff --git a/alignnode.cpp b/alignnode.cpp

new file mode 100755 (executable)

index 0000000..ccd8fb0
--- /dev/null
+++ b/alignnode.cpp
@@ -0,0 +1,257 @@
+/*
+ *  alignNode.cpp
+ *  bayesian
+ *
+ *  Created by Pat Schloss on 10/11/11.
+ *  Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "alignNode.h"
+#include "taxonomynode.h"
+
+#include "bayesian.h"
+
+/**************************************************************************************************/
+
+// Forwards n and l to TaxonomyNode; alignLength starts at 0 until loadSequence() or addThetas() sets it.
+AlignNode::AlignNode(string n, int l) : TaxonomyNode(n, l), alignLength(0) {
+}
+
+/**************************************************************************************************/
+
+void AlignNode::printTheta(){
+    try {
+        m->mothurOut("A:\t"); for(int i=0;i<alignLength;i++){  m->mothurOut(toString(theta[i].A)+ '\t');               }       m->mothurOutEndLine();
+        m->mothurOut("T:\t"); for(int i=0;i<alignLength;i++){  m->mothurOut(toString(theta[i].T)+ '\t');               }       m->mothurOutEndLine();
+        m->mothurOut("G:\t"); for(int i=0;i<alignLength;i++){  m->mothurOut(toString(theta[i].G)+ '\t');               }       m->mothurOutEndLine();
+        m->mothurOut("C:\t"); for(int i=0;i<alignLength;i++){  m->mothurOut(toString(theta[i].C)+ '\t');               }       m->mothurOutEndLine();
+        m->mothurOut("I:\t"); for(int i=0;i<alignLength;i++){  m->mothurOut(toString(theta[i].gap)+ '\t');             }       m->mothurOutEndLine();
+    }
+       catch(exception& e) {
+               m->errorOut(e, "AlignNode", "printTheta");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+int AlignNode::loadSequence(string& sequence){
+       try {
+        alignLength = (int)sequence.length();  //      this function runs through the alignment and increments the frequency
+        //     of each base for a particular taxon.  we are building the thetas
+        
+        if(theta.size() == 0){
+            theta.resize(alignLength);
+            columnCounts.resize(alignLength, 0);
+        }
+        
+        for(int i=0;i<alignLength;i++){
+            
+            if (m->control_pressed) { return 0; }
+            
+            char base = sequence[i];
+            
+            if(base == 'A')            {       theta[i].A++;   columnCounts[i]++;              }       //      our thetas will be alignLength x 5  
+            else if(base == 'T'){      theta[i].T++;   columnCounts[i]++;              }       //      and we ignore any position that has  
+            else if(base == 'G'){      theta[i].G++;   columnCounts[i]++;              }       //      an ambiguous base call
+            else if(base == 'C'){      theta[i].C++;   columnCounts[i]++;              }
+            else if(base == '-'){      theta[i].gap++; columnCounts[i]++;              }
+            else if(base == 'U'){      theta[i].T++;   columnCounts[i]++;              }
+        }
+        
+        numSeqs++;
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "AlignNode", "loadSequence");
+               exit(1);
+       }
+}      
+
+/**************************************************************************************************/
+
+int AlignNode::checkTheta(){
+    try {
+        for(int i=0;i<alignLength;i++){
+            
+            if (m->control_pressed) { return 0; }
+            
+            if(theta[i].gap == columnCounts[i]){
+                columnCounts[i] = 0;
+            }
+            //        else{
+            //            int maxCount = theta[i].A;
+            //            
+            //            if(theta[i].T > maxCount)   {    maxCount = theta[i].T;  }
+            //            if(theta[i].G > maxCount)   {    maxCount = theta[i].T;  }
+            //            if(theta[i].C > maxCount)   {    maxCount = theta[i].T;  }
+            //            if(theta[i].gap > maxCount) {    maxCount = theta[i].T;  }
+            //        
+            //            if(maxCount < columnCounts[i] * 0.25){// || maxCount == columnCounts[i]){   //remove any column where the maximum frequency is <50%
+            //                columnCounts[i] = 0;
+            //            }
+            //        }
+            
+        }
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "AlignNode", "checkTheta");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+int AlignNode::addThetas(vector<thetaAlign> newTheta, int newNumSeqs){
+       try {
+        if(alignLength == 0){
+            alignLength = (int)newTheta.size();
+            theta.resize(alignLength);
+            columnCounts.resize(alignLength);
+        }
+        
+        for(int i=0;i<alignLength;i++){        
+            
+            if (m->control_pressed) { return 0; }
+            
+            theta[i].A += newTheta[i].A;               columnCounts[i] += newTheta[i].A;
+            theta[i].T += newTheta[i].T;               columnCounts[i] += newTheta[i].T;
+            theta[i].G += newTheta[i].G;               columnCounts[i] += newTheta[i].G;
+            theta[i].C += newTheta[i].C;               columnCounts[i] += newTheta[i].C;
+            theta[i].gap += newTheta[i].gap;   columnCounts[i] += newTheta[i].gap;
+        }
+        
+        numSeqs += newNumSeqs;
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "AlignNode", "addThetas");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+double AlignNode::getSimToConsensus(string& query){
+       try {
+        double similarity = 0;
+        
+        int length = 0;
+        
+        for(int i=0;i<alignLength;i++){
+            
+            if (m->control_pressed) { return similarity; }
+            
+            char base = query[i];
+            
+            if(base != '.' && base != 'N' && columnCounts[i] != 0){
+                
+                double fraction = 0;
+                
+                if(base == 'A'){
+                    fraction = (int) theta[i].A / (double) columnCounts[i];
+                    similarity += fraction;
+                    length++;
+                }
+                else if(base == 'T'){
+                    fraction = (int) theta[i].T / (double) columnCounts[i];
+                    similarity += fraction;
+                    length++;
+                }
+                else if(base == 'G'){
+                    fraction = (int) theta[i].G / (double) columnCounts[i];
+                    similarity += fraction;
+                    length++;
+                }
+                else if(base == 'C'){
+                    fraction = (int) theta[i].C / (double) columnCounts[i];
+                    similarity += fraction;
+                    length++;
+                }
+                else if(base == '-'){
+                    fraction = (int) theta[i].gap / (double) columnCounts[i];
+                    similarity += fraction;
+                    length++;
+                }
+            }
+        }
+        
+        if(length != 0){
+            similarity /= double(length);
+        }
+        else {
+            similarity = 0;
+        }
+        
+        return similarity;     
+        
+    }
+    catch(exception& e) {
+        m->errorOut(e, "AlignNode", "getSimToConsensus");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+double AlignNode::getPxGivenkj_D_j(string& query){     //P(x | k_j, D, j)
+       try {
+        double PxGivenkj_D_j = 0;
+        
+        int count = 0;
+        double alpha = 1 / (double)totalSeqs;  //flat prior
+        
+        
+        for(int s=0;s<alignLength;s++){
+            
+            if (m->control_pressed) { return PxGivenkj_D_j; }
+            
+            char base = query[s];
+            thetaAlign thetaS = theta[s];
+            
+            if(base != '.' && base != 'N' && columnCounts[s] != 0){
+                double Nkj_s = (double)columnCounts[s];        
+                double nkj_si = 0;
+                
+                
+                if(base == 'A')                {       nkj_si = (double)thetaS.A;              }
+                else if(base == 'T'){  nkj_si = (double)thetaS.T;              }       
+                else if(base == 'G'){  nkj_si = (double)thetaS.G;              }       
+                else if(base == 'C'){  nkj_si = (double)thetaS.C;              }       
+                else if(base == '-'){  nkj_si = (double)thetaS.gap;    }
+                else if(base == 'U'){  nkj_si = (double)thetaS.T;              }       
+                
+                //                     double alpha = pow(0.2, double(Nkj_s)) + 0.0001;        //need to make 1e-4 a variable in future; this is the non-flat prior
+                
+                //                     if(columnCounts[s] != nkj_si){                                          //deal only with segregating sites...
+                               double numerator = nkj_si + alpha;
+                               double denomenator = Nkj_s + 5.0 * alpha;
+                               
+                               PxGivenkj_D_j += log(numerator) - log(denomenator);             
+                               count++;
+                //                     }
+            }
+            if(base != '.' && columnCounts[s] == 0 && thetaS.gap == 0){
+                count = 0;
+                break;
+            }
+            
+        }
+        
+        if(count == 0){        PxGivenkj_D_j = -1e10;  }       
+        
+        return PxGivenkj_D_j;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "AlignNode", "getPxGivenkj_D_j");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
diff --git a/alignnode.h b/alignnode.h

new file mode 100755 (executable)

index 0000000..4aecca7
--- /dev/null
+++ b/alignnode.h
@@ -0,0 +1,49 @@
+#ifndef ALIGNNODE
+#define ALIGNNODE
+
+/*
+ *  alignNode.h
+ *  bayesian
+ *
+ *  Created by Pat Schloss on 10/11/11.
+ *  Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "taxonomynode.h"
+
+/**************************************************************************************************/
+
+struct thetaAlign {    // base counters of one alignment column
+       thetaAlign() : A(0), T(0), G(0), C(0), gap(0){} // every counter starts at zero
+       unsigned int A;
+       unsigned int T;                 // AlignNode::loadSequence() also counts 'U' here
+       unsigned int G;
+       unsigned int C;
+       unsigned int gap;               // incremented for '-'
+};
+
+/**************************************************************************************************/
+
+class AlignNode : public TaxonomyNode {
+       // TaxonomyNode that additionally keeps per-column base counts (thetas) of the aligned sequences loaded into it
+public:
+       AlignNode(string, int);                 // both arguments are passed on to TaxonomyNode
+       int loadSequence(string&);              // adds one sequence to the per-column counts
+       int checkTheta();                       // zeroes columnCounts for columns in which only gaps were counted
+    void printTheta();                      // prints the counts through m->mothurOut()
+       double getPxGivenkj_D_j(string& query); //P(x | k_j, D, j)
+       double getSimToConsensus(string& query);        // mean fraction of counts agreeing with the query base
+       vector<thetaAlign> getTheta()   {       return theta;   }       // returns a copy of the table
+       int addThetas(vector<thetaAlign>, int); // adds another table and its sequence count to this node
+       
+private:
+       vector<thetaAlign> theta;               // one set of counters per alignment column
+       vector<unsigned int> columnCounts;      // per-column total of counted characters; 0 makes the scoring functions skip the column
+       int alignLength;                        // number of columns the member functions iterate over
+};
+
+/**************************************************************************************************/
+
+#endif
+
diff --git a/aligntree.cpp b/aligntree.cpp

new file mode 100755 (executable)

index 0000000..41667ca
--- /dev/null
+++ b/aligntree.cpp
@@ -0,0 +1,371 @@
+//
+//  alignTree.cpp
+//  pdsBayesian
+//
+//  Created by Patrick Schloss on 4/3/12.
+//  Copyright (c) 2012 University of Michigan. All rights reserved.
+//
+
+#include "alignnode.h"
+#include "aligntree.h"
+
+/**************************************************************************************************/
+
+AlignTree::AlignTree(string referenceFileName, string taxonomyFileName, int cutoff) : Classify(), confidenceThreshold(cutoff){
+       try {
+        AlignNode* newNode = new AlignNode("Root", 0);
+        tree.push_back(newNode);                       //      the tree is stored as a vector of elements of type TaxonomyNode
+        
+        string refTaxonomy;
+        
+        readTaxonomy(taxonomyFileName);
+     
+        ifstream referenceFile;
+        m->openInputFile(referenceFileName, referenceFile);
+        bool error = false;
+        map<int, int> lengths;
+        while(!referenceFile.eof()){
+            
+            if (m->control_pressed) { break; }
+            
+            Sequence seq(referenceFile);  m->gobble(referenceFile);
+            
+            if (seq.getName() != "") {
+                map<string, string>::iterator it = taxonomy.find(seq.getName());
+                
+                if (it != taxonomy.end()) {
+                    refTaxonomy = it->second;          //      lookup the taxonomy string for the current reference sequence
+                    string aligned = seq.getAligned();
+                    lengths[aligned.length()] = 1;
+                    if (lengths.size() > 1) { error = true; m->mothurOut("[ERROR]: reference sequences must be aligned to use the align method, quitting.\n"); break; }
+                    addTaxonomyToTree(seq.getName(), refTaxonomy, aligned);
+                }else {
+                    m->mothurOut(seq.getName() + " is in your reference file, but not in your taxonomy file, please correct.\n"); error = true;
+                }
+            }
+        }
+        referenceFile.close();
+        
+        length = (lengths.begin())->first;  
+           
+        if (error) { m->control_pressed = true; }
+        
+        numTaxa = (int)tree.size();
+        
+        numLevels = 0;
+        for(int i=0;i<numTaxa;i++){
+            int level = tree[i]->getLevel();
+            if(level > numLevels){     numLevels = level;      }
+        }
+        numLevels++;
+        
+        aggregateThetas();
+        
+        int dbSize = tree[0]->getNumSeqs();
+        
+        for(int i=0;i<numTaxa;i++){
+            tree[i]->checkTheta();
+            tree[i]->setTotalSeqs(dbSize);
+        }
+        
+    }
+    catch(exception& e) {
+        m->errorOut(e, "AlignTree", "AlignTree");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+AlignTree::~AlignTree(){
+       try {
+        // every node in the tree vector was allocated with new (constructor and addTaxonomyToTree),
+        // so each one is released here, in storage order
+        for(vector<AlignNode*>::iterator node = tree.begin(); node != tree.end(); ++node){ delete *node; }
+       }
+    catch(exception& e) {
+        m->errorOut(e, "AlignTree", "~AlignTree");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+int AlignTree::addTaxonomyToTree(string seqName, string& taxonomy, string& sequence){
+       try {    // walks the ';'-separated taxonomy from the root, creating missing nodes, then loads the sequence into the last node reached; returns 0
+        AlignNode* newNode;
+        string taxonName = "";
+        int treePosition = 0;                                                  //      the root is element 0
+        // NOTE: characters after the last ';' are collected in taxonName but never become a node
+        int level = 1;
+        
+        for(int i=0;i<taxonomy.length();i++){                  //      step through taxonomy string...
+            
+            if (m->control_pressed) { break; }
+            
+            if(taxonomy[i] == ';'){                                            //      looking for semicolons...
+                // an empty name between two semicolons is reported and flags the run to stop
+                if (taxonName == "") {  m->mothurOut(seqName + " has an error in the taxonomy.  This may be due to a ;;"); m->mothurOutEndLine(); m->control_pressed = true; }
+                
+                int newIndex = tree[treePosition]->getChildIndex(taxonName);   //      look to see if your current node already
+                //     has a child with the new taxonName
+                if(newIndex != -1)     {       treePosition = newIndex;        }               //      if you've seen it before, jump to that
+                else {                                                                                                         //       position in the tree
+                    int newChildIndex = (int)tree.size();                                              //      otherwise, we'll have to create one...
+                    tree[treePosition]->makeChild(taxonName, newChildIndex);
+                    
+                    newNode = new AlignNode(taxonName, level);
+                    
+                    newNode->setParent(treePosition);
+                    
+                    tree.push_back(newNode);
+                    treePosition = newChildIndex;
+                }
+                
+                //     this taxon is done: reset the name buffer and move one level deeper
+                taxonName = "";                                                                //      clear out the taxon name that we will build as we look 
+                level++;
+            }                                                                                          //      for a semicolon
+            else{
+                taxonName += taxonomy[i];                                      //      keep adding letters until we reach a semicolon
+            }
+        }
+        tree[treePosition]->loadSequence(sequence);    //      add the sequence data to the node we ended on (runs even after an interrupt break)
+        
+        return 0;
+       }
+    catch(exception& e) {
+        m->errorOut(e, "AlignTree", "addTaxonomyToTree");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+int AlignTree::aggregateThetas(){
+       try {
+        vector<vector<int> > levelMatrix(numLevels+1);
+        
+        for(int i=0;i<tree.size();i++){
+            if (m->control_pressed) { return 0; }
+            levelMatrix[tree[i]->getLevel()].push_back(i);
+        }
+               
+        for(int i=numLevels-1;i>0;i--){
+            if (m->control_pressed) { return 0; }
+            for(int j=0;j<levelMatrix[i].size();j++){
+                
+                AlignNode* holder = tree[levelMatrix[i][j]];
+                
+                tree[holder->getParent()]->addThetas(holder->getTheta(), holder->getNumSeqs());                                
+            }
+        }
+           return 0;
+       }
+    catch(exception& e) {
+        m->errorOut(e, "AlignTree", "aggregateThetas");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+double AlignTree::getOutlierLogProbability(string& sequence){
+       try {
+        double count = 0;
+        
+        for(int i=0;i<sequence.length();i++){
+            
+            if(sequence[i] != '.'){    count++;        }
+            
+        }
+        
+        return count * log(0.2);
+    }
+    catch(exception& e) {
+        m->errorOut(e, "AlignTree", "getOutlierLogProbability");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+int AlignTree::getMinRiskIndexAlign(string& sequence, vector<int>& taxaIndices, vector<double>& probabilities){
+       try {
+        int numProbs = (int)probabilities.size();
+        
+        vector<double> G(numProbs, 0.2);       //a random sequence will, on average, be 20% similar to any other sequence
+        vector<double> risk(numProbs, 0);
+        
+        for(int i=1;i<numProbs;i++){ //use if you want the outlier group
+            if (m->control_pressed) { return 0; }
+            G[i] = tree[taxaIndices[i]]->getSimToConsensus(sequence);
+        }
+        
+        double minRisk = 1e6;
+        int minRiskIndex = 0;
+        
+        for(int i=0;i<numProbs;i++){
+            if (m->control_pressed) { return 0; }
+            for(int j=0;j<numProbs;j++){
+                if(i != j){
+                    risk[i] += probabilities[j] * G[j];
+                }                      
+            }
+            
+            if(risk[i] < minRisk){
+                minRisk = risk[i];
+                minRiskIndex = i;
+            }
+        }
+        
+        return minRiskIndex;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "AlignTree", "getMinRiskIndexAlign");
+        exit(1);
+    }
+
+}
+
+/**************************************************************************************************/
+
+int AlignTree::sanityCheck(vector<vector<int> >& indices, vector<int>& maxIndices){
+       try {
+        int finalLevel = (int)indices.size()-1;
+        
+        for(int position=1;position<indices.size();position++){
+            if (m->control_pressed) { return 0; }
+            int predictedParent = tree[indices[position][maxIndices[position]]]->getParent();
+            int actualParent = indices[position-1][maxIndices[position-1]];
+            
+            if(predictedParent != actualParent){
+                finalLevel = position - 1;
+                return finalLevel;
+            }
+        }
+        return finalLevel;
+       }
+    catch(exception& e) {
+        m->errorOut(e, "AlignTree", "sanityCheck");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+string AlignTree::getTaxonomy(Sequence* seq){
+    try {
+        string seqName = seq->getName(); string querySequence = seq->getAligned(); string taxonProbabilityString = "";
+        if (querySequence.length() != length) {
+            m->mothurOut("[ERROR]: " + seq->getName() + " has length " + toString(querySequence.length()) + ", reference sequences length is " + toString(length) + ". Are your sequences aligned? Sequences must be aligned to use the align search method.\n"); m->control_pressed = true; return "";
+        }
+        double logPOutlier = getOutlierLogProbability(querySequence);
+        
+        vector<vector<double> > pXgivenKj_D_j(numLevels);
+        vector<vector<int> > indices(numLevels);
+        for(int i=0;i<numLevels;i++){
+            if (m->control_pressed) { return taxonProbabilityString; }
+            pXgivenKj_D_j[i].push_back(logPOutlier);
+            indices[i].push_back(-1);
+        }
+        
+        
+        for(int i=0;i<numTaxa;i++){
+            //         cout << i << '\t' << tree[i]->getName() << '\t' << tree[i]->getLevel() << '\t' << tree[i]->getPxGivenkj_D_j(querySequence) << endl;
+            if (m->control_pressed) { return taxonProbabilityString; }
+            pXgivenKj_D_j[tree[i]->getLevel()].push_back(tree[i]->getPxGivenkj_D_j(querySequence));
+            indices[tree[i]->getLevel()].push_back(i);
+        }
+        
+        vector<double> sumLikelihood(numLevels, 0);
+        vector<double> bestPosterior(numLevels, 0);
+        vector<int> maxIndex(numLevels, 0);
+        int maxPosteriorIndex;
+        
+        
+               //cout << "before best level" << endl;
+        
+        //let's find the best level and taxa within that level
+        for(int i=0;i<numLevels;i++){ //go across all j's - from the root to genus
+            if (m->control_pressed) { return taxonProbabilityString; }
+            int numTaxaInLevel = (int)indices[i].size();
+            
+                       //cout << "numTaxaInLevel:\t" << numTaxaInLevel << endl;
+            
+            vector<double> posteriors(numTaxaInLevel, 0);              
+            sumLikelihood[i] = getLogExpSum(pXgivenKj_D_j[i], maxPosteriorIndex);
+            
+            maxPosteriorIndex = 0;
+            for(int j=0;j<numTaxaInLevel;j++){
+                posteriors[j] = exp(pXgivenKj_D_j[i][j] - sumLikelihood[i]);
+                
+                if(posteriors[j] > posteriors[maxPosteriorIndex]){     
+                    maxPosteriorIndex = j;
+                }
+                
+            }
+            
+            maxIndex[i] = getMinRiskIndexAlign(querySequence, indices[i], posteriors);
+            
+            maxIndex[i] = maxPosteriorIndex;
+            bestPosterior[i] = posteriors[maxIndex[i]];        
+        }
+        
+        //     vector<double> pX_level(numLevels, 0);
+        //     
+        //     for(int i=0;i<numLevels;i++){
+        //             pX_level[i] = pXgivenKj_D_j[i][maxIndex[i]] - tree[indices[i][maxIndex[i]]]->getNumSeqs();
+        //     }
+        //     
+        //     int max_pLevel_X_index = -1;
+        //     double pX_level_sum = getLogExpSum(pX_level, max_pLevel_X_index);
+        //     double max_pLevel_X = exp(pX_level[max_pLevel_X_index] - pX_level_sum);
+        //     
+        //     vector<double> pLevel_X(numLevels, 0);
+        //     for(int i=0;i<numLevels;i++){
+        //             pLevel_X[i] = exp(pX_level[i] - pX_level_sum);
+        //     }
+        
+        
+        
+        
+        int saneDepth = sanityCheck(indices, maxIndex);
+        
+        simpleTax = "";
+        int savedspot = 1;
+        taxonProbabilityString = "";
+        for(int i=1;i<=saneDepth;i++){
+            if (m->control_pressed) { return taxonProbabilityString; }
+            int confidenceScore = (int) (bestPosterior[i] * 100);
+            if (confidenceScore >= confidenceThreshold) {
+            if(indices[i][maxIndex[i]] != -1){
+                taxonProbabilityString += tree[indices[i][maxIndex[i]]]->getName() + '(' + toString(confidenceScore) + ");";
+                simpleTax += tree[indices[i][maxIndex[i]]]->getName() + ";";
+                //                     levelProbabilityOutput << tree[indices[i][maxIndex[i]]]->getName() << '(' << setprecision(6) << pLevel_X[i] << ");";
+            }
+            else{
+                taxonProbabilityString + "unclassified" + '(' + toString(confidenceScore) + ");";
+                //                     levelProbabilityOutput << "unclassified" << '(' << setprecision(6) << pLevel_X[i] << ");";
+                simpleTax += "unclassified;";
+            }
+            }else { break; }
+            savedspot = i;
+        }
+        
+        for(int i=savedspot+1;i<numLevels;i++){
+            if (m->control_pressed) { return taxonProbabilityString; }
+            taxonProbabilityString + "unclassified(0);";
+            simpleTax += "unclassified;";
+        }
+        
+        return taxonProbabilityString;
+       }
+    catch(exception& e) {
+        m->errorOut(e, "AlignTree", "getTaxonomy");
+        exit(1);
+    }
+}
+
+
+/**************************************************************************************************/
diff --git a/aligntree.h b/aligntree.h

new file mode 100755 (executable)

index 0000000..51008ff
--- /dev/null
+++ b/aligntree.h
@@ -0,0 +1,34 @@
+//
+//  alignTree.h
+//  pdsBayesian
+//
+//  Created by Patrick Schloss on 4/3/12.
+//  Copyright (c) 2012 University of Michigan. All rights reserved.
+//
+
+#ifndef pdsBayesian_alignTree_h
+#define pdsBayesian_alignTree_h
+
+#include "classify.h"
+
+class AlignNode;
+
+class AlignTree : public Classify {
+    // classifier that stores its reference taxonomy as a vector of heap-allocated AlignNode objects
+public:
+       AlignTree(string, string, int);    // (reference file name, taxonomy file name, confidence cutoff)
+       ~AlignTree();    // deletes every node held in tree
+       string getTaxonomy(Sequence*);    // returns the query's lineage string with a confidence score per taxon
+       
+private:
+    int addTaxonomyToTree(string, string&, string&);    // (sequence name, taxonomy string, aligned sequence)
+       double getOutlierLogProbability(string&);    // number of non-'.' positions times log(0.2)
+       int getMinRiskIndexAlign(string&, vector<int>&, vector<double>&);    // (query, node indices, posteriors)
+       int aggregateThetas();    // passes each node's theta and sequence count to its parent, deepest level first
+       int sanityCheck(vector<vector<int> >&, vector<int>&);    // deepest level whose chosen taxon agrees with the level above
+       // NOTE(review): numSeqs is not assigned anywhere in the aligntree.cpp code shown with it - confirm it is needed
+       int numSeqs, confidenceThreshold, length;    // length is the aligned length of the reference sequences
+       vector<AlignNode*> tree;    // element 0 is the root
+};
+
+#endif
diff --git a/bayesian.cpp b/bayesian.cpp

index 1dc38337aef1bcc3b695ff56e86061cdab58c13d..49be4af57ff66f46ac04912535d7918518cc75d2 100644 (file)
--- a/bayesian.cpp
+++ b/bayesian.cpp
@@ -12,13 +12,14 @@
  #include "phylosummary.h"
  #include "referencedb.h"
  /**************************************************************************************************/
-Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, int i, int tid, bool f) : 
+Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, int i, int tid, bool f, bool sh) : 
  Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) {
         try {
                 ReferenceDB* rdb = ReferenceDB::getInstance();
                 
                 threadID = tid;
                 flip = f;
+        shortcuts = sh;
                 string baseName = tempFile;
                         
                 if (baseName == "saved") { baseName = rdb->getSavedReference(); }
@@ -63,7 +64,7 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) {
                         }
                         saveIn.close();                 
                 }
-               
+
                 if(probFileTest && probFileTest2 && phyloTreeTest && probFileTest3 && FilesGood){       
                         if (tempFile == "saved") { m->mothurOutEndLine();  m->mothurOut("Using sequences from " + rdb->getSavedReference() + " that are saved in memory.");     m->mothurOutEndLine(); }
                         
@@ -113,7 +114,7 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) {
                                 WordPairDiffArr.resize(numKmers);
                         
                                 for (int j = 0; j < wordGenusProb.size(); j++) {        wordGenusProb[j].resize(genusNodes.size());             }
-                    ofstream out;
+                ofstream out;
                                 ofstream out2;
                                 
                                 #ifdef USE_MPI
@@ -124,23 +125,24 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) {
                                 #endif
  
                                 
-                               m->openOutputFile(probFileName, out);
+                if (shortcuts) { 
+                    m->openOutputFile(probFileName, out); 
                                 
-                               //output mothur version
-                               out << "#" << m->getVersion() << endl;
+                    //output mothur version
+                    out << "#" << m->getVersion() << endl;
                                 
-                               out << numKmers << endl;
+                    out << numKmers << endl;
                                 
-                               m->openOutputFile(probFileName2, out2);
+                    m->openOutputFile(probFileName2, out2);
                                 
-                               //output mothur version
-                               out2 << "#" << m->getVersion() << endl;
+                    //output mothur version
+                    out2 << "#" << m->getVersion() << endl;
+                }
                                 
                                 #ifdef USE_MPI
                                         }
                                 #endif
  
-                               
                                 //for each word
                                 for (int i = 0; i < numKmers; i++) {
                                         if (m->control_pressed) {  break; }
@@ -151,7 +153,7 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) {
                                                 if (pid == 0) {  
                                         #endif
  
-                                       out << i << '\t';
+                    if (shortcuts) {  out << i << '\t'; }
                                         
                                         #ifdef USE_MPI
                                                 }
@@ -159,12 +161,10 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) {
                                         
                                         vector<int> seqsWithWordi = database->getSequencesWithKmer(i);
                                         
-                                       map<int, int> count;
-                                       for (int k = 0; k < genusNodes.size(); k++) {  count[genusNodes[k]] = 0;  }                     
-                                                       
                                         //for each sequence with that word
+                    vector<int> count; count.resize(genusNodes.size(), 0);
                                         for (int j = 0; j < seqsWithWordi.size(); j++) {
-                                               int temp = phyloTree->getIndex(names[seqsWithWordi[j]]);
+                                               int temp = phyloTree->getGenusIndex(names[seqsWithWordi[j]]);
                                                 count[temp]++;  //increment count of seq in this genus who have this word
                                         }
                                         
@@ -178,9 +178,9 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) {
                                                 //probabilityInThisTaxonomy = (# of seqs with that word in this taxonomy + probabilityInTemplate) / (total number of seqs in this taxonomy + 1);
                                                 
                                                 
-                                               wordGenusProb[i][k] = log((count[genusNodes[k]] + probabilityInTemplate) / (float) (genusTotals[k] + 1));  
+                                               wordGenusProb[i][k] = log((count[k] + probabilityInTemplate) / (float) (genusTotals[k] + 1));  
                                                                         
-                                               if (count[genusNodes[k]] != 0) { 
+                                               if (count[k] != 0) { 
                                                         #ifdef USE_MPI
                                                                 int pid;
                                                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
@@ -188,7 +188,7 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) {
                                                                 if (pid == 0) {  
                                                         #endif
  
-                                                       out << k << '\t' << wordGenusProb[i][k] << '\t' ; 
+                            if (shortcuts) { out << k << '\t' << wordGenusProb[i][k] << '\t' ; }
                                                         
                                                         #ifdef USE_MPI
                                                                 }
@@ -204,8 +204,10 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) {
                                                 if (pid == 0) {  
                                         #endif
                                         
-                                       out << endl;
-                                       out2 << probabilityInTemplate << '\t' << numNotZero << '\t' << log(probabilityInTemplate) << endl;
+                            if (shortcuts) { 
+                                out << endl;
+                                out2 << probabilityInTemplate << '\t' << numNotZero << '\t' << log(probabilityInTemplate) << endl;
+                            }
                                         
                                         #ifdef USE_MPI
                                                 }
@@ -218,9 +220,10 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) {
                                         if (pid == 0) {  
                                 #endif
                                 
-                               out.close();
-                               out2.close();
-                               
+                        if (shortcuts) { 
+                            out.close();
+                            out2.close();
+                        }
                                 #ifdef USE_MPI
                                         }
                                 #endif
diff --git a/bayesian.h b/bayesian.h

index 7c884337495dbb1e0213d1171a54ca20e0eb932c..405fee3679df7b94ae8f392f62c27886e0b11e36 100644 (file)
--- a/bayesian.h
+++ b/bayesian.h
@@ -18,7 +18,7 @@
  class Bayesian : public Classify {
         
  public:
-       Bayesian(string, string, string, int, int, int, int, bool);
+       Bayesian(string, string, string, int, int, int, int, bool, bool);
         ~Bayesian();
         
         string getTaxonomy(Sequence*);
diff --git a/binsequencecommand.cpp b/binsequencecommand.cpp

index 7569a4be3113144a96048f5a4d624d271c15b9f4..ad71d1065fb32904b365a7cd2be46a28718a7d98 100644 (file)
--- a/binsequencecommand.cpp
+++ b/binsequencecommand.cpp
@@ -15,8 +15,9 @@ vector<string> BinSeqCommand::setParameters(){
         try {
                 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
+        CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
                 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
@@ -34,7 +35,7 @@ vector<string> BinSeqCommand::setParameters(){
  string BinSeqCommand::getHelpString(){ 
         try {
                 string helpString = "";
-               helpString += "The bin.seqs command parameters are list, fasta, name, label and group.  The fasta and list are required, unless you have a valid current list and fasta file.\n";
+               helpString += "The bin.seqs command parameters are list, fasta, name, count, label and group.  The fasta and list are required, unless you have a valid current list and fasta file.\n";
                 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and are separated by dashes.\n";
                 helpString += "The bin.seqs command should be in the following format: bin.seqs(fasta=yourFastaFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n";
                 helpString += "Example bin.seqs(fasta=amazon.fasta, group=amazon.groups, name=amazon.names).\n";
@@ -147,6 +148,14 @@ BinSeqCommand::BinSeqCommand(string option) {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
  
                         
@@ -195,11 +204,26 @@ BinSeqCommand::BinSeqCommand(string option) {
                         if (groupfile == "not open") { abort = true; }
                         else if (groupfile == "not found") { groupfile = ""; }
                         else { m->setGroupFile(groupfile); }
+            
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not open") { countfile = ""; abort = true; }
+                       else if (countfile == "not found") { countfile = "";  } 
+                       else { m->setCountTableFile(countfile); }
+            
+            if ((namesfile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
+            }
                         
-                       if (namesfile == ""){
-                               vector<string> files; files.push_back(fastafile); 
-                               parser.getNameFile(files);
-                       }
+            if ((groupfile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
+            }
+                       
+            if (countfile == "") {
+                if (namesfile == ""){
+                    vector<string> files; files.push_back(fastafile); 
+                    parser.getNameFile(files);
+                }
+            }
                         
                 }
         }
@@ -229,9 +253,8 @@ int BinSeqCommand::execute(){
                 fasta->readFastaFile(fastafile);
                 
                 //if user gave a namesfile then use it
-               if (namesfile != "") {
-                       readNamesFile();
-               }
+               if (namesfile != "") {  readNamesFile();  }
+        if (countfile != "") {  ct.readTable(countfile);  }
                 
                 input = new InputData(listfile, "list");
                 list = input->getListVector();
@@ -362,79 +385,71 @@ void BinSeqCommand::readNamesFile() {
  //return 1 if error, 0 otherwise
  int BinSeqCommand::process(ListVector* list) {
         try {
-                               string binnames, name, sequence;
-                               
-        string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + list->getLabel() + getOutputFileNameTag("fasta");
+        string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + list->getLabel() + "." + getOutputFileNameTag("fasta");
          m->openOutputFile(outputFileName, out);
-                               
-                               //save to output list of output file names
-                               outputNames.push_back(outputFileName);  outputTypes["fasta"].push_back(outputFileName);
-
-                               m->mothurOut(list->getLabel()); m->mothurOutEndLine();
-                               
-                               //for each bin in the list vector
-                               for (int i = 0; i < list->size(); i++) {
-                                       
-                                       if (m->control_pressed) {  return 1; }
-                                       
-                                       binnames = list->get(i);
-                                       while (binnames.find_first_of(',') != -1) { 
-                                               name = binnames.substr(0,binnames.find_first_of(','));
-                                               binnames = binnames.substr(binnames.find_first_of(',')+1, binnames.length());
-                                               
-                                               //do work for that name
-                                               sequence = fasta->getSequence(name);
-                                               if (sequence != "not found") {
-                                                       //if you don't have groups
-                                                       if (groupfile == "") {
-                                                               name = name + "\t" + toString(i+1);
-                                                               out << ">" << name << endl;
-                                                               out << sequence << endl;
-                                                       }else {//if you do have groups
-                                                               string group = groupMap->getGroup(name);
-                                                               if (group == "not found") {  
-                                                                       m->mothurOut(name + " is missing from your group file. Please correct. ");  m->mothurOutEndLine();
-                                                                       return 1;
-                                                               }else{
-                                                                       name = name + "\t" + group + "\t" + toString(i+1);
-                                                                       out << ">" << name << endl;
-                                                                       out << sequence << endl;
-                                                               }
-                                                       }
-                                               }else { 
-                                                       m->mothurOut(name + " is missing from your fasta or name file. Please correct. "); m->mothurOutEndLine();
-                                                       return 1;
-                                               }
-                                               
-                                       }
-                                       
-                                       //get last name
-                                       sequence = fasta->getSequence(binnames);
-                                       if (sequence != "not found") {
-                                               //if you don't have groups
-                                               if (groupfile == "") {
-                                                       binnames = binnames + "\t" + toString(i+1);
-                                                       out << ">" << binnames << endl;
-                                                       out << sequence << endl;
-                                               }else {//if you do have groups
-                                                       string group = groupMap->getGroup(binnames);
-                                                       if (group == "not found") {  
-                                                               m->mothurOut(binnames + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
-                                                               return 1;
-                                                       }else{
-                                                               binnames = binnames + "\t" + group + "\t" + toString(i+1);
-                                                               out << ">" << binnames << endl;
-                                                               out << sequence << endl;
-                                                       }
-                                               }
-                                       }else { 
-                                               m->mothurOut(binnames + " is missing from your fasta or name file. Please correct. "); m->mothurOutEndLine();
-                                               return 1;
-                                       }
-                               }
-                                       
-                               out.close();
-                               return 0;
+        outputNames.push_back(outputFileName);  outputTypes["fasta"].push_back(outputFileName);
+        
+        m->mothurOut(list->getLabel()); m->mothurOutEndLine();
+        
+        //for each bin in the list vector
+        for (int i = 0; i < list->size(); i++) {
+            
+            if (m->control_pressed) {  return 1; }
+            
+            string binnames = list->get(i);
+            vector<string> names;
+            m->splitAtComma(binnames, names);
+            for (int j = 0; j < names.size(); j++) {
+                string name = names[j];
+                
+                //do work for that name
+                string sequence = fasta->getSequence(name);
+                
+                if (countfile != "") {
+                    if (sequence != "not found") {
+                        if (ct.hasGroupInfo()) {
+                            vector<string> groups = ct.getGroups(name);
+                            string groupInfo = "";
+                            for (int k = 0; k + 1 < (int)groups.size(); k++) { //signed bound: groups.size()-1 is unsigned and wraps around when groups is empty
+                                groupInfo += groups[k] + "-";
+                            }
+                            if (groups.size() != 0) { groupInfo += groups[groups.size()-1]; }
+                            else { groupInfo = "not found";  }
+                            name = name + "\t" + groupInfo + "\t" + toString(i+1)+ "\tNumRep=" + toString(ct.getNumSeqs(name));
+                            out << ">" << name << endl;
+                            out << sequence << endl;
+                        }else {
+                            name = name + "\t" + toString(i+1) + "\tNumRep=" + toString(ct.getNumSeqs(name));
+                            out << ">" << name << endl;
+                            out << sequence << endl;
+                        }
+                        
+                    }else { m->mothurOut(name + " is missing from your fasta. Does your list file contain all sequence names or just the uniques?"); m->mothurOutEndLine(); return 1; }
+                }else {
+                    if (sequence != "not found") {
+                        //if you don't have groups
+                        if (groupfile == "") {
+                            name = name + "\t" + toString(i+1);
+                            out << ">" << name << endl;
+                            out << sequence << endl;
+                        }else {//if you do have groups
+                            string group = groupMap->getGroup(name);
+                            if (group == "not found") {  
+                                m->mothurOut(name + " is missing from your group file. Please correct. ");  m->mothurOutEndLine();
+                                return 1;
+                            }else{
+                                name = name + "\t" + group + "\t" + toString(i+1);
+                                out << ">" << name << endl;
+                                out << sequence << endl;
+                            }
+                        }
+                    }else { m->mothurOut(name + " is missing from your fasta or name file. Please correct. "); m->mothurOutEndLine(); return 1; }
+                }
+            }
+        }
+        
+        out.close();
+        return 0;
  
         }
         catch(exception& e) {
diff --git a/binsequencecommand.h b/binsequencecommand.h

index 1fb5664b0c018c4120881055014ec08a4e6d5559..5bdd401f9411e4abb21aa6a12284bfa98f4b5bda 100644 (file)
--- a/binsequencecommand.h
+++ b/binsequencecommand.h
@@ -16,6 +16,7 @@
  #include "listvector.hpp"
  #include "fastamap.h"
  #include "groupmap.h"
+#include "counttable.h"
  
  class BinSeqCommand : public Command {
         
@@ -36,14 +37,14 @@ public:
         void help() { m->mothurOut(getHelpString()); }          
         
  private:
-       
+       CountTable ct;
         ListVector* list;
         InputData* input;
         FastaMap* fasta;
         GroupMap* groupMap;
         bool abort, allLines;
         set<string> labels; //holds labels to be used
-       string filename, fastafile, listfile, namesfile, groupfile, label, outputDir;
+       string filename, fastafile, listfile, namesfile, groupfile, countfile, label, outputDir;
         ofstream out;
         ifstream in, inNames;
         vector<string> outputNames;
diff --git a/chimeraperseuscommand.cpp b/chimeraperseuscommand.cpp

index e3691e8b942c2f34da91cec14691e717d5c16c1a..7ae5d69141979736bcb2b6fcfb25e1ba4682086d 100644 (file)
--- a/chimeraperseuscommand.cpp
+++ b/chimeraperseuscommand.cpp
@@ -10,12 +10,15 @@
  #include "chimeraperseuscommand.h"
  #include "deconvolutecommand.h"
  #include "sequence.hpp"
+#include "counttable.h"
+#include "sequencecountparser.h"
  //**********************************************************************************************************************
  vector<string> ChimeraPerseusCommand::setParameters(){ 
         try {
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pname);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
+               CommandParameter pname("name", "InputTypes", "", "", "NameCount", "NameCount", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "NameCount", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
@@ -36,10 +39,11 @@ vector<string> ChimeraPerseusCommand::setParameters(){
  string ChimeraPerseusCommand::getHelpString(){ 
         try {
                 string helpString = "";
-               helpString += "The chimera.perseus command reads a fastafile and namefile and outputs potentially chimeric sequences.\n";
+               helpString += "The chimera.perseus command reads a fastafile and namefile or countfile and outputs potentially chimeric sequences.\n";
                 helpString += "The chimera.perseus command parameters are fasta, name, group, cutoff, processors, alpha and beta.\n";
                 helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n";
-               helpString += "The name parameter allows you to provide a name file associated with your fasta file. It is required. \n";
+               helpString += "The name parameter allows you to provide a name file associated with your fasta file.\n";
+        helpString += "The count parameter allows you to provide a count file associated with your fasta file. A count or name file is required. \n";
                 helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n";
                 helpString += "The group parameter allows you to provide a group file.  When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n";
                 helpString += "The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n";
@@ -96,6 +100,8 @@ ChimeraPerseusCommand::ChimeraPerseusCommand(){
  ChimeraPerseusCommand::ChimeraPerseusCommand(string option)  {
         try {
                 abort = false; calledHelp = false; 
+        hasCount = false;
+        hasName = false;
                 
                 //allow user to run help
                 if(option == "help") { help(); abort = true; calledHelp = true; }
@@ -107,7 +113,7 @@ ChimeraPerseusCommand::ChimeraPerseusCommand(string option)  {
                         OptionParser parser(option);
                         map<string,string> parameters = parser.getParameters();
                         
-                       ValidParameters validParameter("chimera.uchime");
+                       ValidParameters validParameter("chimera.perseus");
                         map<string,string>::iterator it;
                         
                         //check to make sure all parameters are valid for command
@@ -203,15 +209,9 @@ ChimeraPerseusCommand::ChimeraPerseusCommand(string option)  {
                         
                         
                         //check for required parameters
-                       bool hasName = true;
                         namefile = validParameter.validFile(parameters, "name", false);
-                       if (namefile == "not found") { 
-                               //if there is a current fasta file, use it
-                               string filename = m->getNameFile(); 
-                               if (filename != "") { nameFileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the name parameter."); m->mothurOutEndLine(); }
-                               else {  m->mothurOut("You have no current namefile and the name parameter is required."); m->mothurOutEndLine(); abort = true; }                                
-                               hasName = false;
-                       }else { 
+                       if (namefile == "not found") { namefile = "";   }
+                       else { 
                                 m->splitAtDash(namefile, nameFileNames);
                                 
                                 //go through files and make sure they are good, if not, then disregard them
@@ -277,12 +277,101 @@ ChimeraPerseusCommand::ChimeraPerseusCommand(string option)  {
                                                 }
                                         }
                                 }
+                       }
+            
+            if (nameFileNames.size() != 0) { hasName = true; }
+            
+            //check for required parameters
+            vector<string> countfileNames;
+                       countfile = validParameter.validFile(parameters, "count", false);
+                       if (countfile == "not found") { 
+                countfile = "";  
+                       }else { 
+                               m->splitAtDash(countfile, countfileNames);
                                 
-                               //make sure there is at least one valid file left
-                               if (nameFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid name files."); m->mothurOutEndLine(); abort = true; }
+                               //go through files and make sure they are good, if not, then disregard them
+                               for (int i = 0; i < countfileNames.size(); i++) {
+                                       
+                                       bool ignore = false;
+                                       if (countfileNames[i] == "current") { 
+                                               countfileNames[i] = m->getCountTableFile(); 
+                                               if (countfileNames[i] != "") {  m->mothurOut("Using " + countfileNames[i] + " as input file for the count parameter where you had given current."); m->mothurOutEndLine(); }
+                                               else {  
+                                                       m->mothurOut("You have no current count file, ignoring current."); m->mothurOutEndLine(); ignore=true; 
+                                                       //erase from file list
+                                                       countfileNames.erase(countfileNames.begin()+i);
+                                                       i--;
+                                               }
+                                       }
+                                       
+                                       if (!ignore) {
+                                               
+                                               if (inputDir != "") {
+                                                       string path = m->hasPath(countfileNames[i]);
+                                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                                       if (path == "") {       countfileNames[i] = inputDir + countfileNames[i];               }
+                                               }
+                                               
+                                               int ableToOpen;
+                                               ifstream in;
+                                               
+                                               ableToOpen = m->openInputFile(countfileNames[i], in, "noerror");
+                                               
+                                               //if you can't open it, try default location
+                                               if (ableToOpen == 1) {
+                                                       if (m->getDefaultPath() != "") { //default path is set
+                                                               string tryPath = m->getDefaultPath() + m->getSimpleName(countfileNames[i]);
+                                                               m->mothurOut("Unable to open " + countfileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
+                                                               ifstream in2;
+                                                               ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+                                                               in2.close();
+                                                               countfileNames[i] = tryPath;
+                                                       }
+                                               }
+                                               
+                                               if (ableToOpen == 1) {
+                                                       if (m->getOutputDir() != "") { //output directory is set
+                                                               string tryPath = m->getOutputDir() + m->getSimpleName(countfileNames[i]);
+                                                               m->mothurOut("Unable to open " + countfileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
+                                                               ifstream in2;
+                                                               ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+                                                               in2.close();
+                                                               countfileNames[i] = tryPath;
+                                                       }
+                                               }
+                                               
+                                               in.close();
+                                               
+                                               if (ableToOpen == 1) { 
+                                                       m->mothurOut("Unable to open " + countfileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); 
+                                                       //erase from file list
+                                                       countfileNames.erase(countfileNames.begin()+i);
+                                                       i--;
+                                               }else {
+                                                       m->setCountTableFile(countfileNames[i]);
+                                               }
+                                       }
+                               }
                         }
-                       
-                       if (hasName && (nameFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of namefiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; }
+            
+            if (countfileNames.size() != 0) { hasCount = true; }
+            
+                       //make sure exactly one of name or count was provided
+            if (hasName && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
+            
+            if (!hasName && !hasCount) { 
+                //if there is a current name file, use it, else look for current count file
+                               string filename = m->getNameFile(); 
+                               if (filename != "") { hasName = true; nameFileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the name parameter."); m->mothurOutEndLine(); }
+                               else { 
+                    filename = m->getCountTableFile();
+                    if (filename != "") { hasCount = true; countfileNames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the count parameter."); m->mothurOutEndLine(); }
+                    else { m->mothurOut("[ERROR]: You must provide a count or name file."); m->mothurOutEndLine(); abort = true;  }
+                }
+            }
+            if (!hasName && hasCount) { nameFileNames = countfileNames; }
+            
+                       if (nameFileNames.size() != fastaFileNames.size()) { m->mothurOut("[ERROR]: The number of name or count files does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; }
                         
                         bool hasGroup = true;
                         groupfile = validParameter.validFile(parameters, "group", false);
@@ -360,6 +449,7 @@ ChimeraPerseusCommand::ChimeraPerseusCommand(string option)  {
                         
                         if (hasGroup && (groupFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of groupfiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; }
                         
+            if (hasGroup && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or group."); m->mothurOutEndLine(); abort = true; }
                         
                         //if the user changes the output directory command factory will send this info to us in the output parameter 
                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
@@ -415,41 +505,82 @@ int ChimeraPerseusCommand::execute(){
                         
                         int numSeqs = 0;
                         int numChimeras = 0;
-                       
-                       if (groupFile != "") {
-                               //Parse sequences by group
-                               SequenceParser parser(groupFile, fastaFileNames[s], nameFile);
-                               vector<string> groups = parser.getNamesOfGroups();
-                               
-                               if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) {        m->mothurRemove(outputNames[j]);        }  return 0; }
-                               
-                               //clears files
-                               ofstream out, out1, out2;
-                               m->openOutputFile(outputFileName, out); out.close(); 
-                               m->openOutputFile(accnosFileName, out1); out1.close();
-                               
-                               if(processors == 1)     {       numSeqs = driverGroups(parser, outputFileName, accnosFileName, 0, groups.size(), groups);       }
-                               else                            {       numSeqs = createProcessesGroups(parser, outputFileName, accnosFileName, groups, groupFile, fastaFileNames[s], nameFile);                        }
-                               
-                               if (m->control_pressed) {  for (int j = 0; j < outputNames.size(); j++) {       m->mothurRemove(outputNames[j]);        }  return 0;    }                               
-                               
-                               numChimeras = deconvoluteResults(parser, outputFileName, accnosFileName);
-                               
-                               m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine(); 
-                               
-                               if (m->control_pressed) {  for (int j = 0; j < outputNames.size(); j++) {       m->mothurRemove(outputNames[j]);        }  return 0;    }                               
-                               
-                       }else{
-                               if (processors != 1) { m->mothurOut("Without a groupfile, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; }
-                               
-                               //read sequences and store sorted by frequency
-                               vector<seqData> sequences = readFiles(fastaFileNames[s], nameFile);
-                               
-                               if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) {        m->mothurRemove(outputNames[j]);        } return 0; }
-                               
-                               numSeqs = driver(outputFileName, sequences, accnosFileName, numChimeras); 
+            
+            if (hasCount) {
+                CountTable* ct = new CountTable();
+                ct->readTable(nameFile);
+                
+                if (ct->hasGroupInfo()) {
+                    cparser = new SequenceCountParser(fastaFileNames[s], *ct);
+                    
+                    vector<string> groups = cparser->getNamesOfGroups();
+                    
+                    if (m->control_pressed) { delete ct; delete cparser; for (int j = 0; j < outputNames.size(); j++) {        m->mothurRemove(outputNames[j]);        }  return 0; }
+                    
+                    //clears files
+                    ofstream out, out1, out2;
+                    m->openOutputFile(outputFileName, out); out.close(); 
+                    m->openOutputFile(accnosFileName, out1); out1.close();
+                    
+                    if(processors == 1)        {       numSeqs = driverGroups(outputFileName, accnosFileName, 0, groups.size(), groups);       }
+                    else                               {       numSeqs = createProcessesGroups(outputFileName, accnosFileName, groups, groupFile, fastaFileNames[s], nameFile);                        }
+                    
+                    if (m->control_pressed) {  delete ct; delete cparser; for (int j = 0; j < outputNames.size(); j++) {       m->mothurRemove(outputNames[j]);        }  return 0;    }                               
+                    map<string, string> uniqueNames = cparser->getAllSeqsMap();
+                    numChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName);
+                    delete cparser;
+
+                    m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine(); 
+                    
+                    if (m->control_pressed) {  delete ct; for (int j = 0; j < outputNames.size(); j++) {       m->mothurRemove(outputNames[j]);        }  return 0;  } 
+                    
+                }else {
+                    if (processors != 1) { m->mothurOut("Your count file does not contain group information, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; }
+                    
+                    //read sequences and store sorted by frequency
+                    vector<seqData> sequences = readFiles(fastaFileNames[s], ct);
+                    
+                    if (m->control_pressed) { delete ct; for (int j = 0; j < outputNames.size(); j++) {        m->mothurRemove(outputNames[j]);        } return 0; }
+                    
+                    numSeqs = driver(outputFileName, sequences, accnosFileName, numChimeras);   
+                }
+                delete ct;
+            }else {
+                if (groupFile != "") {
+                    //Parse sequences by group
+                    parser = new SequenceParser(groupFile, fastaFileNames[s], nameFile);
+                    vector<string> groups = parser->getNamesOfGroups();
+                    
+                    if (m->control_pressed) { delete parser; for (int j = 0; j < outputNames.size(); j++) {    m->mothurRemove(outputNames[j]);        }  return 0; }
+                    
+                    //clears files
+                    ofstream out, out1, out2;
+                    m->openOutputFile(outputFileName, out); out.close(); 
+                    m->openOutputFile(accnosFileName, out1); out1.close();
+                    
+                    if(processors == 1)        {       numSeqs = driverGroups(outputFileName, accnosFileName, 0, groups.size(), groups);       }
+                    else                               {       numSeqs = createProcessesGroups(outputFileName, accnosFileName, groups, groupFile, fastaFileNames[s], nameFile);                        }
+                    
+                    if (m->control_pressed) {  delete parser; for (int j = 0; j < outputNames.size(); j++) {   m->mothurRemove(outputNames[j]);        }  return 0;    }                               
+                    map<string, string> uniqueNames = parser->getAllSeqsMap();
+                    numChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName);
+                    delete parser;
+                    
+                    m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine(); 
+                    
+                    if (m->control_pressed) {  for (int j = 0; j < outputNames.size(); j++) {  m->mothurRemove(outputNames[j]);        }  return 0;  }         
+                }else{
+                    if (processors != 1) { m->mothurOut("Without a groupfile, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; }
+                    
+                    //read sequences and store sorted by frequency
+                    vector<seqData> sequences = readFiles(fastaFileNames[s], nameFile);
+                    
+                    if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) {   m->mothurRemove(outputNames[j]);        } return 0; }
+                    
+                    numSeqs = driver(outputFileName, sequences, accnosFileName, numChimeras); 
+                }
                         }
-                       
+            
                         if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) {        m->mothurRemove(outputNames[j]);        } return 0; }
                         
                         m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences. " + toString(numChimeras) + " chimeras were found.");      m->mothurOutEndLine();
@@ -510,7 +641,7 @@ string ChimeraPerseusCommand::getNamesFile(string& inputFile){
         }
  }
  //**********************************************************************************************************************
-int ChimeraPerseusCommand::driverGroups(SequenceParser& parser, string outputFName, string accnos, int start, int end, vector<string> groups){
+int ChimeraPerseusCommand::driverGroups(string outputFName, string accnos, int start, int end, vector<string> groups){
         try {
                 
                 int totalSeqs = 0;
@@ -522,7 +653,7 @@ int ChimeraPerseusCommand::driverGroups(SequenceParser& parser, string outputFNa
                         
                         int start = time(NULL);  if (m->control_pressed) {  return 0; }
                         
-                       vector<seqData> sequences = loadSequences(parser, groups[i]);
+                       vector<seqData> sequences = loadSequences(groups[i]);
                         
                         if (m->control_pressed) { return 0; }
                         
@@ -547,32 +678,48 @@ int ChimeraPerseusCommand::driverGroups(SequenceParser& parser, string outputFNa
         }
  }      
  //**********************************************************************************************************************
-vector<seqData> ChimeraPerseusCommand::loadSequences(SequenceParser& parser, string group){
+vector<seqData> ChimeraPerseusCommand::loadSequences(string group){
         try {
-               
-               vector<Sequence> thisGroupsSeqs = parser.getSeqs(group);
-               map<string, string> nameMap = parser.getNameMap(group);
-               map<string, string>::iterator it;
-               
-               vector<seqData> sequences;
-               bool error = false;
-        alignLength = 0;
-               
-               for (int i = 0; i < thisGroupsSeqs.size(); i++) {
-               
-                       if (m->control_pressed) {  return sequences; }
-                       
-                       it = nameMap.find(thisGroupsSeqs[i].getName());
-                       if (it == nameMap.end()) { error = true; m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct."); m->mothurOutEndLine(); }
-                       else {
-                               int num = m->getNumNames(it->second);
-                               sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num));
-                if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
-                       }
+        bool error = false;
+               alignLength = 0;
+        vector<seqData> sequences;
+        if (hasCount) {
+            vector<Sequence> thisGroupsSeqs = cparser->getSeqs(group);
+            map<string, int> counts = cparser->getCountTable(group);
+            map<string, int>::iterator it;
+            
+            for (int i = 0; i < thisGroupsSeqs.size(); i++) {
+                
+                if (m->control_pressed) {  return sequences; }
+                
+                it = counts.find(thisGroupsSeqs[i].getName());
+                if (it == counts.end()) { error = true; m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your count file, please correct."); m->mothurOutEndLine(); }
+                else {
+                    sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), it->second));
+                    if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
+                }
+            }
+        }else{
+            vector<Sequence> thisGroupsSeqs = parser->getSeqs(group);
+            map<string, string> nameMap = parser->getNameMap(group);
+            map<string, string>::iterator it;
+           
+            for (int i = 0; i < thisGroupsSeqs.size(); i++) {
+                
+                if (m->control_pressed) {  return sequences; }
+                
+                it = nameMap.find(thisGroupsSeqs[i].getName());
+                if (it == nameMap.end()) { error = true; m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct."); m->mothurOutEndLine(); }
+                else {
+                    int num = m->getNumNames(it->second);
+                    sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num));
+                    if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
+                }
+            }
+            
                 }
                 
-               if (error) { m->control_pressed = true; }
-               
+        if (error) { m->control_pressed = true; }
                 //sort by frequency
                 sort(sequences.rbegin(), sequences.rend());
                 
@@ -619,6 +766,37 @@ vector<seqData> ChimeraPerseusCommand::readFiles(string inputFile, string name){
                 
                 return sequences;
         }
+       catch(exception& e) {
+               m->errorOut(e, "ChimeraPerseusCommand", "readFiles");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+vector<seqData> ChimeraPerseusCommand::readFiles(string inputFile, CountTable* ct){
+       try {           
+               //read fasta file and create sequenceData structure - checking for file mismatches
+               vector<seqData> sequences;
+               ifstream in;
+               m->openInputFile(inputFile, in);
+               alignLength = 0;
+        
+               while (!in.eof()) {
+            Sequence temp(in); m->gobble(in);
+                       
+                       int count = ct->getNumSeqs(temp.getName());
+                       if (m->control_pressed) { break; }
+                       else {
+                               sequences.push_back(seqData(temp.getName(), temp.getUnaligned(), count));
+                if (temp.getUnaligned().length() > alignLength) { alignLength = temp.getUnaligned().length(); }
+                       }
+               }
+               in.close();
+               
+               //sort by frequency
+               sort(sequences.rbegin(), sequences.rend());
+               
+               return sequences;
+       }
         catch(exception& e) {
                 m->errorOut(e, "ChimeraPerseusCommand", "getNamesFile");
                 exit(1);
@@ -771,7 +949,7 @@ int ChimeraPerseusCommand::driver(string chimeraFileName, vector<seqData>& seque
         }
  }
  /**************************************************************************************************/
-int ChimeraPerseusCommand::createProcessesGroups(SequenceParser& parser, string outputFName, string accnos, vector<string> groups, string group, string fasta, string name) {
+int ChimeraPerseusCommand::createProcessesGroups(string outputFName, string accnos, vector<string> groups, string group, string fasta, string name) {
         try {
                 
                 vector<int> processIDS;
@@ -801,7 +979,7 @@ int ChimeraPerseusCommand::createProcessesGroups(SequenceParser& parser, string
                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
                                 process++;
                         }else if (pid == 0){
-                               num = driverGroups(parser, outputFName + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", lines[process].start, lines[process].end, groups);
+                               num = driverGroups(outputFName + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", lines[process].start, lines[process].end, groups);
                                 
                                 //pass numSeqs to parent
                                 ofstream out;
@@ -819,7 +997,7 @@ int ChimeraPerseusCommand::createProcessesGroups(SequenceParser& parser, string
                 }
                 
                 //do my part
-               num = driverGroups(parser, outputFName, accnos, lines[0].start, lines[0].end, groups);
+               num = driverGroups(outputFName, accnos, lines[0].start, lines[0].end, groups);
                 
                 //force parent to wait until all the processes are done
                 for (int i=0;i<processIDS.size();i++) { 
@@ -850,7 +1028,7 @@ int ChimeraPerseusCommand::createProcessesGroups(SequenceParser& parser, string
                         // Allocate memory for thread data.
                         string extension = toString(i) + ".temp";
                         
-                       perseusData* tempPerseus = new perseusData(alpha, beta, cutoff, outputFName+extension, fasta, name, group, accnos+extension, groups, m, lines[i].start, lines[i].end, i);
+                       perseusData* tempPerseus = new perseusData(hasName, hasCount, alpha, beta, cutoff, outputFName+extension, fasta, name, group, accnos+extension, groups, m, lines[i].start, lines[i].end, i);
                         
                         pDataArray.push_back(tempPerseus);
                         processIDS.push_back(i);
@@ -862,7 +1040,7 @@ int ChimeraPerseusCommand::createProcessesGroups(SequenceParser& parser, string
                 
                 
                 //using the main process as a worker saves time and memory
-               num = driverGroups(parser, outputFName, accnos, lines[0].start, lines[0].end, groups);
+               num = driverGroups(outputFName, accnos, lines[0].start, lines[0].end, groups);
                 
                 //Wait until all threads have terminated.
                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
@@ -894,9 +1072,8 @@ int ChimeraPerseusCommand::createProcessesGroups(SequenceParser& parser, string
         }
  }
  //**********************************************************************************************************************
-int ChimeraPerseusCommand::deconvoluteResults(SequenceParser& parser, string outputFileName, string accnosFileName){
+int ChimeraPerseusCommand::deconvoluteResults(map<string, string>& uniqueNames, string outputFileName, string accnosFileName){
         try {
-               map<string, string> uniqueNames = parser.getAllSeqsMap();
                 map<string, string>::iterator itUnique;
                 int total = 0;
                 
diff --git a/chimeraperseuscommand.h b/chimeraperseuscommand.h

index b6957e39d45cc877a9bee28cccf933af6d86f49e..e2855d019ff984d9ac026a2554860c5b3aa33f42 100644 (file)
--- a/chimeraperseuscommand.h
+++ b/chimeraperseuscommand.h
@@ -16,7 +16,9 @@
  #include "mothur.h"
  #include "command.hpp"
  #include "sequenceparser.h"
+#include "sequencecountparser.h"
  #include "myPerseus.h"
+#include "counttable.h"
  
  /***********************************************************/
  class ChimeraPerseusCommand : public Command {
@@ -43,10 +45,12 @@ private:
                 linePair(int i, int j) : start(i), end(j) {}
         };
         
-       bool abort;
-       string fastafile, groupfile, outputDir, namefile;
+       bool abort, hasName, hasCount;
+       string fastafile, groupfile, countfile, outputDir, namefile;
         int processors, alignLength;
         double cutoff, alpha, beta;
+    SequenceParser* parser;
+    SequenceCountParser* cparser;
         
         vector<string> outputNames;
         vector<string> fastaFileNames;
@@ -56,10 +60,11 @@ private:
         string getNamesFile(string&);
         int driver(string, vector<seqData>&, string, int&);
         vector<seqData> readFiles(string, string);
-       vector<seqData> loadSequences(SequenceParser&, string);
-       int deconvoluteResults(SequenceParser&, string, string);
-       int driverGroups(SequenceParser&, string, string, int, int, vector<string>);
-       int createProcessesGroups(SequenceParser&, string, string, vector<string>, string, string, string);
+    vector<seqData> readFiles(string inputFile, CountTable* ct);
+       vector<seqData> loadSequences(string);
+       int deconvoluteResults(map<string, string>&, string, string);
+       int driverGroups(string, string, int, int, vector<string>);
+       int createProcessesGroups(string, string, vector<string>, string, string, string);
  };
  
  /**************************************************************************************************/
@@ -75,12 +80,13 @@ struct perseusData {
         MothurOut* m;
         int start;
         int end;
+    bool hasName, hasCount;
         int threadID, count, numChimeras;
         double alpha, beta, cutoff;
         vector<string> groups;
         
         perseusData(){}
-       perseusData(double a, double b, double c, string o,  string f, string n, string g, string ac, vector<string> gr, MothurOut* mout, int st, int en, int tid) {
+       perseusData(bool hn, bool hc, double a, double b, double c, string o,  string f, string n, string g, string ac, vector<string> gr, MothurOut* mout, int st, int en, int tid) {
                 alpha = a;
                 beta = b;
                 cutoff = c;
@@ -94,6 +100,8 @@ struct perseusData {
                 end = en;
                 threadID = tid;
                 groups = gr;
+        hasName = hn;
+        hasCount = hc;
                 count = 0;
                 numChimeras = 0;
         }
@@ -114,38 +122,67 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){
                 
                 //parse fasta and name file by group
                 SequenceParser* parser;
-               if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile);      }
-               else                                                    { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile);                                            }
-               
+        SequenceCountParser* cparser;
+               if (pDataArray->hasCount) {
+            CountTable* ct = new CountTable();
+            ct->readTable(pDataArray->namefile);
+            cparser = new SequenceCountParser(pDataArray->fastafile, *ct);
+            delete ct;
+        }else {
+            if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile); }
+            else                                                       { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile);                                            }
+        }
+    
                 int totalSeqs = 0;
                 int numChimeras = 0;
                 
                 for (int i = pDataArray->start; i < pDataArray->end; i++) {
                         
-                       int start = time(NULL);  if (pDataArray->m->control_pressed) {  delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
+                       int start = time(NULL);  if (pDataArray->m->control_pressed) {  if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
                         
                         pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Checking sequences from group " + pDataArray->groups[i] + "...");  pDataArray->m->mothurOutEndLine();                                      
                         
                         //vector<seqData> sequences = loadSequences(parser, groups[i]); - same function below
                         ////////////////////////////////////////////////////////////////////////////////////////
-                       vector<Sequence> thisGroupsSeqs = parser->getSeqs(pDataArray->groups[i]);
-                       map<string, string> nameMap = parser->getNameMap(pDataArray->groups[i]);
-                       map<string, string>::iterator it;
-                       
-                       vector<seqData> sequences;
                         bool error = false;
-                       
-                       for (int j = 0; j < thisGroupsSeqs.size(); j++) {
-                               
-                               if (pDataArray->m->control_pressed) {  delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
-                               
-                               it = nameMap.find(thisGroupsSeqs[j].getName());
-                               if (it == nameMap.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[j].getName() + " is in your fasta file and not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
-                               else {
-                                       int num = pDataArray->m->getNumNames(it->second);
-                                       sequences.push_back(seqData(thisGroupsSeqs[j].getName(), thisGroupsSeqs[j].getUnaligned(), num));
-                               }
-                       }
+            int alignLength = 0;
+            vector<seqData> sequences;
+            if (pDataArray->hasCount) {
+                vector<Sequence> thisGroupsSeqs = cparser->getSeqs(pDataArray->groups[i]);
+                map<string, int> counts = cparser->getCountTable(pDataArray->groups[i]);
+                map<string, int>::iterator it;
+                
+                for (int i = 0; i < thisGroupsSeqs.size(); i++) {
+                    
+                    if (pDataArray->m->control_pressed) {  break; }
+                    
+                    it = counts.find(thisGroupsSeqs[i].getName());
+                    if (it == counts.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
+                    else {
+                        sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), it->second));
+                        if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
+                    }
+                }
+            }else{
+                vector<Sequence> thisGroupsSeqs = parser->getSeqs(pDataArray->groups[i]);
+                map<string, string> nameMap = parser->getNameMap(pDataArray->groups[i]);
+                map<string, string>::iterator it;
+                
+                for (int i = 0; i < thisGroupsSeqs.size(); i++) {
+                    
+                    if (pDataArray->m->control_pressed) {  break; }
+                    
+                    it = nameMap.find(thisGroupsSeqs[i].getName());
+                    if (it == nameMap.end()) { error = true; pDataArray->m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
+                    else {
+                        int num = pDataArray->m->getNumNames(it->second);
+                        sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num));
+                        if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
+                    }
+                }
+                
+            }
+            
                         
                         if (error) { pDataArray->m->control_pressed = true; }
                         
@@ -153,7 +190,7 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){
                         sort(sequences.rbegin(), sequences.rend());
                         ////////////////////////////////////////////////////////////////////////////////////////
  
-                       if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
+                       if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
                         
                         //int numSeqs = driver((outputFName + groups[i]), sequences, (accnos+groups[i]), numChimeras); - same function below
                         ////////////////////////////////////////////////////////////////////////////////////////
@@ -184,7 +221,7 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){
                         }
                         
                         int numSeqs = sequences.size();
-                       int alignLength = sequences[0].sequence.size();
+                       //int alignLength = sequences[0].sequence.size();
                         
                         ofstream chimeraFile;
                         ofstream accnosFile;
@@ -200,7 +237,7 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){
                         
                         for(int j=0;j<numSeqs;j++){     
                                 
-                               if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+                               if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
                                 
                                 vector<bool> restricted = chimeras;
                                 
@@ -217,7 +254,7 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){
                                 
                                 int comparisons = myPerseus.getAlignments(j, sequences, alignments, leftDiffs, leftMaps, rightDiffs, rightMaps, bestSingleIndex, bestSingleDiff, restricted);
                                 
-                               if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+                               if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
                                 
                                 int minMismatchToChimera, leftParentBi, rightParentBi, breakPointBi;
                                 
@@ -226,7 +263,7 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){
                                 if(comparisons >= 2){   
                                         minMismatchToChimera = myPerseus.getChimera(sequences, leftDiffs, rightDiffs, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight, restricted);
                                         
-                                       if (pDataArray->m->control_pressed) { delete parser;  pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+                                       if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; }  pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
                                         
                                         int minMismatchToTrimera = numeric_limits<int>::max();
                                         int leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB;
@@ -234,12 +271,12 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){
                                         if(minMismatchToChimera >= 3 && comparisons >= 3){
                                                 minMismatchToTrimera = myPerseus.getTrimera(sequences, leftDiffs, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight, restricted);
                                                 
-                                               if (pDataArray->m->control_pressed) { delete parser;  pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+                                               if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; }  pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
                                         }
                                         
                                         double singleDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, sequences[bestSingleIndex].sequence, dummyA, dummyB, correctModel);
                                         
-                                       if (pDataArray->m->control_pressed) { delete parser;  pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+                                       if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; }  pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
                                         
                                         string type;
                                         string chimeraRefSeq;
@@ -253,16 +290,16 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){
                                                 chimeraRefSeq = myPerseus.stitchBimera(alignments, leftParentBi, rightParentBi, breakPointBi, leftMaps, rightMaps);
                                         }
                                         
-                                       if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+                                       if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; }; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
                                         
                                         double chimeraDist = myPerseus.modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq, dummyA, dummyB, correctModel);
                                         
-                                       if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+                                       if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
                                         
                                         double cIndex = chimeraDist;//modeledPairwiseAlignSeqs(sequences[j].sequence, chimeraRefSeq);
                                         double loonIndex = myPerseus.calcLoonIndex(sequences[j].sequence, sequences[leftParentBi].sequence, sequences[rightParentBi].sequence, breakPointBi, binMatrix);                
                                         
-                                       if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
+                                       if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); chimeraFile.close(); pDataArray->m->mothurRemove(chimeraFileName); accnosFile.close(); pDataArray->m->mothurRemove(accnosFileName); return 0; }
                                         
                                         chimeraFile << j << '\t' << sequences[j].seqName << '\t' << bestSingleDiff << '\t' << bestSingleIndex << '\t' << sequences[bestSingleIndex].seqName << '\t';
                                         chimeraFile << minMismatchToChimera << '\t' << leftParentBi << '\t' << rightParentBi << '\t' << sequences[leftParentBi].seqName << '\t' << sequences[rightParentBi].seqName << '\t';
@@ -304,11 +341,11 @@ static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){
                         pDataArray->m->appendFiles(accnosFileName, pDataArray->accnos); pDataArray->m->mothurRemove(accnosFileName);
                         pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + pDataArray->groups[i] + ".");        pDataArray->m->mothurOutEndLine();                                      
                         
-                       if (pDataArray->m->control_pressed) { delete parser; pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
+                       if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } else { delete parser; } pDataArray->m->mothurRemove(pDataArray->outputFName); pDataArray->m->mothurRemove(pDataArray->accnos); return 0; }
                 }       
                 
                 pDataArray->count = totalSeqs;
-               delete parser;
+               if (pDataArray->hasCount) { delete cparser; } else { delete parser; } //free only the parser selected by hasCount; NOTE(review): assumes the other one was never allocated - confirm against the declarations earlier in this function
                 return totalSeqs;
                 
         }
diff --git a/chimeraslayercommand.cpp b/chimeraslayercommand.cpp

index 59dd0a55a27c13eb80dd521ea83516770c97ed4a..bd9bdbf2f3dcb4d5ed3ce3a7ced35dfe7a4a1ba5 100644 (file)
--- a/chimeraslayercommand.cpp
+++ b/chimeraslayercommand.cpp
@@ -11,14 +11,16 @@
  #include "deconvolutecommand.h"
  #include "referencedb.h"
  #include "sequenceparser.h"
+#include "counttable.h"
  
  //**********************************************************************************************************************
  vector<string> ChimeraSlayerCommand::setParameters(){  
         try {
                 CommandParameter ptemplate("reference", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptemplate);
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
+               CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
+         CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
                 CommandParameter pwindow("window", "Number", "", "50", "", "", "",false,false); parameters.push_back(pwindow);
                 CommandParameter pksize("ksize", "Number", "", "7", "", "", "",false,false); parameters.push_back(pksize);
                 CommandParameter pmatch("match", "Number", "", "5.0", "", "", "",false,false); parameters.push_back(pmatch);
@@ -57,10 +59,11 @@ string ChimeraSlayerCommand::getHelpString(){
                 string helpString = "";
                 helpString += "The chimera.slayer command reads a fastafile and referencefile and outputs potentially chimeric sequences.\n";
                 helpString += "This command was modeled after the chimeraSlayer written by the Broad Institute.\n";
-               helpString += "The chimera.slayer command parameters are fasta, name, template, processors, trim, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment, numwanted, blastlocation and realign.\n";
+               helpString += "The chimera.slayer command parameters are fasta, name, group, count, template, processors, trim, ksize, window, match, mismatch, divergence, minsim, mincov, minbs, minsnp, parents, search, iters, increment, numwanted, blastlocation and realign.\n";
                 helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n";
                 helpString += "The name parameter allows you to provide a name file, if you are using reference=self. \n";
                 helpString += "The group parameter allows you to provide a group file. The group file can be used with a namesfile and reference=self. When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n";
+        helpString += "The count parameter allows you to provide a count file. The count file can be used with reference=self. If your count file contains group information, when checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n"; //the constructor rejects count combined with name or with group
                 helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n";
                 helpString += "The reference parameter allows you to enter a reference file containing known non-chimeric sequences, and is required. You may also set template=self, in this case the abundant sequences will be used as potential parents. \n";
                 helpString += "The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n";
@@ -139,6 +142,8 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option)  {
         try {
                 abort = false; calledHelp = false;   
                 ReferenceDB* rdb = ReferenceDB::getInstance();
+        hasCount = false;
+        hasName = false;
                 
                 //allow user to run help
                 if(option == "help") { help(); abort = true; calledHelp = true; }
@@ -247,9 +252,8 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option)  {
                         
                         
                         //check for required parameters
-                       bool hasName = true;
                         namefile = validParameter.validFile(parameters, "name", false);
-                       if (namefile == "not found") { namefile = "";  hasName = false; }
+                       if (namefile == "not found") { namefile = "";   }
                         else { 
                                 m->splitAtDash(namefile, nameFileNames);
                                 
@@ -316,12 +320,91 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option)  {
                                                 }
                                         }
                                 }
+                       }
+            
+            if (nameFileNames.size() != 0) { hasName = true; }
+            
+            //check for required parameters
+            vector<string> countfileNames;
+                       countfile = validParameter.validFile(parameters, "count", false);
+                       if (countfile == "not found") { 
+                countfile = "";  
+                       }else { 
+                               m->splitAtDash(countfile, countfileNames);
                                 
-                               //make sure there is at least one valid file left
-                               if (nameFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid name files."); m->mothurOutEndLine(); abort = true; }
+                               //go through files and make sure they are good, if not, then disregard them
+                               for (int i = 0; i < countfileNames.size(); i++) {
+                                       
+                                       bool ignore = false;
+                                       if (countfileNames[i] == "current") { 
+                                               countfileNames[i] = m->getCountTableFile(); //an empty string is treated below as "no current count file"
+                                               if (countfileNames[i] != "") {  m->mothurOut("Using " + countfileNames[i] + " as input file for the count parameter where you had given current."); m->mothurOutEndLine(); } //test the count entry itself: nameFileNames can be empty here, so indexing it with i was out of range
+                                               else {  
+                                                       m->mothurOut("You have no current count file, ignoring current."); m->mothurOutEndLine(); ignore=true; 
+                                                       //erase from file list
+                                                       countfileNames.erase(countfileNames.begin()+i);
+                                                       i--;
+                                               }
+                                       }
+                                       
+                                       if (!ignore) {
+                                               
+                                               if (inputDir != "") {
+                                                       string path = m->hasPath(countfileNames[i]);
+                                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                                       if (path == "") {       countfileNames[i] = inputDir + countfileNames[i];               }
+                                               }
+                                               
+                                               int ableToOpen;
+                                               ifstream in;
+                                               
+                                               ableToOpen = m->openInputFile(countfileNames[i], in, "noerror");
+                                               
+                                               //if you can't open it, try default location
+                                               if (ableToOpen == 1) {
+                                                       if (m->getDefaultPath() != "") { //default path is set
+                                                               string tryPath = m->getDefaultPath() + m->getSimpleName(countfileNames[i]);
+                                                               m->mothurOut("Unable to open " + countfileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
+                                                               ifstream in2;
+                                                               ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+                                                               in2.close();
+                                                               countfileNames[i] = tryPath;
+                                                       }
+                                               }
+                                               
+                                               if (ableToOpen == 1) {
+                                                       if (m->getOutputDir() != "") { //default path is set
+                                                               string tryPath = m->getOutputDir() + m->getSimpleName(countfileNames[i]);
+                                                               m->mothurOut("Unable to open " + countfileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
+                                                               ifstream in2;
+                                                               ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+                                                               in2.close();
+                                                               countfileNames[i] = tryPath;
+                                                       }
+                                               }
+                                               
+                                               in.close();
+                                               
+                                               if (ableToOpen == 1) { 
+                                                       m->mothurOut("Unable to open " + countfileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); 
+                                                       //erase from file list
+                                                       countfileNames.erase(countfileNames.begin()+i);
+                                                       i--;
+                                               }else {
+                                                       m->setCountTableFile(countfileNames[i]);
+                                               }
+                                       }
+                               }
                         }
-                       
-                       if (hasName && (nameFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of namefiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; }
+            
+            if (countfileNames.size() != 0) { hasCount = true; }
+            
+                       //make sure there is at least one valid file left
+            if (hasName && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
+            
+            if (!hasName && hasCount) { nameFileNames = countfileNames; }
+            
+                       if ((hasCount || hasName) && (nameFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of name or count files does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; }
                         
                         bool hasGroup = true;
                         groupfile = validParameter.validFile(parameters, "group", false);
@@ -399,7 +482,7 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option)  {
                         
                         if (hasGroup && (groupFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of groupfiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; }
                         
-                       
+            if (hasGroup && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or group."); m->mothurOutEndLine(); abort = true; }                     
                         //if the user changes the output directory command factory will send this info to us in the output parameter 
                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
                         
@@ -449,6 +532,12 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option)  {
                                         m->mothurOutEndLine();
                                         save = false;
                                 }
+                       }else if (hasCount) {  templatefile = "self"; 
+                               if (save) {
+                                       m->mothurOut("[WARNING]: You can't save reference=self, ignoring save."); 
+                                       m->mothurOutEndLine();
+                                       save = false;
+                               }
                         }
                         else { 
                                 if (rdb->referenceSeqs.size() != 0) {
@@ -551,7 +640,7 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option)  {
  
                         if ((search != "blast") && (search != "kmer")) { m->mothurOut(search + " is not a valid search."); m->mothurOutEndLine(); abort = true;  }
                         
-                       if (hasName && (templatefile != "self")) { m->mothurOut("You have provided a namefile and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; }
+                       if ((hasName || hasCount) && (templatefile != "self")) { m->mothurOut("You have provided a namefile or countfile and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; }
                         if (hasGroup && (templatefile != "self")) { m->mothurOut("You have provided a group file and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; }
  
                         //until we resolve the issue 10-18-11
@@ -599,13 +688,23 @@ int ChimeraSlayerCommand::execute(){
                         map<string, string> fileGroup;
                         fileToPriority[fastaFileNames[s]] = priority; //default
                         fileGroup[fastaFileNames[s]] = "noGroup";
-                       SequenceParser* parser = NULL;
+            map<string, string> uniqueNames; 
                         int totalChimeras = 0;
                         lines.clear();
                         
-                       if (templatefile == "self") { setUpForSelfReference(parser, fileGroup, fileToPriority, s); }
+                       if (templatefile == "self") { 
+                if (hasCount) {
+                    SequenceCountParser* parser = NULL;
+                    setUpForSelfReference(parser, fileGroup, fileToPriority, s); 
+                    if (parser != NULL) { uniqueNames = parser->getAllSeqsMap(); delete parser; }
+                }else {
+                    SequenceParser* parser = NULL;
+                    setUpForSelfReference(parser, fileGroup, fileToPriority, s); 
+                    if (parser != NULL) { uniqueNames = parser->getAllSeqsMap(); delete parser; }
+                }
+            }
                         
-                       if (m->control_pressed) {  if (parser != NULL) { delete parser; } for (int j = 0; j < outputNames.size(); j++) {        m->mothurRemove(outputNames[j]);        }  return 0;    }
+                       if (m->control_pressed) {   for (int j = 0; j < outputNames.size(); j++) {      m->mothurRemove(outputNames[j]);        }  return 0;    }
  
                         if (fileToPriority.size() == 1) { //you running without a groupfile
                                 itFile = fileToPriority.begin();
@@ -637,7 +736,7 @@ int ChimeraSlayerCommand::execute(){
                                 if(processors == 1){ numSeqs = driver(lines[0], outputFileName, thisFastaName, accnosFileName, trimFastaFileName, thisPriority);  }
                                 else{ numSeqs = createProcesses(outputFileName, thisFastaName, accnosFileName, trimFastaFileName, thisPriority); }
                                 
-                               if (m->control_pressed) { if (parser != NULL) { delete parser; }  outputTypes.clear(); if (trim) { m->mothurRemove(trimFastaFileName); } m->mothurRemove(outputFileName); m->mothurRemove(accnosFileName); for (int j = 0; j < outputNames.size(); j++) {       m->mothurRemove(outputNames[j]);        }  return 0; }                          
+                               if (m->control_pressed) {  outputTypes.clear(); if (trim) { m->mothurRemove(trimFastaFileName); } m->mothurRemove(outputFileName); m->mothurRemove(accnosFileName); for (int j = 0; j < outputNames.size(); j++) {      m->mothurRemove(outputNames[j]);        }  return 0; }                          
  #endif
                         }else { //you have provided a groupfile
  #ifdef USE_MPI 
@@ -653,16 +752,13 @@ int ChimeraSlayerCommand::execute(){
                                 
                                 if (pid == 0) {
  #endif
-                               
-                               totalChimeras = deconvoluteResults(parser, outputFileName, accnosFileName, trimFastaFileName);
+                               totalChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName, trimFastaFileName);
                  m->mothurOutEndLine(); m->mothurOut(toString(totalChimeras) + " chimera found."); m->mothurOutEndLine();
  #ifdef USE_MPI 
                                 }
                                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait
  #endif
                         }
-       
-                       if (parser != NULL) { delete parser; } 
                         
              m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences.");   m->mothurOutEndLine();
                 }
@@ -919,9 +1015,8 @@ int ChimeraSlayerCommand::MPIExecute(string inputFile, string outputFileName, st
         }
  }
  //**********************************************************************************************************************
-int ChimeraSlayerCommand::deconvoluteResults(SequenceParser* parser, string outputFileName, string accnosFileName, string trimFileName){
+int ChimeraSlayerCommand::deconvoluteResults(map<string, string>& uniqueNames, string outputFileName, string accnosFileName, string trimFileName){
         try {
-               map<string, string> uniqueNames = parser->getAllSeqsMap();
                 map<string, string>::iterator itUnique;
                 int total = 0;
          
@@ -1169,7 +1264,51 @@ int ChimeraSlayerCommand::setUpForSelfReference(SequenceParser*& parser, map<str
                 exit(1);
         }
  }
-
+//**********************************************************************************************************************
+int ChimeraSlayerCommand::setUpForSelfReference(SequenceCountParser*& parser, map<string, string>& fileGroup, map<string, map<string, int> >& fileToPriority, int s){
+       try { //count-file overload: rebuilds fileGroup and fileToPriority for fastaFileNames[s]; every non-exception path returns 0
+               fileGroup.clear();
+               fileToPriority.clear();
+               
+               string nameFile = "";
+               if (nameFileNames.size() != 0) { //the constructor copies the count file names into nameFileNames when count is given without name
+                       nameFile = nameFileNames[s];
+               }else {  m->control_pressed = true; return 0; } //no file to read: raise control_pressed and leave both maps empty
+         
+               CountTable ct;
+               if (!ct.testGroups(nameFile)) {  //presumably testGroups reports whether the count file carries group information - confirm in CountTable
+                       if (processors != 1) { m->mothurOut("When using template=self, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; }
+            
+                       //sort fastafile by abundance; the returned map<string, int> is kept in the priority member
+                       m->mothurOut("Sorting fastafile according to abundance..."); cout.flush(); 
+                       priority = sortFastaFile(fastaFileNames[s], nameFile);
+                       m->mothurOut("Done."); m->mothurOutEndLine();
+                       
+                       fileToPriority[fastaFileNames[s]] = priority;
+                       fileGroup[fastaFileNames[s]] = "noGroup"; //parser is not allocated on this path
+               }else {
+                       //Parse sequences by group; parser is left allocated here and execute() deletes it after copying getAllSeqsMap()
+                       parser = new SequenceCountParser(nameFile, fastaFileNames[s]);
+                       vector<string> groups = parser->getNamesOfGroups();
+                       
+                       for (int i = 0; i < groups.size(); i++) { //one temporary sorted fasta file per group
+                               vector<Sequence> thisGroupsSeqs = parser->getSeqs(groups[i]);
+                               map<string, int> thisGroupsMap = parser->getCountTable(groups[i]);
+                               string newFastaFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + groups[i] + "-sortedTemp.fasta";
+                               sortFastaFile(thisGroupsSeqs, thisGroupsMap, newFastaFile); //return value is not checked here
+                               fileToPriority[newFastaFile] = thisGroupsMap; //both maps are keyed by the temporary file name
+                               fileGroup[newFastaFile] = groups[i];
+                       }
+               }
+               
+               
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ChimeraSlayerCommand", "setUpForSelfReference");
+               exit(1);
+       }
+}
  //**********************************************************************************************************************
  string ChimeraSlayerCommand::getNamesFile(string& inputFile){
         try {
@@ -1820,9 +1959,22 @@ map<string, int> ChimeraSlayerCommand::sortFastaFile(string fastaFile, string na
                 
                 in.close();
                 
-               //read namefile
+               //read namefile or countfile
                 vector<seqPriorityNode> nameMapCount;
-               int error = m->readNames(nameFile, nameMapCount, seqs);
+        int error = 0; //start from "no problem": the count path below only ever assigns 1, so the value must not be left indeterminate
+        if (hasCount) { 
+            CountTable ct;
+            ct.readTable(nameFile); //nameFile holds the count file name on this path
+            
+            for(map<string, string>::iterator it = seqs.begin(); it != seqs.end(); it++) {
+                int num = ct.getNumSeqs(it->first);
+                if (num == 0) { error = 1; } //presumably a fasta sequence with no abundance in the count table - confirm getNumSeqs semantics
+                else {
+                    seqPriorityNode temp(num, it->second, it->first);
+                    nameMapCount.push_back(temp);
+                }
+            }
+        }else { error = m->readNames(nameFile, nameMapCount, seqs); }
                 
                 if (m->control_pressed) { return nameAbund; }
                 
@@ -1904,4 +2056,51 @@ map<string, int> ChimeraSlayerCommand::sortFastaFile(vector<Sequence>& thisseqs,
         }
  }
  /**************************************************************************************************/
+//Writes thisseqs to newFile in fasta format, ordered by compareSeqPriorityNodes using each sequence's entry in countMap.
+//Returns 0 on every non-exception path; on a problem m->control_pressed is set and the function returns before newFile is opened.
+int ChimeraSlayerCommand::sortFastaFile(vector<Sequence>& thisseqs, map<string, int>& countMap, string newFile) {
+       try {
+               vector<seqPriorityNode> nameVector;
+               
+               //pair every sequence with its count
+               for (int i = 0; i < thisseqs.size(); i++) {
+                       
+                       if (m->control_pressed) { return 0; }
+                       
+                       map<string, int>::iterator itCountMap = countMap.find(thisseqs[i].getName());
+                       
+                       if (itCountMap == countMap.end()){
+                               m->control_pressed = true; //no return here: the check at the top of the next pass, or the one after the sort, ends the function
+                               m->mothurOut("[ERROR]: " + thisseqs[i].getName() + " is in your fastafile, but is not in your count file, please correct."); m->mothurOutEndLine();
+                       }else {
+                seqPriorityNode temp(itCountMap->second, thisseqs[i].getAligned(), thisseqs[i].getName());
+                               nameVector.push_back(temp);
+                       }
+               }
+        
+               //sort by num represented
+               sort(nameVector.begin(), nameVector.end(), compareSeqPriorityNodes);
+        
+               if (m->control_pressed) { return 0; }
+               
+               if (thisseqs.size() != nameVector.size()) { m->mothurOut( "The number of sequences in your fastafile does not match the number of sequences in your count file, aborting."); m->mothurOutEndLine(); m->control_pressed = true; return 0; }
+        
+               ofstream out;
+               m->openOutputFile(newFile, out);
+               
+               //print new file in sorted order
+               for (int i = 0; i < nameVector.size(); i++) {
+                       out << ">" << nameVector[i].name << endl << nameVector[i].seq << endl;
+               }
+               out.close();
+               
+               return 0;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ChimeraSlayerCommand", "sortFastaFile");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
  
diff --git a/chimeraslayercommand.h b/chimeraslayercommand.h

index 82e1228606986b18d7176a5cdf70e84a83cb695f..4d53c5c048dafe1f969437a384ea93a160a83c04 100644 (file)
--- a/chimeraslayercommand.h
+++ b/chimeraslayercommand.h
@@ -15,6 +15,7 @@
  #include "chimera.h"
  #include "chimeraslayer.h"
  #include "sequenceparser.h"
+#include "sequencecountparser.h"
  
  /***********************************************************/
  
@@ -51,12 +52,14 @@ private:
         int divideInHalf(Sequence, string&, string&);
         map<string, int> sortFastaFile(string, string);
         map<string, int> sortFastaFile(vector<Sequence>&, map<string, string>&, string newFile);
+    int sortFastaFile(vector<Sequence>&, map<string, int>&, string newFile);
         string getNamesFile(string&);
         //int setupChimera(string,);
         int MPIExecute(string, string, string, string, map<string, int>&);
-       int deconvoluteResults(SequenceParser*, string, string, string);
+       int deconvoluteResults(map<string, string>&, string, string, string);
         map<string, int> priority;
         int setUpForSelfReference(SequenceParser*&, map<string, string>&, map<string, map<string, int> >&, int);
+    int setUpForSelfReference(SequenceCountParser*&, map<string, string>&, map<string, map<string, int> >&, int);
         int driverGroups(string, string, string, map<string, map<string, int> >&, map<string, string>&);
         int createProcessesGroups(string, string, string, map<string, map<string, int> >&, map<string, string>&);
         int MPIExecuteGroups(string, string, string, map<string, map<string, int> >&, map<string, string>&);
@@ -66,8 +69,8 @@ private:
         int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&, string, map<string, int>&, bool);
         #endif
  
-       bool abort, realign, trim, trimera, save;
-       string fastafile, groupfile, templatefile, outputDir, search, namefile, blastlocation;
+       bool abort, realign, trim, trimera, save, hasName, hasCount;
+       string fastafile, groupfile, templatefile, outputDir, search, namefile, countfile, blastlocation;
         int processors, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity, minCoverage, minBS, minSNP, numSeqs, templateSeqsLength;
         float divR;
         
diff --git a/chimerauchimecommand.cpp b/chimerauchimecommand.cpp

index bd31c193e82aa0725cc8920336ec310cc19db820..ae011906ded5c95b3aec8bfb57047e27ae42392b 100644 (file)
--- a/chimerauchimecommand.cpp
+++ b/chimerauchimecommand.cpp
@@ -19,8 +19,9 @@ vector<string> ChimeraUchimeCommand::setParameters(){
         try {
                 CommandParameter ptemplate("reference", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptemplate);
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
+               CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
@@ -34,6 +35,8 @@ vector<string> ChimeraUchimeCommand::setParameters(){
                 CommandParameter pchunks("chunks", "Number", "", "4", "", "", "",false,false); parameters.push_back(pchunks);
                 CommandParameter pminchunk("minchunk", "Number", "", "64", "", "", "",false,false); parameters.push_back(pminchunk);
                 CommandParameter pidsmoothwindow("idsmoothwindow", "Number", "", "32", "", "", "",false,false); parameters.push_back(pidsmoothwindow);
+        CommandParameter pdups("dereplicate", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pdups);
+
                 //CommandParameter pminsmoothid("minsmoothid", "Number", "", "0.95", "", "", "",false,false); parameters.push_back(pminsmoothid);
                 CommandParameter pmaxp("maxp", "Number", "", "2", "", "", "",false,false); parameters.push_back(pmaxp);
                 CommandParameter pskipgaps("skipgaps", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pskipgaps);
@@ -58,11 +61,13 @@ string ChimeraUchimeCommand::getHelpString(){
                 string helpString = "";
                 helpString += "The chimera.uchime command reads a fastafile and referencefile and outputs potentially chimeric sequences.\n";
                 helpString += "This command is a wrapper for uchime written by Robert C. Edgar.\n";
-               helpString += "The chimera.uchime command parameters are fasta, name, reference, processors, abskew, chimealns, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, skipgaps, skipgaps2, minlen, maxlen, ucl and queryfact.\n";
+               helpString += "The chimera.uchime command parameters are fasta, name, count, reference, processors, dereplicate, abskew, chimealns, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, skipgaps, skipgaps2, minlen, maxlen, ucl and queryfact.\n";
                 helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n";
                 helpString += "The name parameter allows you to provide a name file, if you are using template=self. \n";
+        helpString += "The count parameter allows you to provide a count file, if you are using template=self. \n";
                 helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n";
                 helpString += "The group parameter allows you to provide a group file. The group file can be used with a namesfile and reference=self. When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n";
+        helpString += "If the dereplicate parameter is false, then if one group finds the sequence to be chimeric, then all groups find it to be chimeric, default=f.\n";
                 helpString += "The reference parameter allows you to enter a reference file containing known non-chimeric sequences, and is required. You may also set template=self, in this case the abundant sequences will be used as potential parents. \n";
                 helpString += "The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n";
                 helpString += "The abskew parameter can only be used with template=self. Minimum abundance skew. Default 1.9. Abundance skew is: min [ abund(parent1), abund(parent2) ] / abund(query).\n";
@@ -137,7 +142,7 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(){
  //***************************************************************************************************************
  ChimeraUchimeCommand::ChimeraUchimeCommand(string option)  {
         try {
-               abort = false; calledHelp = false; 
+               abort = false; calledHelp = false; hasName=false; hasCount=false;
                 ReferenceDB* rdb = ReferenceDB::getInstance();
                 
                 //allow user to run help
@@ -247,9 +252,8 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(string option)  {
                         
                         
                         //check for required parameters
-                       bool hasName = true;
                         namefile = validParameter.validFile(parameters, "name", false);
-                       if (namefile == "not found") { namefile = "";  hasName = false; }
+                       if (namefile == "not found") { namefile = "";   }
                         else { 
                                 m->splitAtDash(namefile, nameFileNames);
                                 
@@ -316,12 +320,91 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(string option)  {
                                                 }
                                         }
                                 }
+                       }
+            
+            if (nameFileNames.size() != 0) { hasName = true; }
+            
+            //check for required parameters
+            vector<string> countfileNames;
+                       countfile = validParameter.validFile(parameters, "count", false);
+                       if (countfile == "not found") { 
+                countfile = "";  
+                       }else { 
+                               m->splitAtDash(countfile, countfileNames);
                                 
-                               //make sure there is at least one valid file left
-                               if (nameFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid name files."); m->mothurOutEndLine(); abort = true; }
+                               //go through files and make sure they are good, if not, then disregard them
+                               for (int i = 0; i < countfileNames.size(); i++) {
+                                       
+                                       bool ignore = false;
+                                       if (countfileNames[i] == "current") { //user asked for whatever count file mothur currently has saved
+                                               countfileNames[i] = m->getCountTableFile(); 
+                                               if (countfileNames[i] != "") {  m->mothurOut("Using " + countfileNames[i] + " as input file for the count parameter where you had given current."); m->mothurOutEndLine(); } //test the count entry just fetched, not nameFileNames (which may be empty or shorter)
+                                               else {  
+                                                       m->mothurOut("You have no current count file, ignoring current."); m->mothurOutEndLine(); ignore=true; 
+                                                       //erase from file list
+                                                       countfileNames.erase(countfileNames.begin()+i);
+                                                       i--; //step back so the element shifted into slot i is not skipped
+                                               }
+                                       }
+                                       
+                                       if (!ignore) {
+                                               
+                                               if (inputDir != "") {
+                                                       string path = m->hasPath(countfileNames[i]);
+                                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                                       if (path == "") {       countfileNames[i] = inputDir + countfileNames[i];               }
+                                               }
+                                               
+                                               int ableToOpen;
+                                               ifstream in;
+                                               
+                                               ableToOpen = m->openInputFile(countfileNames[i], in, "noerror");
+                                               
+                                               //if you can't open it, try default location
+                                               if (ableToOpen == 1) {
+                                                       if (m->getDefaultPath() != "") { //default path is set
+                                                               string tryPath = m->getDefaultPath() + m->getSimpleName(countfileNames[i]);
+                                                               m->mothurOut("Unable to open " + countfileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
+                                                               ifstream in2;
+                                                               ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+                                                               in2.close();
+                                                               countfileNames[i] = tryPath;
+                                                       }
+                                               }
+                                               
+                                               if (ableToOpen == 1) {
+                                                       if (m->getOutputDir() != "") { //default path is set
+                                                               string tryPath = m->getOutputDir() + m->getSimpleName(countfileNames[i]);
+                                                               m->mothurOut("Unable to open " + countfileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
+                                                               ifstream in2;
+                                                               ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+                                                               in2.close();
+                                                               countfileNames[i] = tryPath;
+                                                       }
+                                               }
+                                               
+                                               in.close();
+                                               
+                                               if (ableToOpen == 1) { 
+                                                       m->mothurOut("Unable to open " + countfileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); 
+                                                       //erase from file list
+                                                       countfileNames.erase(countfileNames.begin()+i);
+                                                       i--;
+                                               }else {
+                                                       m->setCountTableFile(countfileNames[i]);
+                                               }
+                                       }
+                               }
                         }
-                       
-                       if (hasName && (nameFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of namefiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; }
+            
+            if (countfileNames.size() != 0) { hasCount = true; }
+            
+                       //make sure there is at least one valid file left
+            if (hasName && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
+            
+            if (!hasName && hasCount) { nameFileNames = countfileNames; }
+            
+                       if ((hasCount || hasName) && (nameFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of name or count files does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; }
                         
                         bool hasGroup = true;
                         groupfile = validParameter.validFile(parameters, "group", false);
@@ -399,6 +482,10 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(string option)  {
                         
                         if (hasGroup && (groupFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of groupfiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; }
                         
+            if (hasGroup && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or group."); m->mothurOutEndLine(); abort = true; }                     
+                       //if the user changes the output directory command factory will send this info to us in the output parameter 
+                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
+                       
                         
                         //if the user changes the output directory command factory will send this info to us in the output parameter 
                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
@@ -427,6 +514,7 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(string option)  {
                                         }
                                 }
                         }else if (hasName) {  templatefile = "self"; }
+            else if (hasCount) {  templatefile = "self"; }
                         else { 
                                 if (rdb->getSavedReference() != "") {
                                         templatefile = rdb->getSavedReference();
@@ -472,6 +560,15 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(string option)  {
  
                         temp = validParameter.validFile(parameters, "skipgaps2", false);                                if (temp == "not found") { temp = "t"; }
                         skipgaps2 = m->isTrue(temp); 
+            
+            string usedDups = "false";
+                       temp = validParameter.validFile(parameters, "dereplicate", false);      
+                       if (temp == "not found") { 
+                               if (groupfile != "")    {  temp = "false";                                      }
+                               else                    {  temp = "true"; usedDups = "";        }
+                       }
+                       dups = m->isTrue(temp);
+
                         
                         if (hasName && (templatefile != "self")) { m->mothurOut("You have provided a namefile and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; }
                         if (hasGroup && (templatefile != "self")) { m->mothurOut("You have provided a group file and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; }
@@ -533,7 +630,8 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(string option)  {
  
  int ChimeraUchimeCommand::execute(){
         try{
-               if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
+        
+        if (abort == true) { if (calledHelp) { return 0; }  return 2;  }
                 
                 m->mothurOut("\nuchime by Robert C. Edgar\nhttp://drive5.com/uchime\nThis code is donated to the public domain.\n\n");
                 
@@ -551,9 +649,14 @@ int ChimeraUchimeCommand::execute(){
                                 
                         //you provided a groupfile
                         string groupFile = "";
-                       if (groupFileNames.size() != 0) { groupFile = groupFileNames[s]; }
+            bool hasGroup = false;
+                       if (groupFileNames.size() != 0) { groupFile = groupFileNames[s]; hasGroup = true; }
+            else if (hasCount) {
+                CountTable ct;
+                if (ct.testGroups(nameFileNames[s])) { hasGroup = true; }
+            }
                         
-                       if ((templatefile == "self") && (groupFile == "")) { //you want to run uchime with a reference template
+                       if ((templatefile == "self") && (!hasGroup)) { //you want to run uchime with a template=self and no groups
  
                                 if (processors != 1) { m->mothurOut("When using template=self, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; }
                                 if (nameFileNames.size() != 0) { //you provided a namefile and we don't need to create one
@@ -565,7 +668,21 @@ int ChimeraUchimeCommand::execute(){
  
                                 //read namefile
                                 vector<seqPriorityNode> nameMapCount;
-                               int error = m->readNames(nameFile, nameMapCount, seqs); if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) {        m->mothurRemove(outputNames[j]);        }  return 0; }
+                int error = 0; //must start defined: the count branch only ever assigns it on failure, and it is read right after this block
+                if (hasCount) {
+                    CountTable ct;
+                    ct.readTable(nameFile); //nameFile holds the count table name when hasCount is set
+                    for(map<string, string>::iterator it = seqs.begin(); it != seqs.end(); it++) {
+                        int num = ct.getNumSeqs(it->first);
+                        if (num == 0) { error = 1; } //a zero count for a fasta sequence is flagged as an error
+                        else {
+                            seqPriorityNode temp(num, it->second, it->first);
+                            nameMapCount.push_back(temp);
+                        }
+                    }
+                }else {
+                    error = m->readNames(nameFile, nameMapCount, seqs); if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) {       m->mothurRemove(outputNames[j]);        }  return 0; }
+                }
                                 if (error == 1) { for (int j = 0; j < outputNames.size(); j++) {        m->mothurRemove(outputNames[j]);        }  return 0; }
                                 if (seqs.size() != nameMapCount.size()) { m->mothurOut( "The number of sequences in your fastafile does not match the number of sequences in your namefile, aborting."); m->mothurOutEndLine(); for (int j = 0; j < outputNames.size(); j++) {  m->mothurRemove(outputNames[j]);        }  return 0; }
                                 
@@ -575,14 +692,23 @@ int ChimeraUchimeCommand::execute(){
                         
                         if (m->control_pressed) {  for (int j = 0; j < outputNames.size(); j++) {       m->mothurRemove(outputNames[j]);        }  return 0;    }                               
                         
-                       if (groupFile != "") {
+                       if (hasGroup) {
                                 if (nameFileNames.size() != 0) { //you provided a namefile and we don't need to create one
                                         nameFile = nameFileNames[s];
                                 }else { nameFile = getNamesFile(fastaFileNames[s]); }
                                 
                                 //Parse sequences by group
-                               SequenceParser parser(groupFile, fastaFileNames[s], nameFile);
-                               vector<string> groups = parser.getNamesOfGroups();
+                vector<string> groups;
+                map<string, string> uniqueNames;
+                if (hasCount) {
+                    cparser = new SequenceCountParser(nameFile, fastaFileNames[s]);
+                    groups = cparser->getNamesOfGroups();
+                    uniqueNames = cparser->getAllSeqsMap();
+                }else{
+                    sparser = new SequenceParser(groupFile, fastaFileNames[s], nameFile);
+                    groups = sparser->getNamesOfGroups();
+                    uniqueNames = sparser->getAllSeqsMap();
+                }
                                         
                                 if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) {        m->mothurRemove(outputNames[j]);        }  return 0; }
                                                                 
@@ -593,16 +719,20 @@ int ChimeraUchimeCommand::execute(){
                                 if (chimealns) { m->openOutputFile(alnsFileName, out2); out2.close(); }
                                 int totalSeqs = 0;
                                 
-                               if(processors == 1)     {       totalSeqs = driverGroups(parser, outputFileName, newFasta, accnosFileName, alnsFileName, 0, groups.size(), groups);     }
-                               else                            {       totalSeqs = createProcessesGroups(parser, outputFileName, newFasta, accnosFileName, alnsFileName, groups, nameFile, groupFile, fastaFileNames[s]);                      }
+                               if(processors == 1)     {       totalSeqs = driverGroups(outputFileName, newFasta, accnosFileName, alnsFileName, 0, groups.size(), groups);     }
+                               else                            {       totalSeqs = createProcessesGroups(outputFileName, newFasta, accnosFileName, alnsFileName, groups, nameFile, groupFile, fastaFileNames[s]);                      }
  
                                 if (m->control_pressed) {  for (int j = 0; j < outputNames.size(); j++) {       m->mothurRemove(outputNames[j]);        }  return 0;    }                               
-
-                               int totalChimeras = deconvoluteResults(parser, outputFileName, accnosFileName, alnsFileName);
-                               
-                               m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(totalSeqs) + " sequences. " + toString(totalChimeras) + " chimeras were found.");  m->mothurOutEndLine();
-                               m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine(); 
+                if (hasCount) { delete cparser; }
+                else { delete sparser; }
+                
+                if (!dups) { 
+                    int totalChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName, alnsFileName);
                                 
+                    m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(totalSeqs) + " sequences. " + toString(totalChimeras) + " chimeras were found.");     m->mothurOutEndLine();
+                    m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine(); 
+                               }
+                
                                 if (m->control_pressed) {  for (int j = 0; j < outputNames.size(); j++) {       m->mothurRemove(outputNames[j]);        }  return 0;    }                               
                                         
                         }else{
@@ -657,9 +787,8 @@ int ChimeraUchimeCommand::execute(){
         }
  }
  //**********************************************************************************************************************
-int ChimeraUchimeCommand::deconvoluteResults(SequenceParser& parser, string outputFileName, string accnosFileName, string alnsFileName){
+int ChimeraUchimeCommand::deconvoluteResults(map<string, string>& uniqueNames, string outputFileName, string accnosFileName, string alnsFileName){
         try {
-               map<string, string> uniqueNames = parser.getAllSeqsMap();
                 map<string, string>::iterator itUnique;
                 int total = 0;
                 
@@ -685,7 +814,7 @@ int ChimeraUchimeCommand::deconvoluteResults(SequenceParser& parser, string outp
                         //find unique name
                         itUnique = uniqueNames.find(name);
                         
-                       if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing accnos results. Cannot find "+ name + "."); m->mothurOutEndLine(); m->control_pressed = true; }
+                       if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing accnos results. Cannot find " + name + "."); m->mothurOutEndLine(); m->control_pressed = true; }
                         else {
                                 itChimeras = chimerasInFile.find((itUnique->second));
                                 
@@ -999,7 +1128,7 @@ string ChimeraUchimeCommand::getNamesFile(string& inputFile){
         }
  }
  //**********************************************************************************************************************
-int ChimeraUchimeCommand::driverGroups(SequenceParser& parser, string outputFName, string filename, string accnos, string alns, int start, int end, vector<string> groups){
+int ChimeraUchimeCommand::driverGroups(string outputFName, string filename, string accnos, string alns, int start, int end, vector<string> groups){
         try {
                 
                 int totalSeqs = 0;
@@ -1007,8 +1136,10 @@ int ChimeraUchimeCommand::driverGroups(SequenceParser& parser, string outputFNam
                 
                 for (int i = start; i < end; i++) {
                         int start = time(NULL);  if (m->control_pressed) {  return 0; }
-                       
-                       int error = parser.getSeqs(groups[i], filename, true); if ((error == 1) || m->control_pressed) {  return 0; }
+            
+                       int error;
+            if (hasCount) { error = cparser->getSeqs(groups[i], filename, true); if ((error == 1) || m->control_pressed) {  return 0; } }
+            else { error = sparser->getSeqs(groups[i], filename, true); if ((error == 1) || m->control_pressed) {  return 0; } }
                         
                         int numSeqs = driver((outputFName + groups[i]), filename, (accnos+ groups[i]), (alns+ groups[i]), numChimeras);
                         totalSeqs += numSeqs;
@@ -1026,7 +1157,6 @@ int ChimeraUchimeCommand::driverGroups(SequenceParser& parser, string outputFNam
                         
                         m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + groups[i] + ".");    m->mothurOutEndLine();                                  
                 }       
-               
                 return totalSeqs;
                 
         }
@@ -1052,29 +1182,20 @@ int ChimeraUchimeCommand::driver(string outputFName, string filename, string acc
                 vector<char*> cPara;
                 
                 string uchimeCommand = uchimeLocation;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
-               uchimeCommand += " ";
-#else
-               uchimeCommand = "\"" + uchimeCommand + "\"";
-#endif
-               
-               char* tempUchime;
+        uchimeCommand = "\"" + uchimeCommand + "\" ";
+        
+        char* tempUchime;
                 tempUchime= new char[uchimeCommand.length()+1]; 
                 *tempUchime = '\0';
                 strncat(tempUchime, uchimeCommand.c_str(), uchimeCommand.length());
                 cPara.push_back(tempUchime);
                 
-               char* tempIn = new char[8]; 
-               *tempIn = '\0'; strncat(tempIn, "--input", 7);
-               //strcpy(tempIn, "--input"); 
-               cPara.push_back(tempIn);
-               char* temp = new char[filename.length()+1];
-               *temp = '\0'; strncat(temp, filename.c_str(), filename.length());
-               //strcpy(temp, filename.c_str());
-               cPara.push_back(temp);
-               
-               //are you using a reference file
+        //are you using a reference file
                 if (templatefile != "self") {
+            string outputFileName = filename.substr(1, filename.length()-2) + ".uchime_formatted";
+            prepFile(filename.substr(1, filename.length()-2), outputFileName);
+            filename = outputFileName;
+            filename = "\"" + filename + "\"";
                         //add reference file
                         char* tempRef = new char[5]; 
                         //strcpy(tempRef, "--db"); 
@@ -1086,6 +1207,15 @@ int ChimeraUchimeCommand::driver(string outputFName, string filename, string acc
                         cPara.push_back(tempR);
                 }
                 
+               char* tempIn = new char[8]; 
+               *tempIn = '\0'; strncat(tempIn, "--input", 7);
+               //strcpy(tempIn, "--input"); 
+               cPara.push_back(tempIn);
+               char* temp = new char[filename.length()+1];
+               *temp = '\0'; strncat(temp, filename.c_str(), filename.length());
+               //strcpy(temp, filename.c_str());
+               cPara.push_back(temp);
+               
                 char* tempO = new char[12]; 
                 *tempO = '\0'; strncat(tempO, "--uchimeout", 11);
                 //strcpy(tempO, "--uchimeout"); 
@@ -1339,6 +1469,8 @@ int ChimeraUchimeCommand::driver(string outputFName, string filename, string acc
                 in.close();
                 out.close();
                 
+        //if (templatefile != "self") {  m->mothurRemove(filename); }
+        
                 return num;
         }
         catch(exception& e) {
@@ -1347,6 +1479,34 @@ int ChimeraUchimeCommand::driver(string outputFName, string filename, string acc
         }
  }
  /**************************************************************************************************/
+//uchime can't handle some of the things allowed in mothur's fasta files, so copy filename to output by re-printing each record through Sequence.
+int ChimeraUchimeCommand::prepFile(string filename, string output) {
+       try {
+        
+        ifstream rawFasta;
+        m->openInputFile(filename, rawFasta);
+        
+        ofstream cleanedFasta;
+        m->openOutputFile(output, cleanedFasta);
+        
+        //stop at end of input or as soon as the user interrupts; always returns 0
+        while (!rawFasta.eof() && !m->control_pressed) {
+            
+            Sequence current(rawFasta); m->gobble(rawFasta);
+            
+            if (current.getName() == "") { continue; } //records without a name are dropped
+            current.printSequence(cleanedFasta);
+        }
+        rawFasta.close();
+        cleanedFasta.close();
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "ChimeraUchimeCommand", "prepFile");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
  
  int ChimeraUchimeCommand::createProcesses(string outputFileName, string filename, string accnos, string alns, int& numChimeras) {
         try {
@@ -1467,7 +1627,7 @@ int ChimeraUchimeCommand::createProcesses(string outputFileName, string filename
                         string extension = toString(i) + ".temp";
                         
                         uchimeData* tempUchime = new uchimeData(outputFileName+extension, uchimeLocation, templatefile, files[i], "", "", "", accnos+extension, alns+extension, dummy, m, 0, 0,  i);
-                       tempUchime->setBooleans(useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract);
+                       tempUchime->setBooleans(useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract, hasCount);
                         tempUchime->setVariables(abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract);
                         
                         pDataArray.push_back(tempUchime);
@@ -1519,7 +1679,7 @@ int ChimeraUchimeCommand::createProcesses(string outputFileName, string filename
  }
  /**************************************************************************************************/
  
-int ChimeraUchimeCommand::createProcessesGroups(SequenceParser& parser, string outputFName, string filename, string accnos, string alns, vector<string> groups, string nameFile, string groupFile, string fastaFile) {
+int ChimeraUchimeCommand::createProcessesGroups(string outputFName, string filename, string accnos, string alns, vector<string> groups, string nameFile, string groupFile, string fastaFile) {
         try {
                 
                 processIDS.clear();
@@ -1549,7 +1709,7 @@ int ChimeraUchimeCommand::createProcessesGroups(SequenceParser& parser, string o
                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
                                 process++;
                         }else if (pid == 0){
-                               num = driverGroups(parser, outputFName + toString(getpid()) + ".temp", filename + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", alns + toString(getpid()) + ".temp", lines[process].start, lines[process].end, groups);
+                               num = driverGroups(outputFName + toString(getpid()) + ".temp", filename + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", alns + toString(getpid()) + ".temp", lines[process].start, lines[process].end, groups);
                                 
                                 //pass numSeqs to parent
                                 ofstream out;
@@ -1567,7 +1727,7 @@ int ChimeraUchimeCommand::createProcessesGroups(SequenceParser& parser, string o
                 }
                 
                 //do my part
-               num = driverGroups(parser, outputFName, filename, accnos, alns, lines[0].start, lines[0].end, groups);
+               num = driverGroups(outputFName, filename, accnos, alns, lines[0].start, lines[0].end, groups);
                 
                 //force parent to wait until all the processes are done
                 for (int i=0;i<processIDS.size();i++) { 
@@ -1599,7 +1759,7 @@ int ChimeraUchimeCommand::createProcessesGroups(SequenceParser& parser, string o
                         string extension = toString(i) + ".temp";
                         
                         uchimeData* tempUchime = new uchimeData(outputFName+extension, uchimeLocation, templatefile, filename+extension, fastaFile, nameFile, groupFile, accnos+extension, alns+extension, groups, m, lines[i].start, lines[i].end,  i);
-                       tempUchime->setBooleans(useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract);
+                       tempUchime->setBooleans(useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract, hasCount);
                         tempUchime->setVariables(abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract);
                         
                         pDataArray.push_back(tempUchime);
@@ -1612,7 +1772,7 @@ int ChimeraUchimeCommand::createProcessesGroups(SequenceParser& parser, string o
                 
                 
                 //using the main process as a worker saves time and memory
-               num = driverGroups(parser, outputFName, filename, accnos, alns, lines[0].start, lines[0].end, groups);
+               num = driverGroups(outputFName, filename, accnos, alns, lines[0].start, lines[0].end, groups);
                 
                 //Wait until all threads have terminated.
                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
diff --git a/chimerauchimecommand.h b/chimerauchimecommand.h

index 3ca79391b5d1280c5b5943ada46599cbc98d44b0..f0c30d08a03f1834de06c550668e6b3b1f64c1b1 100644 (file)
--- a/chimerauchimecommand.h
+++ b/chimerauchimecommand.h
@@ -14,6 +14,8 @@
  #include "mothur.h"
  #include "command.hpp"
  #include "sequenceparser.h"
+#include "counttable.h"
+#include "sequencecountparser.h"
  
  /***********************************************************/
  
@@ -45,11 +47,12 @@ private:
         int driver(string, string, string, string, int&);
         int createProcesses(string, string, string, string, int&);
                 
-       bool abort, useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract;
-       string fastafile, groupfile, templatefile, outputDir, namefile, abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract, uchimeLocation;
+       bool abort, useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract, hasCount, hasName, dups;
+       string fastafile, groupfile, templatefile, outputDir, namefile, countfile, abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract, uchimeLocation;
         int processors;
         
-       
+       SequenceParser* sparser;
+    SequenceCountParser* cparser;
         vector<string> outputNames;
         vector<string> fastaFileNames;
         vector<string> nameFileNames;
@@ -58,9 +61,10 @@ private:
         string getNamesFile(string&);
         int readFasta(string, map<string, string>&);
         int printFile(vector<seqPriorityNode>&, string);
-       int deconvoluteResults(SequenceParser&, string, string, string);
-       int driverGroups(SequenceParser&, string, string, string, string, int, int, vector<string>);
-       int createProcessesGroups(SequenceParser&, string, string, string, string, vector<string>, string, string, string);
+       int deconvoluteResults(map<string, string>&, string, string, string);
+       int driverGroups(string, string, string, string, int, int, vector<string>);
+       int createProcessesGroups(string, string, string, string, vector<string>, string, string, string);
+    int prepFile(string filename, string);
  
  
  };
@@ -81,7 +85,7 @@ struct uchimeData {
         int end;
         int threadID, count, numChimeras;
         vector<string> groups;
-       bool useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract;
+       bool useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract, hasCount;
         string abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract;
         
         uchimeData(){}
@@ -103,7 +107,7 @@ struct uchimeData {
                 numChimeras = 0;
          uchimeLocation = uloc;
         }
-       void setBooleans(bool Abskew, bool calns, bool MinH, bool Mindiv, bool Xn, bool Dn, bool Xa, bool Chunks, bool Minchunk, bool Idsmoothwindow, bool Minsmoothid, bool Maxp, bool skipgap, bool skipgap2, bool Minlen, bool Maxlen, bool uc, bool Queryfract) {
+       void setBooleans(bool Abskew, bool calns, bool MinH, bool Mindiv, bool Xn, bool Dn, bool Xa, bool Chunks, bool Minchunk, bool Idsmoothwindow, bool Minsmoothid, bool Maxp, bool skipgap, bool skipgap2, bool Minlen, bool Maxlen, bool uc, bool Queryfract, bool hc) {
                 useAbskew = Abskew;
                 chimealns = calns;
                 useMinH = MinH;
@@ -122,6 +126,7 @@ struct uchimeData {
                 useMaxlen = Maxlen;
                 ucl = uc;
                 useQueryfract = Queryfract;
+        hasCount = hc;
         }
         
         void setVariables(string abske, string min, string mindi, string x, string d, string xa2, string chunk, string minchun, string idsmoothwindo, string minsmoothi, string max, string minle, string maxle, string queryfrac) {
@@ -163,16 +168,30 @@ static DWORD WINAPI MyUchimeThreadFunction(LPVOID lpParam){
                 
                 //parse fasta and name file by group
                 SequenceParser* parser;
-               if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile);      }
-               else                                                    { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile);                                            }
+        SequenceCountParser* cparser;
+               if (pDataArray->hasCount) {
+            CountTable* ct = new CountTable();
+            ct->readTable(pDataArray->namefile);
+            cparser = new SequenceCountParser(pDataArray->fastafile, *ct);
+            delete ct;
+        }else {
+            if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile); }
+            else                                                       { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile);                                            }
+        }
                 
                 int totalSeqs = 0;
                 int numChimeras = 0;
                 
                 for (int i = pDataArray->start; i < pDataArray->end; i++) {
-                       int start = time(NULL);  if (pDataArray->m->control_pressed) {  delete parser; return 0; }
+                       int start = time(NULL);  if (pDataArray->m->control_pressed) {  if (pDataArray->hasCount) { delete cparser; } else { delete parser; } return 0; } //only the parser matching hasCount was allocated
                         
-                       int error = parser->getSeqs(pDataArray->groups[i], pDataArray->filename, true); if ((error == 1) || pDataArray->m->control_pressed) {  delete parser; return 0; }
+            
+                       int error;
+            if (pDataArray->hasCount) { 
+                error = cparser->getSeqs(pDataArray->groups[i], pDataArray->filename, true); if ((error == 1) || pDataArray->m->control_pressed) {  delete cparser; return 0; }
+            }else {
+               error = parser->getSeqs(pDataArray->groups[i], pDataArray->filename, true); if ((error == 1) || pDataArray->m->control_pressed) {  delete parser; return 0; } 
+            }
                         
                         //int numSeqs = driver((outputFName + groups[i]), filename, (accnos+ groups[i]), (alns+ groups[i]), numChimeras);
                         ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -410,7 +429,7 @@ static DWORD WINAPI MyUchimeThreadFunction(LPVOID lpParam){
                         filename = filename.substr(1, filename.length()-2);
                         alns = alns.substr(1, alns.length()-2);
                         
-                       if (pDataArray->m->control_pressed) { delete parser; return 0; }
+                       if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } else { delete parser; } return 0; } //only the parser matching hasCount was allocated
                         
                         //create accnos file from uchime results
                         ifstream in; 
@@ -447,7 +466,7 @@ static DWORD WINAPI MyUchimeThreadFunction(LPVOID lpParam){
                         totalSeqs += num;
                         pDataArray->numChimeras += numChimeras;
                         
-                       if (pDataArray->m->control_pressed) { delete parser; return 0; }
+                       if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } else { delete parser; } return 0; } //only the parser matching hasCount was allocated
                         
                         //remove file made for uchime
                         pDataArray->m->mothurRemove(filename);
@@ -462,7 +481,7 @@ static DWORD WINAPI MyUchimeThreadFunction(LPVOID lpParam){
                 }       
                 
                 pDataArray->count = totalSeqs;
-               delete parser;
+               if (pDataArray->hasCount) { delete cparser; } else { delete parser; } //only the parser matching hasCount was allocated
                 return totalSeqs;
                 
         }
@@ -506,16 +525,31 @@ static DWORD WINAPI MyUchimeSeqsThreadFunction(LPVOID lpParam){
          strncat(tempUchime, uchimeCommand.c_str(), uchimeCommand.length());
          cPara.push_back(tempUchime);
                 
-               char* tempIn = new char[8]; 
-               *tempIn = '\0'; strncat(tempIn, "--input", 7);
-               //strcpy(tempIn, "--input"); 
-               cPara.push_back(tempIn);
-               char* temp = new char[filename.length()+1];
-               *temp = '\0'; strncat(temp, filename.c_str(), filename.length());
-               //strcpy(temp, filename.c_str());
-               cPara.push_back(temp);
-       
-               //add reference file
+        string outputFileName = filename.substr(1, filename.length()-2) + ".uchime_formatted";
+        //prepFile(filename.substr(1, filename.length()-2), outputFileName);
+        //prepFile(filename, outputFileName);
+        /******************************************/
+        ifstream in23;
+        pDataArray->m->openInputFile((filename.substr(1, filename.length()-2)), in23);
+        
+        ofstream out23;
+        pDataArray->m->openOutputFile(outputFileName, out23);
+        
+        while (!in23.eof()) {
+            if (pDataArray->m->control_pressed) { break;  }
+            
+            Sequence seq(in23); pDataArray->m->gobble(in23);
+            
+            if (seq.getName() != "") { seq.printSequence(out23); }
+        }
+        in23.close();
+        out23.close();
+        /******************************************/
+        
+        filename = outputFileName;
+        filename = "\"" + filename + "\"";
+        
+        //add reference file
                 char* tempRef = new char[5]; 
                 //strcpy(tempRef, "--db"); 
                 *tempRef = '\0'; strncat(tempRef, "--db", 4);
@@ -524,6 +558,15 @@ static DWORD WINAPI MyUchimeSeqsThreadFunction(LPVOID lpParam){
                 //strcpy(tempR, templatefile.c_str());
                 *tempR = '\0'; strncat(tempR, templatefile.c_str(), templatefile.length());
                 cPara.push_back(tempR);
+        
+               char* tempIn = new char[8]; 
+               *tempIn = '\0'; strncat(tempIn, "--input", 7);
+               //strcpy(tempIn, "--input"); 
+               cPara.push_back(tempIn);
+               char* temp = new char[filename.length()+1];
+               *temp = '\0'; strncat(temp, filename.c_str(), filename.length());
+               //strcpy(temp, filename.c_str());
+               cPara.push_back(temp);
                 
                 char* tempO = new char[12]; 
                 *tempO = '\0'; strncat(tempO, "--uchimeout", 11);
@@ -715,6 +758,8 @@ static DWORD WINAPI MyUchimeSeqsThreadFunction(LPVOID lpParam){
                 for (int j = 0; j < cPara.size(); j++) {  uchimeParameters[j] = cPara[j];  commandString += toString(cPara[j]) + " "; } 
                 //int numArgs = cPara.size();
                 
+        commandString = "\"" + commandString + "\"";
+        
                 //uchime_main(numArgs, uchimeParameters); 
                 //cout << "commandString = " << commandString << endl;
          if (pDataArray->m->debug) { pDataArray->m->mothurOut("[DEBUG]: uchime command = " + commandString + ".\n"); }
diff --git a/chopseqscommand.cpp b/chopseqscommand.cpp

index 05037f6817a5a554a029a7d635460d048bf73203..4bcd707bcbe2d90ab9ab0dd3162efdb5faa1b4e1 100644 (file)
--- a/chopseqscommand.cpp
+++ b/chopseqscommand.cpp
@@ -14,7 +14,8 @@
  vector<string> ChopSeqsCommand::setParameters(){       
         try {
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
-               CommandParameter pnumbases("numbases", "Number", "", "0", "", "", "",false,true); parameters.push_back(pnumbases);
+               CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
+        CommandParameter pnumbases("numbases", "Number", "", "0", "", "", "",false,true); parameters.push_back(pnumbases);
                 CommandParameter pcountgaps("countgaps", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pcountgaps);
                 CommandParameter pshort("short", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pshort);
                 CommandParameter pkeep("keep", "Multiple", "front-back", "front", "", "", "",false,false); parameters.push_back(pkeep);
@@ -41,7 +42,8 @@ string ChopSeqsCommand::getHelpString(){
                 helpString += "The keep parameter allows you to specify whether you want to keep the front or the back of your sequence, default=front.\n";
                 helpString += "The countgaps parameter allows you to specify whether you want to count gaps as bases, default=false.\n";
                 helpString += "The short parameter allows you to specify you want to keep sequences that are too short to chop, default=false.\n";
-               helpString += "For example, if you ran chop.seqs with numbases=200 and short=t, if a sequence had 100 bases mothur would keep the sequence rather than eliminate it.\n";
+               helpString += "The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n";
+        helpString += "For example, if you ran chop.seqs with numbases=200 and short=t, if a sequence had 100 bases mothur would keep the sequence rather than eliminate it.\n";
                 helpString += "Example chop.seqs(fasta=amazon.fasta, numbases=200, keep=front).\n";
                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
                 return helpString;
@@ -143,6 +145,10 @@ ChopSeqsCommand::ChopSeqsCommand(string option)  {
                         string temp = validParameter.validFile(parameters, "numbases", false);  if (temp == "not found") { temp = "0"; } 
                         m->mothurConvert(temp, numbases);   
                         
+            temp = validParameter.validFile(parameters, "processors", false);  if (temp == "not found"){       temp = m->getProcessors();      }
+                       m->setProcessors(temp);
+                       m->mothurConvert(temp, processors);
+            
                         temp = validParameter.validFile(parameters, "countgaps", false);        if (temp == "not found") { temp = "f"; } 
                         countGaps = m->isTrue(temp);  
                         
@@ -169,39 +175,32 @@ int ChopSeqsCommand::execute(){
                 
                 string outputFileName = outputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("fasta");
                 string outputFileNameAccnos = outputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("accnos");
+        
+        
+        vector<unsigned long long> positions; 
+        vector<linePair> lines;
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+        positions = m->divideFile(fastafile, processors);
+        for (int i = 0; i < (positions.size()-1); i++) {       lines.push_back(linePair(positions[i], positions[(i+1)]));      }
+#else
+        int numSeqs = 0;
+        positions = m->setFilePosFasta(fastafile, numSeqs); 
+        if (positions.size() < processors) { processors = positions.size(); }
                 
-               ofstream out;
-               m->openOutputFile(outputFileName, out);
-               
-               ofstream outAcc;
-               m->openOutputFile(outputFileNameAccnos, outAcc);
-               
-               ifstream in;
-               m->openInputFile(fastafile, in);
-               
-               bool wroteAccnos = false;
-               
-               while (!in.eof()) {
-                       
-                       Sequence seq(in);
-                       
-                       if (m->control_pressed) { outputTypes.clear(); in.close(); out.close(); outAcc.close(); m->mothurRemove(outputFileName); m->mothurRemove(outputFileNameAccnos); return 0;  }
-                       
-                       if (seq.getName() != "") {
-                               string newSeqString = getChopped(seq);
-                               
-                               //output trimmed sequence
-                               if (newSeqString != "") {
-                                       out << ">" << seq.getName() << endl << newSeqString << endl;
-                               }else{
-                                       outAcc << seq.getName() << endl;
-                                       wroteAccnos = true;
-                               }
-                       }
-               }
-               in.close();
-               out.close();
-               outAcc.close();
+        //figure out how many sequences you have to process
+        int numSeqsPerProcessor = numSeqs / processors;
+        for (int i = 0; i < processors; i++) {
+            int startIndex =  i * numSeqsPerProcessor;
+            if(i == (processors - 1)){ numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor;        }
+            lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
+        }
+#endif
+        
+        bool wroteAccnos = false;
+        if(processors == 1) {   wroteAccnos = driver(lines[0], fastafile, outputFileName, outputFileNameAccnos);        }
+        else                {   wroteAccnos = createProcesses(lines, fastafile, outputFileName, outputFileNameAccnos);  }
+        
+        if (m->control_pressed) {  return 0; }
                 
                 m->mothurOutEndLine();
                 m->mothurOut("Output File Name: "); m->mothurOutEndLine();
@@ -235,6 +234,202 @@ int ChopSeqsCommand::execute(){
                 exit(1);
         }
  }
+/**************************************************************************************************/
+// Runs driver() on each linePair in `lines` using `processors` workers (fork() on unix-like builds,
+// threads elsewhere), then merges the per-worker fasta and accnos temp files into outFasta and
+// outAccnos. Returns true if any worker wrote at least one name to its accnos file.
+bool ChopSeqsCommand::createProcesses(vector<linePair> lines, string filename, string outFasta, string outAccnos) {
+       try {
+               int process = 1;
+               bool wroteAccnos = false;
+               vector<int> processIDS;
+        vector<string> nonBlankAccnosFiles;
+               
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+               //loop through and create all the processes you want
+               while (process != processors) {
+                       int pid = fork();
+                       
+                       if (pid > 0) {
+                               processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
+                               process++;
+                       }else if (pid == 0){
+                               wroteAccnos = driver(lines[process], filename, outFasta + toString(getpid()) + ".temp", outAccnos + toString(getpid()) + ".temp");
+                               
+                               //pass wroteAccnos to parent through a small temp file
+                               ofstream out;
+                               string tempFile = fastafile + toString(getpid()) + ".bool.temp";
+                               m->openOutputFile(tempFile, out);
+                               out << wroteAccnos << endl;                             
+                               out.close();
+                               
+                               exit(0);
+                       }else { 
+                               m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
+                               for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
+                               exit(0);
+                       }
+               }
+               
+               //do your part
+               wroteAccnos = driver(lines[0], filename, outFasta, outAccnos);
+        
+               //force parent to wait until all the processes are done
+               for (int i=0;i<processIDS.size();i++) { 
+                       int temp = processIDS[i];
+                       wait(&temp);
+               }
+               
+               if (wroteAccnos) { nonBlankAccnosFiles.push_back(outAccnos); }
+               else { m->mothurRemove(outAccnos); } //remove so other files can be renamed to it
+        
+               //parent reads each child's wroteAccnos flag and keeps only the non-blank accnos files
+               for (int i = 0; i < processIDS.size(); i++) {
+                       string tempFilename = fastafile + toString(processIDS[i]) + ".bool.temp";
+                       ifstream in;
+                       m->openInputFile(tempFilename, in);
+                       
+                       bool temp = false; //stays false if the flag cannot be read
+                       in >> temp; m->gobble(in); 
+            if (temp) { wroteAccnos = temp; nonBlankAccnosFiles.push_back(outAccnos + toString(processIDS[i]) + ".temp");  }
+                       else { m->mothurRemove((outAccnos + toString(processIDS[i]) + ".temp"));  }
+            
+                       in.close();
+                       m->mothurRemove(tempFilename);
+               }
+#else
+               //////////////////////////////////////////////////////////////////////////////////////////////////////
+               //Windows version shared memory, so be careful when passing variables through the chopData struct. 
+               //Above fork() will clone, so memory is separate, but that's not the case with windows, 
+               //Taking advantage of shared memory to allow both threads to add info to vectors.
+               //////////////////////////////////////////////////////////////////////////////////////////////////////
+               //Thread 0 is given outFasta/outAccnos themselves; thread i>0 is given the names + toString(i) + ".temp".
+               vector<chopData*> pDataArray; 
+               DWORD   dwThreadIdArray[processors-1];
+               HANDLE  hThreadArray[processors-1]; 
+               
+               //Create processor worker threads.
+               for( int i=0; i<processors-1; i++ ){
+            string extension = "";
+            if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); }
+                       // Allocate memory for thread data.
+                       chopData* tempChop = new chopData(filename, (outFasta+extension), (outAccnos+extension), m, lines[i].start, lines[i].end, keep, countGaps, numbases, Short);
+                       pDataArray.push_back(tempChop);
+                       
+                       //MyChopThreadFunction is in header. It must be global or static to work with the threads.
+                       //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
+                       hThreadArray[i] = CreateThread(NULL, 0, MyChopThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
+               }
+               
+        //do your part - the main thread takes the last chunk and writes to its own temp files
+               string myAccnos = outAccnos + toString(processors-1) + ".temp";
+               bool myWroteAccnos = driver(lines[processors-1], filename, (outFasta + toString(processors-1) + ".temp"), myAccnos);
+        processIDS.push_back(processors-1);
+        
+               //Wait until all threads have terminated.
+               WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+               
+               //Close all thread handles and free memory allocations.
+               for(int i=0; i < pDataArray.size(); i++){
+            //pair each flag with the accnos file that thread was given (processIDS[i] is i+1, not i)
+            if (pDataArray[i]->wroteAccnos) { wroteAccnos = true; nonBlankAccnosFiles.push_back(pDataArray[i]->outAccnos);  }
+                       else { m->mothurRemove(pDataArray[i]->outAccnos);  } //for thread 0 this frees outAccnos for the rename below
+                       CloseHandle(hThreadArray[i]);
+                       delete pDataArray[i];
+               }
+        if (myWroteAccnos) { wroteAccnos = true; nonBlankAccnosFiles.push_back(myAccnos); }
+               else { m->mothurRemove(myAccnos); }
+#endif         
+                
+               for (int i = 0; i < processIDS.size(); i++) {
+                       m->appendFiles((outFasta + toString(processIDS[i]) + ".temp"), outFasta);
+                       m->mothurRemove((outFasta + toString(processIDS[i]) + ".temp"));
+               }
+               
+        if (nonBlankAccnosFiles.size() != 0) { 
+                       m->renameFile(nonBlankAccnosFiles[0], outAccnos);
+                       for (int h=1; h < nonBlankAccnosFiles.size(); h++) {
+                               m->appendFiles(nonBlankAccnosFiles[h], outAccnos);
+                               m->mothurRemove(nonBlankAccnosFiles[h]);
+                       }
+               }else { //recreate the accnosfile if needed
+                       ofstream out;
+                       m->openOutputFile(outAccnos, out);
+                       out.close();
+               }
+
+               return wroteAccnos;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ChopSeqsCommand", "createProcesses");
+               exit(1);
+       }
+}
+/**************************************************************************************/
+// Chops the sequences read from `filename` starting at filePos.start. Chopped sequences are written
+// to outFasta; names of sequences for which getChopped() returns "" are written to outAccnos.
+// Returns true if any name was written to outAccnos, false otherwise or when interrupted.
+bool ChopSeqsCommand::driver(linePair filePos, string filename, string outFasta, string outAccnos) {   
+       try {
+               ofstream out;
+               m->openOutputFile(outFasta, out);
+        
+        ofstream outAcc;
+               m->openOutputFile(outAccnos, outAcc);
+        
+               ifstream in;
+               m->openInputFile(filename, in);
+        
+               in.seekg(filePos.start);
+        
+        bool wroteAccnos = false;
+               int count = 0;
+        
+               while (true) { //left via break: interrupt, end of this chunk, or end of file
+                       if (m->control_pressed) { break; }
+                       Sequence seq(in); m->gobble(in);
+                       
+                       if (m->control_pressed) { break; }
+                       
+                       if (seq.getName() != "") {
+                               string newSeqString = getChopped(seq);
+                               
+                               //output trimmed sequence
+                               if (newSeqString != "") {
+                                       out << ">" << seq.getName() << endl << newSeqString << endl;
+                               }else{
+                                       outAcc << seq.getName() << endl;
+                                       wroteAccnos = true;
+                               }
+                count++;
+                       }
+                       
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+            unsigned long long pos = in.tellg();
+            if ((pos == -1) || (pos >= filePos.end)) { break; }
+#else
+            if (in.eof()) { break; }
+#endif
+            //report progress
+                       if((count) % 1000 == 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
+               }
+               
+               in.close();
+        out.close();
+        outAcc.close();
+               //interrupted: every stream is closed above, so discard the partial outputs and report nothing written
+               if (m->control_pressed) { m->mothurRemove(outFasta); m->mothurRemove(outAccnos); return false; }
+               
+               //report progress
+               if((count) % 1000 != 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
+               
+               return wroteAccnos;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ChopSeqsCommand", "driver");
+               exit(1);
+       }
+}
  //**********************************************************************************************************************
  string ChopSeqsCommand::getChopped(Sequence seq) {
         try {
diff --git a/chopseqscommand.h b/chopseqscommand.h

index cc22c751483a75f246f69e4955985f158d041638..fa3f559056161945021e10f9b1892db742a6116e 100644 (file)
--- a/chopseqscommand.h
+++ b/chopseqscommand.h
@@ -34,14 +34,235 @@ class ChopSeqsCommand : public Command {
                 void help() { m->mothurOut(getHelpString()); }          
         
         private:
+        struct linePair {
+            unsigned long long start;
+            unsigned long long end;
+            linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
+        };
+    
                 string fastafile, outputDir, keep;
                 bool abort, countGaps, Short;
-               int numbases;
+               int numbases, processors;
                 vector<string> outputNames;
                 
                 string getChopped(Sequence);
+        bool driver (linePair, string, string, string);
+        bool createProcesses(vector<linePair>, string, string, string);
  };
  
+/**************************************************************************************************/
+//custom data structure for threads to use.
+// This is passed by void pointer so it can be any data type
+// that can be passed using a single void pointer (LPVOID).
+struct chopData {
+       string filename; 
+       string outFasta, outAccnos, keep; 
+       unsigned long long start;
+       unsigned long long end;
+       int numbases;
+    bool countGaps, Short, wroteAccnos;
+       MothurOut* m;
+       string namefile;
+       map<string, int> nameMap;
+       
+       
+       chopData(){}
+       chopData(string f, string ff, string a, MothurOut* mout, unsigned long long st, unsigned long long en, string k, bool cGaps, int nbases, bool S) {
+               filename = f;
+               outFasta = ff;
+        outAccnos = a;
+               m = mout;
+               start = st;
+               end = en;
+        keep = k;
+        countGaps = cGaps;
+        numbases = nbases;
+        Short = S;
+               wroteAccnos = false;
+       }
+};
+
+/**************************************************************************************************/
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+#else
+static DWORD WINAPI MyChopThreadFunction(LPVOID lpParam){ 
+       chopData* pDataArray;
+       pDataArray = (chopData*)lpParam;
+       
+       try {
+        ofstream out;
+               pDataArray->m->openOutputFile(pDataArray->outFasta, out);
+        
+        ofstream outAcc;
+               pDataArray->m->openOutputFile(pDataArray->outAccnos, outAcc);
+        
+               ifstream in;
+               pDataArray->m->openInputFile(pDataArray->filename, in);
+        
+               if ((pDataArray->start == 0) || (pDataArray->start == 1)) {
+                       in.seekg(0);
+               }else { //this accounts for the difference in line endings. 
+                       in.seekg(pDataArray->start-1); pDataArray->m->gobble(in); 
+               }
+
+               bool done = false;
+        bool wroteAccnos = false;
+               int count = 0;
+
+               for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
+                                               
+                       if (pDataArray->m->control_pressed) {  in.close(); out.close(); outAcc.close(); pDataArray->m->mothurRemove(pDataArray->outFasta); pDataArray->m->mothurRemove(pDataArray->outAccnos); return 0;  }
+            
+            Sequence seq(in); pDataArray->m->gobble(in);
+                       
+                       if (seq.getName() != "") {
+                               //string newSeqString = getChopped(seq);
+                ///////////////////////////////////////////////////////////////////////
+                string temp = seq.getAligned();
+                string tempUnaligned = seq.getUnaligned();
+                
+                if (pDataArray->countGaps) {
+                    //if needed trim sequence
+                    if (pDataArray->keep == "front") {//you want to keep the beginning
+                        int tempLength = temp.length();
+                        
+                        if (tempLength > pDataArray->numbases) { //you have enough bases to remove some
+                            
+                            int stopSpot = 0;
+                            int numBasesCounted = 0;
+                            
+                            for (int i = 0; i < temp.length(); i++) {
+                                //eliminate N's
+                                if (toupper(temp[i]) == 'N') { temp[i] = '.'; }
+                                
+                                numBasesCounted++; 
+                                
+                                if (numBasesCounted >= pDataArray->numbases) { stopSpot = i; break; }
+                            }
+                            
+                            if (stopSpot == 0) { temp = ""; }
+                            else {  temp = temp.substr(0, stopSpot+1);  }
+                                                       
+                        }else { 
+                            if (!pDataArray->Short) { temp = ""; } //sequence too short
+                        }
+                    }else { //you are keeping the back
+                        int tempLength = temp.length();
+                        if (tempLength > pDataArray->numbases) { //you have enough bases to remove some
+                            
+                            int stopSpot = 0;
+                            int numBasesCounted = 0;
+                            
+                            for (int i = (temp.length()-1); i >= 0; i--) {
+                                //eliminate N's
+                                if (toupper(temp[i]) == 'N') { temp[i] = '.'; }
+                                
+                                numBasesCounted++; 
+                                
+                                if (numBasesCounted >= pDataArray->numbases) { stopSpot = i; break; }
+                            }
+                            
+                            if (stopSpot == 0) { temp = ""; }
+                            else {  temp = temp.substr(stopSpot+1);  }
+                        }else { 
+                            if (!pDataArray->Short) { temp = ""; } //sequence too short
+                        }
+                    }
+                    
+                }else{
+                    
+                    //if needed trim sequence
+                    if (pDataArray->keep == "front") {//you want to keep the beginning
+                        int tempLength = tempUnaligned.length();
+                        
+                        if (tempLength > pDataArray->numbases) { //you have enough bases to remove some
+                            
+                            int stopSpot = 0;
+                            int numBasesCounted = 0;
+                            
+                            for (int i = 0; i < temp.length(); i++) {
+                                //eliminate N's
+                                if (toupper(temp[i]) == 'N') { 
+                                    temp[i] = '.'; 
+                                    tempLength--;
+                                    if (tempLength < pDataArray->numbases) { stopSpot = 0; break; }
+                                }
+                                
+                                if(isalpha(temp[i])) { numBasesCounted++; }
+                                
+                                if (numBasesCounted >= pDataArray->numbases) { stopSpot = i; break; }
+                            }
+                            
+                            if (stopSpot == 0) { temp = ""; }
+                            else {  temp = temp.substr(0, stopSpot+1);  }
+                                                       
+                        }else { 
+                            if (!pDataArray->Short) { temp = ""; } //sequence too short
+                        }                              
+                    }else { //you are keeping the back
+                        int tempLength = tempUnaligned.length();
+                        if (tempLength > pDataArray->numbases) { //you have enough bases to remove some
+                            
+                            int stopSpot = 0;
+                            int numBasesCounted = 0;
+                            
+                            for (int i = (temp.length()-1); i >= 0; i--) {
+                                //eliminate N's
+                                if (toupper(temp[i]) == 'N') { 
+                                    temp[i] = '.'; 
+                                    tempLength--;
+                                    if (tempLength < pDataArray->numbases) { stopSpot = 0; break; }
+                                }
+                                
+                                if(isalpha(temp[i])) { numBasesCounted++; }
+                                
+                                if (numBasesCounted >= pDataArray->numbases) { stopSpot = i; break; }
+                            }
+                            
+                            if (stopSpot == 0) { temp = ""; }
+                            else {  temp = temp.substr(stopSpot);  }
+                        }else { 
+                            if (!pDataArray->Short) { temp = ""; } //sequence too short
+                        }
+                    }
+                }
+                
+                string newSeqString = temp;
+                ///////////////////////////////////////////////////////////////////////
+                               
+                               //output trimmed sequence
+                               if (newSeqString != "") {
+                                       out << ">" << seq.getName() << endl << newSeqString << endl;
+                               }else{
+                                       outAcc << seq.getName() << endl;
+                                       pDataArray->wroteAccnos = true;
+                               }
+                count++;
+                       }
+            //report progress
+                       if((count) % 1000 == 0){        pDataArray->m->mothurOut(toString(count)); pDataArray->m->mothurOutEndLine();           }
+                       
+               }
+               //report progress
+               if((count) % 1000 != 0){        pDataArray->m->mothurOut(toString(count)); pDataArray->m->mothurOutEndLine();           }
+        
+               
+               in.close();
+        out.close();
+        outAcc.close();
+                               
+               return 0;
+               
+       }
+       catch(exception& e) {
+               pDataArray->m->errorOut(e, "ChopsSeqsCommand", "MyChopThreadFunction");
+               exit(1);
+       }
+} 
+#endif
+
+
+
  #endif
  
  
diff --git a/classify.cpp b/classify.cpp

index 212e563f94c4ae7af3ba0916d1040afd9b0e11fa..8aa3cdb381ed7a389667ce61962d47cefac15ddd 100644 (file)
--- a/classify.cpp
+++ b/classify.cpp
@@ -61,7 +61,8 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me
                                         names.push_back(temp.getName());
                                         database->addSequence(temp);    
                                 }
-                               database->generateDB();
+                               if ((method == "kmer") && (!shortcuts)) {;} //don't print
+                else {database->generateDB(); }
                         }else if ((method == "kmer") && (!needToGenerate)) {    
                                 ifstream kmerFileTest(kmerDBName.c_str());
                                 database->readKmerDB(kmerFileTest);     
@@ -200,7 +201,8 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me
                                 }
                                 fastaFile.close();
  
-                               database->generateDB();
+                if ((method == "kmer") && (!shortcuts)) {;} //don't print
+                else {database->generateDB(); } 
                                 
                         }else if ((method == "kmer") && (!needToGenerate)) {    
                                 ifstream kmerFileTest(kmerDBName.c_str());
@@ -260,9 +262,6 @@ int Classify::readTaxonomy(string file) {
                 MPI_File inMPI;
                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
                 MPI_Comm_size(MPI_COMM_WORLD, &processors);
-
-               //char* inFileName = new char[file.length()];
-               //memcpy(inFileName, file.c_str(), file.length());
                 
                 char inFileName[1024];
                 strcpy(inFileName, file.c_str());
@@ -355,3 +354,37 @@ vector<string> Classify::parseTax(string tax) {
  }
  /**************************************************************************************************/
  
+double Classify::getLogExpSum(vector<double> probabilities, int& maxIndex){
+       try {
+        //     http://jblevins.org/notes/log-sum-exp
+        
+        double maxProb = probabilities[0];
+        maxIndex = 0;
+        
+        int numProbs = (int)probabilities.size();
+        
+        for(int i=1;i<numProbs;i++){
+            if(probabilities[i] >= maxProb){
+                maxProb = probabilities[i];
+                maxIndex = i;
+            }
+        }
+        
+        double probSum = 0.0000;
+        
+        for(int i=0;i<numProbs;i++){
+            probSum += exp(probabilities[i] - maxProb);                
+        }
+        
+        probSum = log(probSum) + maxProb;
+        
+        return probSum;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "Classify", "getLogExpSum");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
diff --git a/classify.h b/classify.h

index 4e0354782e5063e2a52268e88a1412b40db3c28e..7b4c1022fef35aacbc33824d99ede551e55b187c 100644 (file)
--- a/classify.h
+++ b/classify.h
@@ -17,10 +17,8 @@
  #include "database.hpp"
  #include "phylotree.h"
  
-
  class Sequence;
  
-
  /**************************************************************************************************/
  
  class Classify {
@@ -37,7 +35,6 @@ public:
  protected:
  
         map<string, string> taxonomy;  //name maps to taxonomy
-       //map<string, int> genusCount;  //maps genus to count - in essence a list of how many seqs are in each taxonomy
         map<string, int>::iterator itTax;
         map<string, string>::iterator it;
         Database* database;
@@ -45,11 +42,12 @@ protected:
         
         string taxFile, templateFile, simpleTax;
         vector<string> names;
-       int threadID;
-       bool flip, flipped;
+       int threadID, numLevels, numTaxa;
+       bool flip, flipped, shortcuts;
         
         int readTaxonomy(string);
         vector<string> parseTax(string);
+    double getLogExpSum(vector<double>, int&);
         MothurOut* m;
         
  };
diff --git a/classifyotucommand.cpp b/classifyotucommand.cpp

index 00ae690214177d94020069986aedbbb0cb118b9e..660d53c4bdf05c3249b74d84b7a2ba1eb7e5e50e 100644 (file)
--- a/classifyotucommand.cpp
+++ b/classifyotucommand.cpp
@@ -17,8 +17,9 @@ vector<string> ClassifyOtuCommand::setParameters(){
                 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptaxonomy);
                 CommandParameter preftaxonomy("reftaxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(preftaxonomy);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
+        CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
                 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
                 CommandParameter pbasis("basis", "Multiple", "otu-sequence", "otu", "", "", "",false,false); parameters.push_back(pbasis);
                 CommandParameter pcutoff("cutoff", "Number", "", "51", "", "", "",false,true); parameters.push_back(pcutoff);
@@ -39,11 +40,12 @@ vector<string> ClassifyOtuCommand::setParameters(){
  string ClassifyOtuCommand::getHelpString(){    
         try {
                 string helpString = "";
-               helpString += "The classify.otu command parameters are list, taxonomy, reftaxonomy, name, group, cutoff, label, basis and probs.  The taxonomy and list parameters are required unless you have a valid current file.\n";
+               helpString += "The classify.otu command parameters are list, taxonomy, reftaxonomy, name, group, count, cutoff, label, basis and probs.  The taxonomy and list parameters are required unless you have a valid current file.\n";
                 helpString += "The reftaxonomy parameter allows you give the name of the reference taxonomy file used when you classified your sequences. Providing it will keep the rankIDs in the summary file static.\n";
                 helpString += "The name parameter allows you add a names file with your taxonomy file.\n";
                 helpString += "The group parameter allows you provide a group file to use in creating the summary file breakdown.\n";
-               helpString += "The basis parameter allows you indicate what you want the summary file to represent, options are otu and sequence. Default is otu.\n";
+               helpString += "The count parameter allows you add a count file associated with your list file. When using the count parameter mothur assumes your list file contains only uniques.\n";
+        helpString += "The basis parameter allows you indicate what you want the summary file to represent, options are otu and sequence. Default is otu.\n";
                 helpString += "For example consider the following basis=sequence could give Clostridiales       3       105     16      43      46, where 105 is the total number of sequences whose otu classified to Clostridiales.\n";
                 helpString += "16 is the number of sequences in the otus from groupA, 43 is the number of sequences in the otus from groupB, and 46 is the number of sequences in the otus from groupC.\n";
                 helpString += "Now for basis=otu could give Clostridiales       3       7       6       1       2, where 7 is the number of otus that classified to Clostridiales.\n";
@@ -172,6 +174,14 @@ ClassifyOtuCommand::ClassifyOtuCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a count file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
  
                         
@@ -211,6 +221,20 @@ ClassifyOtuCommand::ClassifyOtuCommand(string option)  {
                         if (groupfile == "not open") { abort = true; }  
                         else if (groupfile == "not found") { groupfile = ""; }
                         else { m->setGroupFile(groupfile); }
+            
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not open") { countfile = ""; abort = true; }
+                       else if (countfile == "not found") { countfile = "";  } 
+                       else { m->setCountTableFile(countfile); }
+            
+            if ((namefile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
+            }
+                       
+            if ((groupfile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
+            }
+
                         
                         //check for optional parameter and set defaults
                         // ...at some point should added some additional type checking...
@@ -235,10 +259,12 @@ ClassifyOtuCommand::ClassifyOtuCommand(string option)  {
                         
                         if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true;  }
                         
-                       if (namefile == ""){
-                               vector<string> files; files.push_back(taxfile);
-                               parser.getNameFile(files);
-                       }
+            if (countfile == "") {
+                if (namefile == ""){
+                    vector<string> files; files.push_back(taxfile);
+                    parser.getNameFile(files);
+                }
+            }
                         
                 }
         }
@@ -255,7 +281,11 @@ int ClassifyOtuCommand::execute(){
                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
                 
                 //if user gave a namesfile then use it
-               if (namefile != "") {   m->readNames(namefile, nameMap, true);  }
+               if (namefile != "")     {       m->readNames(namefile, nameMap, true);  }
+        if (groupfile != "")    {   groupMap = new GroupMap(groupfile);  groupMap->readMap(); }
+        else { groupMap = NULL;  }
+        if (countfile != "") {  ct = new CountTable(); ct->readTable(countfile);    }
+        else {  ct = NULL;    }
                 
                 //read taxonomy file and save in map for easy access in building bin trees
                 m->readTax(taxfile, taxMap);
@@ -270,7 +300,7 @@ int ClassifyOtuCommand::execute(){
                 set<string> processedLabels;
                 set<string> userLabels = labels;
                 
-               if (m->control_pressed) { outputTypes.clear(); delete input; delete list; for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);  }  return 0; }
+               if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete input; delete list; for (int i = 0; i < outputNames.size(); i++) {      m->mothurRemove(outputNames[i]);  }  return 0; }
         
                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
                         
@@ -278,7 +308,7 @@ int ClassifyOtuCommand::execute(){
                         
                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
                                         process(list);
-                                       if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) {   m->mothurRemove(outputNames[i]);  } delete input; delete list; return 0; }
+                                       if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) {   m->mothurRemove(outputNames[i]);  } if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete input; delete list; return 0; }
                                                                                 
                                         processedLabels.insert(list->getLabel());
                                         userLabels.erase(list->getLabel());
@@ -293,7 +323,7 @@ int ClassifyOtuCommand::execute(){
                                         process(list);
                                 
                                         
-                                       if (m->control_pressed) { outputTypes.clear();  for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);  } delete input; delete list; return 0; }
+                                       if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; }  if (groupMap != NULL) { delete groupMap; } for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);  } delete input; delete list; return 0; }
                                                                                 
                                         processedLabels.insert(list->getLabel());
                                         userLabels.erase(list->getLabel());
@@ -329,10 +359,12 @@ int ClassifyOtuCommand::execute(){
                         process(list);
                         delete list;
                         
-                       if (m->control_pressed) { outputTypes.clear();  for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);  } delete input; delete list; return 0; }
+                       if (m->control_pressed) { outputTypes.clear();  if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);  } delete input; delete list; return 0; }
                 }
                 
                 delete input;  
+        if (groupMap != NULL) { delete groupMap; }
+        if (ct != NULL) { delete ct; }
                                 
                 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) {   m->mothurRemove(outputNames[i]);  } return 0; }
                 
@@ -400,10 +432,16 @@ vector<string> ClassifyOtuCommand::findConsensusTaxonomy(int bin, ListVector* th
                                 if (it == taxMap.end()) { //this name is not in taxonomy file, skip it
                                         m->mothurOut(names[i] + " is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine();
                                 }else{
+                    if (countfile != "") {
+                        int numDups = ct->getNumSeqs(names[i]); 
+                        for (int j = 0; j < numDups; j++) {  phylo->addSeqToTree(names[i], it->second);  }
+                        size += numDups;
+                    }else{
                                         //add seq to tree
-                                       phylo->addSeqToTree(names[i], it->second);
-                                       size++;
-                                       allNames.push_back(names[i]);
+                        phylo->addSeqToTree(names[i], it->second);
+                        size++;  
+                    }
+                    allNames.push_back(names[i]);
                                 }
                         }
  
@@ -486,24 +524,25 @@ int ClassifyOtuCommand::process(ListVector* processList) {
                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
                                 
                 ofstream out;
-               string outputFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + getOutputFileNameTag("constaxonomy");
+               string outputFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." +getOutputFileNameTag("constaxonomy");
                 m->openOutputFile(outputFile, out);
                 outputNames.push_back(outputFile); outputTypes["constaxonomy"].push_back(outputFile);
                 
                 ofstream outSum;
-               string outputSumFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + getOutputFileNameTag("taxsummary");
+               string outputSumFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." +getOutputFileNameTag("taxsummary");
                 m->openOutputFile(outputSumFile, outSum);
                 outputNames.push_back(outputSumFile); outputTypes["taxsummary"].push_back(outputSumFile);
                 
                 out << "OTU\tSize\tTaxonomy" << endl;
                 
                 PhyloSummary* taxaSum;
-               if (refTaxonomy != "") {
-                       taxaSum = new PhyloSummary(refTaxonomy, groupfile);
+        if (countfile != "") {
+            if (refTaxonomy != "") { taxaSum = new PhyloSummary(refTaxonomy, ct);  }
+            else {  taxaSum = new PhyloSummary(ct); }
                 }else {
-                       taxaSum = new PhyloSummary(groupfile);
-               }
-               
+            if (refTaxonomy != "") { taxaSum = new PhyloSummary(refTaxonomy, groupMap);  }
+            else {  taxaSum = new PhyloSummary(groupMap); }
+        }
  
                 //for each bin in the list vector
          string snumBins = toString(processList->getNumBins());
@@ -534,7 +573,34 @@ int ClassifyOtuCommand::process(ListVector* processList) {
                         if (basis == "sequence") {
                                 for(int j = 0; j < names.size(); j++) {  taxaSum->addSeqToTree(names[j], noConfidenceConTax);  }
                         }else { //otu
-                               taxaSum->addSeqToTree(noConfidenceConTax, names);
+                map<string, bool> containsGroup; 
+                if (countfile != "") {
+                    if (ct->hasGroupInfo()) {
+                        vector<string> mGroups = ct->getNamesOfGroups();
+                        for (int k = 0; k < names.size(); k++) {
+                            vector<int> counts = ct->getGroupCounts(names[k]);
+                            for (int h = 0; h < counts.size(); h++) {  
+                                if (counts[h] != 0) {  containsGroup[mGroups[h]] = true; }
+                            }
+                        }
+                    }
+                }else {
+                    if (groupfile != "") {
+                        vector<string> mGroups = groupMap->getNamesOfGroups();
+                        for (int j = 0; j < mGroups.size(); j++) { containsGroup[mGroups[j]] = false; }
+                        
+                        for (int k = 0; k < names.size(); k++) {
+                            //find out the sequences group
+                            string group = groupMap->getGroup(names[k]);
+                            
+                            if (group == "not found") {  m->mothurOut("[WARNING]: " + names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
+                            else {
+                                containsGroup[group] = true;
+                            }
+                        }
+                    }
+                }
+                               taxaSum->addSeqToTree(noConfidenceConTax, containsGroup);
                         }
                 }
  
diff --git a/classifyotucommand.h b/classifyotucommand.h

index 36a0328aa85bb2d646e14ec37b1d13f8a0581241..2e76057f1219b8c44785a5be9b6962043bbb68af 100644 (file)
--- a/classifyotucommand.h
+++ b/classifyotucommand.h
@@ -13,6 +13,7 @@
  #include "command.hpp"
  #include "listvector.hpp"
  #include "inputdata.h"
+#include "counttable.h"
  
  
  class ClassifyOtuCommand : public Command {
@@ -34,10 +35,11 @@ public:
         void help() { m->mothurOut(getHelpString()); }  
  
  private:
-
+    GroupMap* groupMap;
+    CountTable* ct;
         ListVector* list;
         InputData* input;
-       string listfile, namefile, taxfile, label, outputDir, refTaxonomy, groupfile, basis;
+       string listfile, namefile, taxfile, label, outputDir, refTaxonomy, groupfile, basis, countfile;
         bool abort, allLines, probs;
         int cutoff;
         set<string> labels; //holds labels to be used
diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp

index c76b047cbd064a5b7b967ab388109bb4e46cc0cc..36bccc2b205635c6b1b0df872332534d615e3bb1 100644 (file)
--- a/classifyseqscommand.cpp
+++ b/classifyseqscommand.cpp
@@ -17,11 +17,13 @@ vector<string> ClassifySeqsCommand::setParameters(){
                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptaxonomy);
                 CommandParameter ptemplate("reference", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptemplate);
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
-               CommandParameter psearch("search", "Multiple", "kmer-blast-suffix-distance", "kmer", "", "", "",false,false); parameters.push_back(psearch);
+        CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
+
+               CommandParameter psearch("search", "Multiple", "kmer-blast-suffix-distance-align", "kmer", "", "", "",false,false); parameters.push_back(psearch);
                 CommandParameter pksize("ksize", "Number", "", "8", "", "", "",false,false); parameters.push_back(pksize);
-               CommandParameter pmethod("method", "Multiple", "bayesian-knn", "bayesian", "", "", "",false,false); parameters.push_back(pmethod);
+               CommandParameter pmethod("method", "Multiple", "wang-knn-zap", "wang", "", "", "",false,false); parameters.push_back(pmethod);
                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
                 CommandParameter pmatch("match", "Number", "", "1.0", "", "", "",false,false); parameters.push_back(pmatch);
                 CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pmismatch);
@@ -32,6 +34,7 @@ vector<string> ClassifySeqsCommand::setParameters(){
                 CommandParameter pprobs("probs", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pprobs);
                 CommandParameter piters("iters", "Number", "", "100", "", "", "",false,true); parameters.push_back(piters);
                 CommandParameter psave("save", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(psave);
+        CommandParameter pshortcuts("shortcuts", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pshortcuts);
                 CommandParameter pnumwanted("numwanted", "Number", "", "10", "", "", "",false,true); parameters.push_back(pnumwanted);
                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
@@ -50,12 +53,13 @@ string ClassifySeqsCommand::getHelpString(){
         try {
                 string helpString = "";
                 helpString += "The classify.seqs command reads a fasta file containing sequences and creates a .taxonomy file and a .tax.summary file.\n";
-               helpString += "The classify.seqs command parameters are reference, fasta, name, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n";
+               helpString += "The classify.seqs command parameters are reference, fasta, name, group, count, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n";
                 helpString += "The reference, fasta and taxonomy parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n";
-               helpString += "The search parameter allows you to specify the method to find most similar template.  Your options are: suffix, kmer, blast and distance. The default is kmer.\n";
+               helpString += "The search parameter allows you to specify the method to find most similar template.  Your options are: suffix, kmer, blast, align and distance. The default is kmer.\n";
                 helpString += "The name parameter allows you add a names file with your fasta file, if you enter multiple fasta files, you must enter matching names files for them.\n";
                 helpString += "The group parameter allows you add a group file so you can have the summary totals broken up by group.\n";
-               helpString += "The method parameter allows you to specify classification method to use.  Your options are: bayesian and knn. The default is bayesian.\n";
+        helpString += "The count parameter allows you add a count file so you can have the summary totals broken up by group.\n";
+               helpString += "The method parameter allows you to specify classification method to use.  Your options are: wang, knn and zap. The default is wang.\n";
                 helpString += "The ksize parameter allows you to specify the kmer size for finding most similar template to candidate.  The default is 8.\n";
                 helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
  #ifdef USE_MPI
@@ -68,8 +72,8 @@ string ClassifySeqsCommand::getHelpString(){
                 helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment.  The default is -1.0.\n";
                 helpString += "The numwanted parameter allows you to specify the number of sequence matches you want with the knn method.  The default is 10.\n";
                 helpString += "The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy.  The default is 0.\n";
-               helpString += "The probs parameter shuts off the bootstrapping results for the bayesian method. The default is true, meaning you want the bootstrapping to be shown.\n";
-               helpString += "The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the bayesian method.  The default is 100.\n";
+               helpString += "The probs parameter shuts off the bootstrapping results for the wang and zap method. The default is true, meaning you want the bootstrapping to be shown.\n";
+               helpString += "The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the wang method.  The default is 100.\n";
                 //helpString += "The flip parameter allows you shut off mothur's   The default is T.\n";
                 helpString += "The classify.seqs command should be in the following format: \n";
                 helpString += "classify.seqs(reference=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n";
@@ -127,7 +131,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(){
  ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
         try {
                 abort = false; calledHelp = false;   
-               rdb = ReferenceDB::getInstance();
+               rdb = ReferenceDB::getInstance(); hasName = false; hasCount=false;
                 
                 //allow user to run help
                 if(option == "help") { help(); abort = true; calledHelp = true; }
@@ -185,6 +189,14 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a count file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
  
                         fastaFileName = validParameter.validFile(parameters, "fasta", false);
@@ -333,11 +345,90 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
                                         }
                                 }
                         }
-
+            
+            if (namefileNames.size() != 0) { hasName = true; }
+            
                         if (namefile != "") {
                                 if (namefileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a name file, you must have one for each fasta file."); m->mothurOutEndLine(); }
                         }
                         
+            //check for the optional count parameter
+                       countfile = validParameter.validFile(parameters, "count", false);
+                       if (countfile == "not found") { 
+                countfile = "";  
+                       }else { 
+                               m->splitAtDash(countfile, countfileNames);
+                               
+                               //go through files and make sure they are good, if not, then disregard them
+                               for (int i = 0; i < countfileNames.size(); i++) {
+                                       
+                                       bool ignore = false;
+                                       if (countfileNames[i] == "current") { 
+                                               countfileNames[i] = m->getCountTableFile(); 
+                                               if (countfileNames[i] != "") {  m->mothurOut("Using " + countfileNames[i] + " as input file for the count parameter where you had given current."); m->mothurOutEndLine(); }
+                                               else {  
+                                                       m->mothurOut("You have no current count file, ignoring current."); m->mothurOutEndLine(); ignore=true; 
+                                                       //erase from file list
+                                                       countfileNames.erase(countfileNames.begin()+i);
+                                                       i--;
+                                               }
+                                       }
+                                       
+                                       if (!ignore) {
+                                               
+                                               if (inputDir != "") {
+                                                       string path = m->hasPath(countfileNames[i]);
+                                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                                       if (path == "") {       countfileNames[i] = inputDir + countfileNames[i];               }
+                                               }
+                                               
+                                               int ableToOpen;
+                                               ifstream in;
+                                               
+                                               ableToOpen = m->openInputFile(countfileNames[i], in, "noerror");
+                                               
+                                               //if you can't open it, try default location
+                                               if (ableToOpen == 1) {
+                                                       if (m->getDefaultPath() != "") { //default path is set
+                                                               string tryPath = m->getDefaultPath() + m->getSimpleName(countfileNames[i]);
+                                                               m->mothurOut("Unable to open " + countfileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
+                                                               ifstream in2;
+                                                               ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+                                                               in2.close();
+                                                               countfileNames[i] = tryPath;
+                                                       }
+                                               }
+                                               
+                                               if (ableToOpen == 1) {
+                                                       if (m->getOutputDir() != "") { //output directory is set
+                                                               string tryPath = m->getOutputDir() + m->getSimpleName(countfileNames[i]);
+                                                               m->mothurOut("Unable to open " + countfileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
+                                                               ifstream in2;
+                                                               ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+                                                               in2.close();
+                                                               countfileNames[i] = tryPath;
+                                                       }
+                                               }
+                                               
+                                               in.close();
+                                               
+                                               if (ableToOpen == 1) { 
+                                                       m->mothurOut("Unable to open " + countfileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); 
+                                                       //erase from file list
+                                                       countfileNames.erase(countfileNames.begin()+i);
+                                                       i--;
+                                               }else {
+                                                       m->setCountTableFile(countfileNames[i]);
+                                               }
+                                       }
+                               }
+                       }
+            
+            if (countfileNames.size() != 0) { hasCount = true; if (countfileNames.size() != fastaFileNames.size()) {m->mothurOut("If you provide a count file, you must have one for each fasta file."); m->mothurOutEndLine(); } }
+            
+                       //name and count files cannot both be provided
+            if (hasName && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
+
                         groupfile = validParameter.validFile(parameters, "group", false);
                         if (groupfile == "not found") { groupfile = "";  }
                         else { 
@@ -393,6 +484,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
  
                         if (groupfile != "") {
                                 if (groupfileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a group file, you must have one for each fasta file."); m->mothurOutEndLine(); }
+                if (hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or group."); m->mothurOutEndLine(); abort = true; }
                         }else {
                                 for (int i = 0; i < fastaFileNames.size(); i++) {  groupfileNames.push_back("");  }
                         }
@@ -400,9 +492,6 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
                         //check for optional parameter and set defaults
                         // ...at some point should added some additional type checking...
                         string temp;
-                       temp = validParameter.validFile(parameters, "ksize", false);            if (temp == "not found"){       temp = "8";                             }
-                       m->mothurConvert(temp, kmerSize); 
-                       
                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
                         m->setProcessors(temp);
                         m->mothurConvert(temp, processors); 
@@ -444,7 +533,13 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
                         
                         search = validParameter.validFile(parameters, "search", false);         if (search == "not found"){     search = "kmer";                }
                         
-                       method = validParameter.validFile(parameters, "method", false);         if (method == "not found"){     method = "bayesian";    }
+                       method = validParameter.validFile(parameters, "method", false);         if (method == "not found"){     method = "wang";        }
+            
+            temp = validParameter.validFile(parameters, "ksize", false);               if (temp == "not found"){       
+                temp = "8";    
+                if (method == "zap") { temp = "7"; }
+            }
+                       m->mothurConvert(temp, kmerSize); 
                         
                         temp = validParameter.validFile(parameters, "match", false);            if (temp == "not found"){       temp = "1.0";                   }
                         m->mothurConvert(temp, match);  
@@ -466,6 +561,9 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
                         
                         temp = validParameter.validFile(parameters, "probs", false);            if (temp == "not found"){       temp = "true";                  }
                         probs = m->isTrue(temp);
+            
+            temp = validParameter.validFile(parameters, "shortcuts", false);   if (temp == "not found"){       temp = "true";                  }
+                       writeShortcuts = m->isTrue(temp);
                         
                         //temp = validParameter.validFile(parameters, "flip", false);                   if (temp == "not found"){       temp = "T";                             }
                         //flip = m->isTrue(temp); 
@@ -475,16 +573,23 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
                         m->mothurConvert(temp, iters); 
  
                         
-                       if ((method == "bayesian") && (search != "kmer"))  { 
-                               m->mothurOut("The bayesian method requires the kmer search." + search + "will be disregarded." ); m->mothurOutEndLine();
+                       if ((method == "wang") && (search != "kmer"))  { 
+                               m->mothurOut("The wang method requires the kmer search. " + search + " will be disregarded, and kmer will be used." ); m->mothurOutEndLine();
+                               search = "kmer";
+                       }
+            
+            if ((method == "zap") && ((search != "kmer") && (search != "align")))  { 
+                               m->mothurOut("The zap method requires the kmer or align search. " + search + " will be disregarded, and kmer will be used." ); m->mothurOutEndLine();
                                 search = "kmer";
                         }
                         
              if (!abort) {
-                if (namefileNames.size() == 0){
-                    if (fastaFileNames.size() != 0) {
-                        vector<string> files; files.push_back(fastaFileNames[fastaFileNames.size()-1]); 
-                        parser.getNameFile(files);
+                if (!hasCount) {
+                    if (namefileNames.size() == 0){
+                        if (fastaFileNames.size() != 0) {
+                            vector<string> files; files.push_back(fastaFileNames[fastaFileNames.size()-1]); 
+                            parser.getNameFile(files);
+                        }
                      }
                  }
              }
@@ -508,12 +613,18 @@ int ClassifySeqsCommand::execute(){
         try {
                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
          
-               if(method == "bayesian"){       classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip);             }
+        string outputMethodTag = method + ".";
+               if(method == "wang"){   classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip, writeShortcuts);     }
                 else if(method == "knn"){       classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted, rand());                               }
+        else if(method == "zap"){      
+            outputMethodTag = search + "_" + outputMethodTag;
+            if (search == "kmer") {   classify = new KmerTree(templateFileName, taxonomyFileName, kmerSize, cutoff); }
+            else {  classify = new AlignTree(templateFileName, taxonomyFileName, cutoff);  }
+        }
                 else {
-                       m->mothurOut(search + " is not a valid method option. I will run the command using bayesian.");
+                       m->mothurOut(search + " is not a valid method option. I will run the command using wang.");
                         m->mothurOutEndLine();
-                       classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip);     
+                       classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip, writeShortcuts);     
                 }
                 
                 if (m->control_pressed) { delete classify; return 0; }
@@ -522,7 +633,7 @@ int ClassifySeqsCommand::execute(){
                 
                         m->mothurOut("Classifying sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine();
                         
-                       string baseTName = taxonomyFileName;
+                       string baseTName = m->getSimpleName(taxonomyFileName);
                         if (taxonomyFileName == "saved") {baseTName = rdb->getSavedTaxonomy();  }
                         
              //set rippedTaxName to 
@@ -536,10 +647,10 @@ int ClassifySeqsCommand::execute(){
              if (RippedTaxName != "") { RippedTaxName +=  "."; }   
            
                         if (outputDir == "") { outputDir += m->hasPath(fastaFileNames[s]); }
-                       string newTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + getOutputFileNameTag("taxonomy");
-                       string newaccnosFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + getOutputFileNameTag("accnos");
+                       string newTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + outputMethodTag + getOutputFileNameTag("taxonomy");
+                       string newaccnosFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + outputMethodTag +getOutputFileNameTag("accnos");
                         string tempTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "taxonomy.temp";
-                       string taxSummary = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + getOutputFileNameTag("taxsummary");
+                       string taxSummary = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + outputMethodTag + getOutputFileNameTag("taxsummary");
                         
                         if ((method == "knn") && (search == "distance")) { 
                                 string DistName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + getOutputFileNameTag("matchdist");
@@ -694,46 +805,58 @@ int ClassifySeqsCommand::execute(){
                         }
                 #endif
  
-                       string group = "";
-                       if (groupfile != "") {  group = groupfileNames[s]; }
-                       
-                       PhyloSummary taxaSum(baseTName, group);
-                       
-                       if (m->control_pressed) { outputTypes.clear();  for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);        } delete classify; return 0; }
-               
-                       if (namefile == "") {  taxaSum.summarize(tempTaxonomyFile);  }
-                       else {
-                               ifstream in;
-                               m->openInputFile(tempTaxonomyFile, in);
-                               
-                               //read in users taxonomy file and add sequences to tree
-                               string name, taxon;
-                               
-                               while(!in.eof()){
-                                       in >> name >> taxon; m->gobble(in);
-                                       
-                                       itNames = nameMap.find(name);
-               
-                                       if (itNames == nameMap.end()) { 
-                                               m->mothurOut(name + " is not in your name file please correct."); m->mothurOutEndLine(); exit(1);
-                                       }else{
-                                               for (int i = 0; i < itNames->second.size(); i++) { 
-                                                       taxaSum.addSeqToTree(itNames->second[i], taxon);  //add it as many times as there are identical seqs
-                                               }
-                                               itNames->second.clear();
-                                               nameMap.erase(itNames->first);
-                                       }
-                               }
-                               in.close();
-                       }
+                string group = "";
+                GroupMap* groupMap = NULL;
+                CountTable* ct = NULL;
+                PhyloSummary* taxaSum;
+                if (hasCount) { 
+                    ct = new CountTable();
+                    ct->readTable(countfileNames[s]);
+                    taxaSum = new PhyloSummary(taxonomyFileName, ct);
+                    taxaSum->summarize(tempTaxonomyFile);
+                }else {
+                    if (groupfile != "") {  group = groupfileNames[s]; groupMap = new GroupMap(group); groupMap->readMap(); }
+                    
+                    taxaSum = new PhyloSummary(taxonomyFileName, groupMap);
+                    
+                    if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; }  if (groupMap != NULL) { delete groupMap; } delete taxaSum; for (int i = 0; i < outputNames.size(); i++) {   m->mothurRemove(outputNames[i]);        } delete classify; return 0; }
+                    
+                    if (namefile == "") {  taxaSum->summarize(tempTaxonomyFile);  }
+                    else {
+                        ifstream in;
+                        m->openInputFile(tempTaxonomyFile, in);
+                        
+                        //read in users taxonomy file and add sequences to tree
+                        string name, taxon;
+                        
+                        while(!in.eof()){
+                            if (m->control_pressed) { outputTypes.clear();  if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete taxaSum; for (int i = 0; i < outputNames.size(); i++) {   m->mothurRemove(outputNames[i]);        } delete classify; return 0; }
+                            
+                            in >> name >> taxon; m->gobble(in);
+                            
+                            itNames = nameMap.find(name);
+                            
+                            if (itNames == nameMap.end()) { 
+                                m->mothurOut(name + " is not in your name file please correct."); m->mothurOutEndLine(); exit(1);
+                            }else{
+                                for (int i = 0; i < itNames->second.size(); i++) { 
+                                    taxaSum->addSeqToTree(itNames->second[i], taxon);  //add it as many times as there are identical seqs
+                                }
+                                itNames->second.clear();
+                                nameMap.erase(itNames->first);
+                            }
+                        }
+                        in.close();
+                    }
+                }
                         m->mothurRemove(tempTaxonomyFile);
                         
-                       if (m->control_pressed) {  outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);        } delete classify; return 0; }
+                       if (m->control_pressed) {  outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        } delete classify; return 0; }
                         
                         //print summary file
                         ofstream outTaxTree;
                         m->openOutputFile(taxSummary, outTaxTree);
-                       taxaSum.print(outTaxTree);
+                       taxaSum->print(outTaxTree);
                         outTaxTree.close();
                         
                         //output taxonomy with the unclassified bins added
@@ -745,12 +868,12 @@ int ClassifySeqsCommand::execute(){
                         m->openOutputFile(unclass, outTax);
                         
                         //get maxLevel from phylotree so you know how many 'unclassified's to add
-                       int maxLevel = taxaSum.getMaxLevel();
+                       int maxLevel = taxaSum->getMaxLevel();
                                                         
                         //read taxfile - this reading and rewriting is done to preserve the confidence scores.
                         string name, taxon;
                         while (!inTax.eof()) {
-                               if (m->control_pressed) { outputTypes.clear();  for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);        } m->mothurRemove(unclass); delete classify; return 0; }
+                               if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; }  if (groupMap != NULL) { delete groupMap; } delete taxaSum; for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        } m->mothurRemove(unclass); delete classify; return 0; }
  
                                 inTax >> name >> taxon; m->gobble(inTax);
                                 
@@ -761,6 +884,8 @@ int ClassifySeqsCommand::execute(){
                         inTax.close();  
                         outTax.close();
                         
+            if (ct != NULL) { delete ct; }
+            if (groupMap != NULL) { delete groupMap; } delete taxaSum;
                         m->mothurRemove(newTaxonomyFile);
                         rename(unclass.c_str(), newTaxonomyFile.c_str());
                         
@@ -897,7 +1022,7 @@ int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile,
                         string extension = "";
                         if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); }
                         
-                       classifyData* tempclass = new classifyData((accnos + extension), probs, method, templateFileName, taxonomyFileName, (taxFileName + extension), (tempTaxFile + extension), filename, search, kmerSize, iters, numWanted, m, lines[i]->start, lines[i]->end, match, misMatch, gapOpen, gapExtend, cutoff, i, flip);
+                       classifyData* tempclass = new classifyData((accnos + extension), probs, method, templateFileName, taxonomyFileName, (taxFileName + extension), (tempTaxFile + extension), filename, search, kmerSize, iters, numWanted, m, lines[i]->start, lines[i]->end, match, misMatch, gapOpen, gapExtend, cutoff, i, flip, writeShortcuts);
                         pDataArray.push_back(tempclass);
                         
                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
diff --git a/classifyseqscommand.h b/classifyseqscommand.h

index 4965642e7f77f0d8638bceb177bd79788c8e39ad..0cffec6948a83849a078fbef99b81cb5fe7b4433 100644 (file)
--- a/classifyseqscommand.h
+++ b/classifyseqscommand.h
@@ -19,9 +19,11 @@
  #include "phylotree.h"
  #include "phylosummary.h"
  #include "knn.h"
+#include "kmertree.h"
+#include "aligntree.h"
  
  
-//KNN and Bayesian methods modeled from algorithms in
+//KNN and Wang methods modeled from algorithms in
  //Naı¨ve Bayesian Classiﬁer for Rapid Assignment of rRNA Sequences 
  //into the New Bacterial Taxonomy􏰎† 
  //Qiong Wang,1 George M. Garrity,1,2 James M. Tiedje,1,2 and James R. Cole1* 
@@ -62,6 +64,7 @@ private:
         vector<linePair*> lines;
         vector<string> fastaFileNames;
         vector<string> namefileNames;
+    vector<string> countfileNames;
         vector<string> groupfileNames;
         vector<string> outputNames;
         map<string, vector<string> > nameMap;
@@ -70,10 +73,10 @@ private:
         Classify* classify;
         ReferenceDB* rdb;
         
-       string fastaFileName, templateFileName, distanceFileName, namefile, search, method, taxonomyFileName, outputDir, groupfile;
+       string fastaFileName, templateFileName, countfile, distanceFileName, namefile, search, method, taxonomyFileName, outputDir, groupfile;
         int processors, kmerSize, numWanted, cutoff, iters;
         float match, misMatch, gapOpen, gapExtend;
-       bool abort, probs, save, flip;
+       bool abort, probs, save, flip, hasName, hasCount, writeShortcuts;
         
         int driver(linePair*, string, string, string, string);
         int createProcesses(string, string, string, string); 
@@ -99,10 +102,10 @@ struct classifyData {
         MothurOut* m;
         float match, misMatch, gapOpen, gapExtend;
         int count, kmerSize, threadID, cutoff, iters, numWanted;
-       bool probs, flip;
+       bool probs, flip, writeShortcuts;
          
         classifyData(){}
-       classifyData(string acc, bool p, string me, string te, string tx, string a, string r, string f, string se, int ks, int i, int numW, MothurOut* mout, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int cut, int tid, bool fli) {
+       classifyData(string acc, bool p, string me, string te, string tx, string a, string r, string f, string se, int ks, int i, int numW, MothurOut* mout, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int cut, int tid, bool fli, bool wsh) {
                 accnos = acc;
                 taxonomyFileName = tx;
                 templateFileName = te;
@@ -126,6 +129,7 @@ struct classifyData {
                 probs = p;
                 count = 0;
                 flip = fli;
+        writeShortcuts = wsh;
         }
  };
  
@@ -162,12 +166,17 @@ static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){
                 
                 //make classify
                 Classify* myclassify;
-               if(pDataArray->method == "bayesian"){   myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip);         }
+               if(pDataArray->method == "bayesian"){   myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip, pDataArray->writeShortcuts);             }
                 else if(pDataArray->method == "knn"){   myclassify = new Knn(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID);                           }
+        else if(pDataArray->method == "zap"){  
+            outputMethodTag = search + "_" + outputMethodTag;
+            if (pDataArray->search == "kmer") {   myclassify = new KmerTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->kmerSize, pDataArray->cutoff); }
+            else {  myclassify = new AlignTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->cutoff);  }
+        }
                 else {
                         pDataArray->m->mothurOut(pDataArray->search + " is not a valid method option. I will run the command using bayesian.");
                         pDataArray->m->mothurOutEndLine();
-                       myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip); 
+                       myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip, pDataArray->writeShortcuts);     
                 }
                 
                 if (pDataArray->m->control_pressed) { delete myclassify; return 0; }
diff --git a/classifysharedcommand.cpp b/classifysharedcommand.cpp

new file mode 100755 (executable)

index 0000000..f964937
--- /dev/null
+++ b/classifysharedcommand.cpp
@@ -0,0 +1,364 @@
+//
+//  classifysharedcommand.cpp
+//  Mothur
+//
+//  Created by Abu Zaher Md. Faridee on 8/13/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "classifysharedcommand.h"
+#include "randomforest.hpp"
+#include "decisiontree.hpp"
+#include "rftreenode.hpp"
+
+//**********************************************************************************************************************
+vector<string> ClassifySharedCommand::setParameters(){
+    try {
+        // Registers every option classify.shared accepts and returns the option names in registration order.
+        parameters.push_back(CommandParameter("shared", "InputTypes", "", "", "none", "none", "none",false,true));
+        parameters.push_back(CommandParameter("design", "InputTypes", "", "", "none", "none", "none",false,true));
+        parameters.push_back(CommandParameter("otupersplit", "Multiple", "log2-squareroot", "log2", "", "", "",false,false));
+        parameters.push_back(CommandParameter("splitcriteria", "Multiple", "gainratio-infogain", "gainratio", "", "", "",false,false));
+        parameters.push_back(CommandParameter("numtrees", "Number", "", "100", "", "", "",false,false));
+        // group/label selection and the input/output directories
+        parameters.push_back(CommandParameter("groups", "String", "", "", "", "", "",false,false));
+        parameters.push_back(CommandParameter("label", "String", "", "", "", "", "",false,false));
+        parameters.push_back(CommandParameter("inputdir", "String", "", "", "", "", "",false,false));
+        parameters.push_back(CommandParameter("outputdir", "String", "", "", "", "", "",false,false));
+
+        vector<string> optionNames;
+        for (size_t idx = 0; idx < parameters.size(); ++idx) { optionNames.push_back(parameters[idx].name); }
+        return optionNames;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ClassifySharedCommand", "setParameters");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+string ClassifySharedCommand::getHelpString(){
+    // Builds the help text for classify.shared; the parameter list names every option registered in setParameters().
+    try {
+        string helpString = "The classify.shared command uses a random forest to relate the OTUs in your shared file to the treatments in your design file.\n";
+        helpString += "The classify.shared command parameters are: shared, design, label, groups, otupersplit, splitcriteria and numtrees.\n";
+        helpString += "The otupersplit parameter accepts log2 or squareroot (default=log2), the splitcriteria parameter accepts gainratio or infogain (default=gainratio) and the numtrees parameter sets the number of trees (default=100).\n";
+        helpString += "The label parameter is used to analyze specific labels in your input.\n";
+        helpString += "The groups parameter allows you to specify which of the groups in your designfile you would like analyzed.\n";
+        helpString += "The classify.shared should be in the following format: \nclassify.shared(shared=yourSharedFile, design=yourDesignFile)\n";
+        return helpString;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ClassifySharedCommand", "getHelpString");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+string ClassifySharedCommand::getOutputFileNameTag(string type, string inputName=""){
+    // Returns the tag used in output file names for the given output type ("summary" is the only
+    // type with a tag), or "" after reporting an error. inputName is not used by this command.
+    try {
+        string tag = "";
+
+        //is this a type this command creates
+        map<string, vector<string> >::iterator it = outputTypes.find(type);
+        if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
+        else if (type == "summary") {  tag = "summary"; }
+        //a registered type without a tag definition also raises control_pressed
+        else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
+        return tag;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ClassifySharedCommand", "getOutputFileNameTag");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+
+ClassifySharedCommand::ClassifySharedCommand() {
+  // Default construction sets abort and calledHelp, so execute() returns 0 without doing any work.
+  try {
+    calledHelp = true; abort = true;
+    setParameters();
+    outputTypes["summary"] = vector<string>();
+  }
+  catch(exception& e) {
+    m->errorOut(e, "ClassifySharedCommand", "ClassifySharedCommand");
+    exit(1);
+  }
+}
+//**********************************************************************************************************************
+ClassifySharedCommand::ClassifySharedCommand(string option) {
+  try {
+    abort = false; calledHelp = false;
+    allLines = 1;
+      // Parses the option string, validates each parameter and sets abort when anything is unusable.
+      //allow user to run help
+    if(option == "help") { help(); abort = true; calledHelp = true; }
+    else if(option == "citation") { citation(); abort = true; calledHelp = true;}
+    
+    else {
+        //valid parameters for this command
+      vector<string> myArray = setParameters();
+      
+      OptionParser parser(option);
+      map<string,string> parameters = parser.getParameters();
+      
+      ValidParameters validParameter;
+      map<string,string>::iterator it;
+        //check to make sure all parameters are valid for command
+      for (it = parameters.begin(); it != parameters.end(); it++) {
+        if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
+      }
+        
+        vector<string> tempOutNames;
+        outputTypes["summary"] = tempOutNames;
+      
+        //if the user changes the input directory command factory will send this info to us in the output parameter
+      string inputDir = validParameter.validFile(parameters, "inputdir", false);
+      if (inputDir == "not found"){    inputDir = "";          }
+      else {
+        string path;
+        it = parameters.find("shared");
+          //user has given a shared file
+        if(it != parameters.end()){
+          path = m->hasPath(it->second);
+            //if the user has not given a path then, add inputdir. else leave path alone.
+          if (path == "") {    parameters["shared"] = inputDir + it->second;           }
+        }
+        
+        it = parameters.find("design");
+          //user has given a design file
+        if(it != parameters.end()){
+          path = m->hasPath(it->second);
+            //if the user has not given a path then, add inputdir. else leave path alone.
+          if (path == "") {    parameters["design"] = inputDir + it->second;           }
+        }
+        
+      }
+       
+        //check for parameters
+        //get shared file, it is required
+      sharedfile = validParameter.validFile(parameters, "shared", true);
+      if (sharedfile == "not open") { sharedfile = ""; abort = true; }
+      else if (sharedfile == "not found") {
+          //if there is a current shared file, use it
+        sharedfile = m->getSharedFile();
+        if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); }
+        else {         m->mothurOut("You have no current sharedfile and the shared parameter is required."); m->mothurOutEndLine(); abort = true; }
+      }else { m->setSharedFile(sharedfile); }
+      
+        //get design file, it is required
+      designfile = validParameter.validFile(parameters, "design", true);
+      if (designfile == "not open") { designfile = ""; abort = true; }   //clear designfile itself; sharedfile is still needed for outputDir below
+      else if (designfile == "not found") {
+          //if there is a current design file, use it
+        designfile = m->getDesignFile();
+        if (designfile != "") { m->mothurOut("Using " + designfile + " as input file for the design parameter."); m->mothurOutEndLine(); }
+        else {         m->mothurOut("You have no current designfile and the design parameter is required."); m->mothurOutEndLine(); abort = true; }
+      }else { m->setDesignFile(designfile); }
+
+      
+        //if the user changes the output directory command factory will send this info to us in the output parameter
+      outputDir = validParameter.validFile(parameters, "outputdir", false);            if (outputDir == "not found"){
+        outputDir = m->hasPath(sharedfile); //if user entered a file with a path then preserve it
+      }
+      
+    
+        //otupersplit: OTU per split selection criteria, 'log2' (default) or 'squareroot'
+      otupersplit = validParameter.validFile(parameters, "otupersplit", false);
+      if (otupersplit == "not found") { otupersplit = "log2"; }
+      if ((otupersplit == "squareroot") || (otupersplit == "log2")) {
+        optimumFeatureSubsetSelectionCriteria = otupersplit;
+      }else { m->mothurOut("Not a valid OTU per split selection method. Valid OTU per split selection methods are 'log2' and 'squareroot'."); m->mothurOutEndLine(); abort = true; }
+      
+        //splitcriteria: tree splitting criterion, 'gainratio' (default) or 'infogain'
+      splitcriteria = validParameter.validFile(parameters, "splitcriteria", false);
+      if (splitcriteria == "not found") { splitcriteria = "gainratio"; }
+      if ((splitcriteria == "gainratio") || (splitcriteria == "infogain")) {
+        treeSplitCriterion = splitcriteria;
+      }else { m->mothurOut("Not a valid tree splitting criterion. Valid tree splitting criteria are 'gainratio' and 'infogain'."); m->mothurOutEndLine(); abort = true; }
+      
+        //numtrees: number of decision trees, defaults to 100
+      string temp = validParameter.validFile(parameters, "numtrees", false); if (temp == "not found"){ temp = "100";   }
+      m->mothurConvert(temp, numDecisionTrees);
+
+        //groups is split at dashes and handed to m->setGroups(); the names are not checked for validity here
+      string groups = validParameter.validFile(parameters, "groups", false);
+      if (groups == "not found") { groups = ""; }
+      else { m->splitAtDash(groups, Groups); }
+      m->setGroups(Groups);
+      
+        //label selects which labels are processed; no label or 'all' keeps allLines set so every label is processed
+      string label = validParameter.validFile(parameters, "label", false);
+      if (label == "not found") { label = ""; }
+      else {
+        if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
+        else { allLines = 1;  }
+      }
+    }
+    
+  }
+  catch(exception& e) {
+    m->errorOut(e, "ClassifySharedCommand", "ClassifySharedCommand");
+    exit(1);
+  }
+}
+//**********************************************************************************************************************
+int ClassifySharedCommand::execute() {
+  try {
+    // Runs processSharedAndDesignData() on each requested label of the shared file. Returns 2 when the constructor aborted without help, otherwise 0.
+    if (abort == true) { if (calledHelp) { return 0; }  return 2;      }
+    
+    InputData input(sharedfile, "sharedfile");
+    vector<SharedRAbundVector*> lookup = input.getSharedRAbundVectors();
+        
+    //read design file
+    designMap.readDesignMap(designfile);
+    // lastLabel is the label of the set handled on the previous pass; userLabels holds the requested labels not yet processed
+    string lastLabel = lookup[0]->getLabel();
+    set<string> processedLabels;
+    set<string> userLabels = labels;
+    
+      //as long as you are not at the end of the file or done with the lines you want
+    while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
+      
+      if (m->control_pressed) { for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }  return 0; }
+      // process this set when every label is wanted or its label was requested
+      if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){
+        
+        m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+        
+        processSharedAndDesignData(lookup);  
+          
+        processedLabels.insert(lookup[0]->getLabel());
+        userLabels.erase(lookup[0]->getLabel());
+      }
+      // NOTE(review): anyLabelsToProcess presumably reports a requested label that was passed over - confirm; if so the unprocessed lastLabel set is re-read and processed instead
+      if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+        string saveLabel = lookup[0]->getLabel();
+        
+        for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
+        lookup = input.getSharedRAbundVectors(lastLabel);
+        m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+           
+        processSharedAndDesignData(lookup);        
+        
+        processedLabels.insert(lookup[0]->getLabel());
+        userLabels.erase(lookup[0]->getLabel());
+        
+          //restore real lastlabel to save below
+        lookup[0]->setLabel(saveLabel);
+      }
+      
+      lastLabel = lookup[0]->getLabel();
+        //free this set's vectors and null the pointers before the next set is read
+      for (int i = 0; i < lookup.size(); i++) {  delete lookup[i]; lookup[i] = NULL; }
+      
+      if (m->control_pressed) { return 0; }
+      
+        //get next line to process
+      lookup = input.getSharedRAbundVectors();
+    }
+    
+    if (m->control_pressed) {  return 0; }
+    
+      //output error messages about any remaining user labels
+    set<string>::iterator it;
+    bool needToRun = false;
+    for (it = userLabels.begin(); it != userLabels.end(); it++) {
+      m->mothurOut("Your file does not include the label " + *it);
+      if (processedLabels.count(lastLabel) != 1) {
+        m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
+        needToRun = true;
+      }else {
+        m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
+      }
+    }
+    
+      //run last label if you need to
+    if (needToRun == true)  {
+      for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }
+      lookup = input.getSharedRAbundVectors(lastLabel);
+      
+      m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+      
+      processSharedAndDesignData(lookup);  
+        
+      for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
+      
+    }
+      // print the file names collected in outputNames
+      m->mothurOutEndLine();
+      m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+      for (int i = 0; i < outputNames.size(); i++) {   m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
+      m->mothurOutEndLine();
+      
+    return 0;
+    
+  }
+  catch(exception& e) {
+    m->errorOut(e, "ClassifySharedCommand", "execute");
+    exit(1);
+  }
+}
+//**********************************************************************************************************************
+
+void ClassifySharedCommand::processSharedAndDesignData(vector<SharedRAbundVector*> lookup){
+    // Builds the training table for one label and runs the random forest on it.
+    // Every sample in lookup becomes one row holding the abundance of each OTU bin followed by
+    // the integer code of the treatment that the design map returns for that sample.
+    // A sample whose treatment is not a group of the design map sets control_pressed and returns early.
+    try {
+        //give every treatment named in the design file an integer code
+        vector<string> treatmentNames = designMap.getNamesOfGroups();
+        map<string, int> treatmentToIntMap;
+        for (int  i = 0; i < designMap.getNumGroups(); i++) {
+            treatmentToIntMap[treatmentNames[i]] = i;
+        }
+
+        int numSamples = lookup.size();
+        int numFeatures = lookup[0]->getNumBins();
+
+        int numRows = numSamples;
+        int numColumns = numFeatures + 1;           // extra one space needed for the treatment/outcome
+
+        vector< vector<int> > dataSet(numRows, vector<int>(numColumns, 0));
+
+        for (int i = 0; i < numSamples; i++) {
+            string sharedGroupName = lookup[i]->getGroup();
+            string treatmentName = designMap.getGroup(sharedGroupName);
+
+            //map::operator[] would insert a missing treatment with code 0 and mislabel the sample,
+            //so look the treatment up explicitly and refuse to train on a sample we cannot label
+            map<string, int>::iterator itTreatment = treatmentToIntMap.find(treatmentName);
+            if (itTreatment == treatmentToIntMap.end()) {
+                m->mothurOut("[ERROR]: " + sharedGroupName + " is not assigned to a treatment in your design file, please correct."); m->mothurOutEndLine();
+                m->control_pressed = true; return;
+            }
+
+            int j = 0;
+            for (; j < lookup[i]->getNumBins(); j++) {
+                dataSet[i][j] = lookup[i]->getAbundance(j);
+            }
+            dataSet[i][j] = itTreatment->second;   //the column after the last bin holds the outcome
+        }
+
+        RandomForest randomForest(dataSet, numDecisionTrees, treeSplitCriterion);
+        randomForest.populateDecisionTrees();
+        randomForest.calcForrestErrorRate();
+
+        //the summary file is named after the shared file and the label being processed
+        string filename = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + lookup[0]->getLabel() + "." + getOutputFileNameTag("summary");
+        outputNames.push_back(filename); outputTypes["summary"].push_back(filename);
+
+        randomForest.calcForrestVariableImportance(filename);
+
+        m->mothurOutEndLine();
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ClassifySharedCommand", "processSharedAndDesignData");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+
diff --git a/classifysharedcommand.h b/classifysharedcommand.h

new file mode 100755 (executable)

index 0000000..93c6286
--- /dev/null
+++ b/classifysharedcommand.h
@@ -0,0 +1,54 @@
+//
+//  classifysharedcommand.h
+//  Mothur
+//
+//  Created by Abu Zaher Md. Faridee on 8/13/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#ifndef __Mothur__classifysharedcommand__
+#define __Mothur__classifysharedcommand__
+
+#include "command.hpp"
+#include "inputdata.h"
+
+class ClassifySharedCommand : public Command {   // classify.shared: passes shared-file abundances and design-file treatments to a RandomForest
+public:
+  ClassifySharedCommand();          // sets abort and calledHelp, registers the parameters and the "summary" output type
+  ClassifySharedCommand(string);    // parses and validates the option string; sets abort on any problem
+  ~ClassifySharedCommand() {};
+  
+  vector<string> setParameters();   // registers the accepted parameters and returns their names
+  string getCommandName()                      { return "classify.shared";     }
+   string getCommandCategory()         { return "OTU-Based Approaches";                }
+  string getOutputFileNameTag(string, string);   // the definition in the .cpp defaults the second argument to ""
+  string getHelpString();
+  string getCitation() { return "http://www.mothur.org/wiki/Classify.shared\n"; }
+  string getDescription()              { return "description"; }   // NOTE(review): placeholder text
+  int execute();                    // returns 2 when aborted without help, otherwise 0
+  
+  void help() { m->mothurOut(getHelpString()); }
+
+private:
+    bool abort;                     // when true, execute() returns before reading any file
+    string outputDir;
+    vector<string> outputNames, Groups;
+  
+    string sharedfile, designfile, otupersplit, splitcriteria;
+    set<string> labels;             // labels requested through the label parameter
+    bool allLines;                  // assigned 1 or 0 by the constructor; 1 means process every label
+  // processors and useTiming are not referenced in classifysharedcommand.cpp
+    int processors;
+    bool useTiming;
+
+    GroupMap designMap;             // filled from designfile by readDesignMap() in execute()
+  
+    int numDecisionTrees;           // from the numtrees parameter (default "100")
+    string treeSplitCriterion, optimumFeatureSubsetSelectionCriteria;   // the latter is set from otupersplit but not passed to RandomForest in the .cpp
+    bool doPruning, discardHighErrorTrees;   // not referenced in classifysharedcommand.cpp
+    double pruneAggressiveness, highErrorTreeDiscardThreshold, featureStandardDeviationThreshold;   // not referenced in classifysharedcommand.cpp
+    
+    void processSharedAndDesignData(vector<SharedRAbundVector*> lookup);   // builds the data set for one label and runs the random forest
+};
+
+#endif /* defined(__Mothur__classifysharedcommand__) */
diff --git a/classifytreecommand.cpp b/classifytreecommand.cpp

index 7861a01bab3420f1598e351b30765dbd0ae5f124..69da8e0d74e58d54d7ae59c78c7ad7f91e9f4b29 100644 (file)
--- a/classifytreecommand.cpp
+++ b/classifytreecommand.cpp
@@ -15,8 +15,9 @@ vector<string> ClassifyTreeCommand::setParameters(){
         try {
                 CommandParameter ptree("tree", "InputTypes", "", "", "", "", "none",false,true); parameters.push_back(ptree);
          CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "", "", "none",false,true); parameters.push_back(ptaxonomy);
-        CommandParameter pname("name", "InputTypes", "", "", "", "", "none",false,false); parameters.push_back(pname);
-        CommandParameter pgroup("group", "InputTypes", "", "", "", "", "none",false,false); parameters.push_back(pgroup);
+        CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
          CommandParameter pcutoff("cutoff", "Number", "", "51", "", "", "",false,true); parameters.push_back(pcutoff);
                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
@@ -37,8 +38,9 @@ string ClassifyTreeCommand::getHelpString(){
                 helpString += "The classify.tree command reads a tree and taxonomy file and output the consensus taxonomy for each node on the tree. \n";
                 helpString += "If you provide a group file, the concensus for each group will also be provided. \n";
                 helpString += "The new tree contains labels at each internal node.  The label is the node number so you can relate the tree to the summary file.\n";
+        helpString += "The count parameter allows you add a count file so you can have the summary totals broken up by group.\n";
                 helpString += "The summary file lists the concensus taxonomy for the descendants of each node.\n";
-               helpString += "The classify.tree command parameters are tree, group, name and taxonomy. The tree and taxonomy files are required.\n";
+               helpString += "The classify.tree command parameters are tree, group, name, count and taxonomy. The tree and taxonomy files are required.\n";
          helpString += "The cutoff parameter allows you to specify a consensus confidence threshold for your taxonomy.  The default is 51, meaning 51%. Cutoff cannot be below 51.\n";
          helpString += "The classify.tree command should be used in the following format: classify.tree(tree=test.tre, group=test.group, taxonomy=test.taxonomy)\n";
                 helpString += "Note: No spaces between parameter labels (i.e. tree), '=' and parameters (i.e.yourTreefile).\n"; 
@@ -147,6 +149,14 @@ ClassifyTreeCommand::ClassifyTreeCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
                         
                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
@@ -178,16 +188,30 @@ ClassifyTreeCommand::ClassifyTreeCommand(string option)  {
                         else if (groupfile == "not found") { groupfile = ""; }
                         else { m->setGroupFile(groupfile); }
              
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not open") { countfile = ""; abort = true; }
+                       else if (countfile == "not found") { countfile = "";  } 
+                       else { m->setCountTableFile(countfile); }
+            
+            if ((namefile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
+            }
+                       
+            if ((groupfile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
+            }
+            
              string temp = validParameter.validFile(parameters, "cutoff", false);                       if (temp == "not found") { temp = "51"; }
                         m->mothurConvert(temp, cutoff); 
                         
                         if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true;  }
              
-            if (namefile == "") {
-                               vector<string> files; files.push_back(treefile);
-                               parser.getNameFile(files);
+            if (countfile == "") {
+                if (namefile == "") {
+                    vector<string> files; files.push_back(treefile);
+                    parser.getNameFile(files);
+                }
                         }
-                       
                 }
         }
         catch(exception& e) {
@@ -213,7 +237,7 @@ int ClassifyTreeCommand::execute(){
          
          TreeReader* reader = new TreeReader(treefile, groupfile, namefile);
          vector<Tree*> T = reader->getTrees();
-        TreeMap* tmap = T[0]->getTreeMap();
+        CountTable* tmap = T[0]->getCountTable();
          Tree* outputTree = T[0];
          delete reader;
  
@@ -367,10 +391,15 @@ string ClassifyTreeCommand::getTaxonomy(set<string> names, int& size) {
                                 if (itTax == taxMap.end()) { //this name is not in taxonomy file, skip it
                                         m->mothurOut((*it) + " is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine();
                                 }else{
-                                       //add seq to tree
-                                       phylo->addSeqToTree((*it), itTax->second);
-                    size++;
-                               }
+                                       if (countfile != "") {
+                        int numDups = ct->getNumSeqs((*it)); 
+                        for (int j = 0; j < numDups; j++) {  phylo->addSeqToTree((*it), itTax->second);  }
+                        size += numDups;
+                    }else{
+                        //add seq to tree
+                        phylo->addSeqToTree((*it), itTax->second);
+                        size++;  
+                    }                          }
                         }
              
                         if (m->control_pressed) { delete phylo; return conTax; }
@@ -444,12 +473,12 @@ map<string, set<string> > ClassifyTreeCommand::getDescendantList(Tree*& T, int i
                 
                 int lc = T->tree[i].getLChild();
                 int rc = T->tree[i].getRChild();
-        TreeMap* tmap = T->getTreeMap();
+       // TreeMap* tmap = T->getTreeMap();
                 
                 if (lc == -1) { //you are a leaf your only descendant is yourself
-            string group = tmap->getGroup(T->tree[i].getName());
+            vector<string> groups = T->tree[i].getGroup();
              set<string> mynames; mynames.insert(T->tree[i].getName());
-            names[group] = mynames; //mygroup -> me
+            for (int j = 0; j < groups.size(); j++) { names[groups[j]] = mynames;   } //mygroup -> me
              names["AllGroups"] = mynames;
                 }else{ //your descedants are the combination of your childrens descendants
                         names = descendants[lc];
diff --git a/classifytreecommand.h b/classifytreecommand.h

index 758a438dc068f0790aab0b8b309f32a0fb6f24a5..dd972b61b457378462392f771d759585bef53188 100644 (file)
--- a/classifytreecommand.h
+++ b/classifytreecommand.h
@@ -12,6 +12,7 @@
  #include "command.hpp"
  #include "readtree.h"
  #include "treemap.h"
+#include "counttable.h"
  
  class ClassifyTreeCommand : public Command {
  public:
@@ -31,13 +32,14 @@ public:
         void help() { m->mothurOut(getHelpString()); }  
         
  private:
-       string treefile, taxonomyfile, groupfile, namefile, outputDir;
+       string treefile, taxonomyfile, groupfile, namefile, countfile, outputDir;
         bool abort;
         vector<string> outputNames;
      int numUniquesInName, cutoff;
      map<string, string> nameMap;
      map<string, int> nameCount;
      map<string, string> taxMap;
+    CountTable* ct;
         
         int getClassifications(Tree*&);
         map<string, set<string> > getDescendantList(Tree*&, int, map<int, map<string, set<string> > >);
diff --git a/clusterclassic.cpp b/clusterclassic.cpp

index 2d1b9a6b781960a824fddafc03b904e8039179e1..32a9341613d07274d397c1ea9cc9d6b8d9383826 100644 (file)
--- a/clusterclassic.cpp
+++ b/clusterclassic.cpp
@@ -231,6 +231,205 @@ int ClusterClassic::readPhylipFile(string filename, NameAssignment* nameMap) {
                 exit(1);
         }
  
+}
+/***********************************************************************/
+int ClusterClassic::readPhylipFile(string filename, CountTable* countTable) {
+       try {
+               double distance;
+               int square;
+               string name;
+               vector<string> matrixNames;
+               
+               ifstream fileHandle;
+               m->openInputFile(filename, fileHandle);
+               
+        string numTest;
+               fileHandle >> numTest >> name;
+        
+        if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting."); m->mothurOutEndLine(); exit(1); }
+        else { convert(numTest, nseqs); }
+        
+        
+               matrixNames.push_back(name);
+        
+               if(countTable == NULL){
+            list = new ListVector(nseqs);
+            list->set(0, name);
+        }
+        else{  list = new ListVector(countTable->getListVector()); }
+
+               
+               //initialize distance matrix to cutoff
+               dMatrix.resize(nseqs);
+               //rowSmallDists.resize(nseqs, temp);
+               for (int i = 1; i < nseqs; i++) {                       
+                       dMatrix[i].resize(i, aboveCutoff);              
+               }                                                                                               
+        
+               
+               char d;
+               while((d=fileHandle.get()) != EOF){
+            
+            if(isalnum(d)){
+                square = 1;
+                fileHandle.putback(d);
+                for(int i=0;i<nseqs;i++){
+                    fileHandle >> distance;
+                }
+                break;
+            }
+            if(d == '\n'){
+                square = 0;
+                break;
+            }
+               }
+        
+               Progress* reading;
+        
+               if(square == 0){
+            
+            reading = new Progress("Reading matrix:     ", nseqs * (nseqs - 1) / 2);
+            
+            int        index = 0;
+            
+            for(int i=1;i<nseqs;i++){
+                if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
+                
+                fileHandle >> name;
+                matrixNames.push_back(name);
+                
+                
+                //there's A LOT of repeated code throughout this method...
+                 if(countTable == NULL){
+                    list->set(i, name);
+                    
+                    for(int j=0;j<i;j++){
+                        
+                        if (m->control_pressed) { delete reading; fileHandle.close(); return 0;  }
+                        
+                        fileHandle >> distance;
+                        
+                        if (distance == -1) { distance = 1000000; }
+                        else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
+                        
+                        //if(distance < cutoff){
+                        dMatrix[i][j] = distance;
+                        if (distance < smallDist) { smallDist = distance; }
+                        //if (rowSmallDists[i].dist > distance) {  rowSmallDists[i].dist = distance; rowSmallDists[i].col = j; rowSmallDists[i].row = i; }
+                        //if (rowSmallDists[j].dist > distance) {  rowSmallDists[j].dist = distance; rowSmallDists[j].col = i; rowSmallDists[j].row = j; }
+                        //}
+                        index++;
+                        reading->update(index);
+                    }
+                    
+                }
+                else{
+                    for(int j=0;j<i;j++){
+                        fileHandle >> distance;
+                        
+                        if (m->control_pressed) { delete reading; fileHandle.close(); return 0;  }
+                        
+                        if (distance == -1) { distance = 1000000; }
+                        else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
+                        
+                        if (distance < smallDist) { smallDist = distance; }
+                        
+                        int row = countTable->get(matrixNames[i]);
+                        int col = countTable->get(matrixNames[j]);
+                       
+                        if (row < col) {  dMatrix[col][row] = distance; }
+                        else { dMatrix[row][col] = distance; }
+                        
+                        index++;
+                        reading->update(index);
+                    }
+                }
+            }
+               }
+               else{
+            
+            reading = new Progress("Reading matrix:     ", nseqs * nseqs);
+            
+            int index = nseqs;
+            
+            for(int i=1;i<nseqs;i++){
+                fileHandle >> name;                
+                matrixNames.push_back(name);
+                
+                if(countTable == NULL){
+                    list->set(i, name);
+                    for(int j=0;j<nseqs;j++){
+                        fileHandle >> distance;
+                        
+                        if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
+                        
+                        if (distance == -1) { distance = 1000000; }
+                        else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.
+                        
+                        if(j < i){
+                            if (distance < smallDist) { smallDist = distance; }
+                            
+                            dMatrix[i][j] = distance;
+                        }
+                        index++;
+                        reading->update(index);
+                    }
+                    
+                }
+                else{
+                    
+                    for(int j=0;j<nseqs;j++){
+                        fileHandle >> distance;
+                        
+                        if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
+                        
+                        if (distance == -1) { distance = 1000000; }
+                        else if (sim) { distance = 1.0 - distance;  }  //user has entered a sim matrix that we need to convert.                                                        
+                        
+                        if(j < i){
+                            if (distance < smallDist) { smallDist = distance; }
+                            
+                            int row = countTable->get(matrixNames[i]);
+                            int col = countTable->get(matrixNames[j]);
+                            
+                            if (row < col) {  dMatrix[col][row] = distance; }
+                            else { dMatrix[row][col] = distance; }
+                        }
+                        index++;
+                        reading->update(index);
+                    }
+                }
+            }
+               }
+               
+               if (m->control_pressed) {  fileHandle.close();  delete reading; return 0; }
+               
+               reading->finish();
+               delete reading;
+        
+               list->setLabel("0");
+               rabund = new RAbundVector();
+        rabund->setLabel(list->getLabel());  
+        
+        for(int i = 0; i < list->getNumBins(); i++) { 
+            if (m->control_pressed) { break; }
+            vector<string> binNames;
+            string bin = list->get(i);
+            m->splitAtComma(bin, binNames);
+            int total = 0;
+            for (int j = 0; j < binNames.size(); j++) { total += countTable->getNumSeqs(binNames[j]);  }
+            rabund->push_back(total);   
+        }
+               
+               fileHandle.close();
+               
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ClusterClassic", "readPhylipFile");
+               exit(1);
+       }
+    
  }
  /***********************************************************************/
  //sets smallCol and smallRow, returns distance
@@ -398,16 +597,12 @@ void ClusterClassic::setMapWanted(bool f)  {
                         
                         //parse bin 
                         string names = list->get(i);
-                       while (names.find_first_of(',') != -1) { 
-                               //get name from bin
-                               string name = names.substr(0,names.find_first_of(','));
+                       vector<string> binnames;
+            m->splitAtComma(names, binnames);
+            for (int j = 0; j < binnames.size(); j++) {
                                 //save name and bin number
-                               seq2Bin[name] = i;
-                               names = names.substr(names.find_first_of(',')+1, names.length());
+                               seq2Bin[binnames[j]] = i;
                         }
-                       
-                       //get last name
-                       seq2Bin[names] = i;
                 }
                 
         }
@@ -420,17 +615,13 @@ void ClusterClassic::setMapWanted(bool f)  {
  void ClusterClassic::updateMap() {
  try {
                 //update location of seqs in smallRow since they move to smallCol now
-               string names = list->get(smallRow);
-               while (names.find_first_of(',') != -1) { 
-                       //get name from bin
-                       string name = names.substr(0,names.find_first_of(','));
-                       //save name and bin number
-                       seq2Bin[name] = smallCol;
-                       names = names.substr(names.find_first_of(',')+1, names.length());
-               }
-                       
-               //get last name
-               seq2Bin[names] = smallCol;
+        string names = list->get(smallRow);
+        vector<string> binnames;
+        m->splitAtComma(names, binnames);
+        for (int j = 0; j < binnames.size(); j++) {
+            //save name and bin number
+            seq2Bin[binnames[j]] = smallCol;
+        }
                 
         }
         catch(exception& e) {
diff --git a/clusterclassic.h b/clusterclassic.h

index a650bbf8de79147c3bd48bbcf1f63fdf22c6f574..eaccb273b4980b297e1ad4d9f419d6494e37a120 100644 (file)
--- a/clusterclassic.h
+++ b/clusterclassic.h
@@ -6,6 +6,7 @@
  #include "listvector.hpp"
  #include "rabundvector.hpp"
  #include "nameassignment.hpp"
+#include "counttable.h"
  
  /*
   *  clusterclassic.h
@@ -22,6 +23,7 @@ class ClusterClassic {
  public:
         ClusterClassic(float, string, bool);
         int readPhylipFile(string, NameAssignment*);
+    int readPhylipFile(string, CountTable*);
         void update(double&);
         double getSmallDist() { return smallDist; }     
         int getNSeqs() { return nseqs; }        
diff --git a/clustercommand.cpp b/clustercommand.cpp

index 19eaf85bc04fdce079f21a189c7d5bd4baa6eca0..06e627a615dacf69fe80464db0123582bef510ed 100644 (file)
--- a/clustercommand.cpp
+++ b/clustercommand.cpp
@@ -154,6 +154,14 @@ ClusterCommand::ClusterCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a count file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
                         
                         //check for required parameters
@@ -481,12 +489,12 @@ void ClusterCommand::printData(string label){
                 loops = 0;
                 start = time(NULL);
          
+        oldRAbund.setLabel(label);
          if (countfile == "") {
              oldRAbund.print(rabundFile);
              oldRAbund.getSAbundVector().print(sabundFile);
          }
-        
-        oldRAbund.setLabel(label);
+       
          if (m->isTrue(showabund)) {
              oldRAbund.getSAbundVector().print(cout);
          }
diff --git a/clusterdoturcommand.cpp b/clusterdoturcommand.cpp

index 9bfb52b9d92b0222af2e4b1acff37c82f968349f..2515b5c15129434f4d45c19f798c76c7645ad369 100644 (file)
--- a/clusterdoturcommand.cpp
+++ b/clusterdoturcommand.cpp
@@ -14,7 +14,8 @@
  vector<string> ClusterDoturCommand::setParameters(){   
         try {
                 CommandParameter pphylip("phylip", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pphylip);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
+               CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount);
                 CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff);
                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
                 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod);
@@ -37,7 +38,7 @@ string ClusterDoturCommand::getHelpString(){
         try {
                 string helpString = "";
                 helpString += "The cluster.classic command clusters using the algorithm from dotur. \n";
-               helpString += "The cluster.classic command parameter options are phylip, name, method, cuttoff, hard, sim, precision. Phylip is required, unless you have a valid current file.\n";
+               helpString += "The cluster.classic command parameter options are phylip, name, count, method, cuttoff, hard, sim, precision. Phylip is required, unless you have a valid current file.\n";
                 helpString += "The cluster.classic command should be in the following format: \n";
                 helpString += "cluster.classic(phylip=yourDistanceMatrix, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n";
                 helpString += "The acceptable cluster methods are furthest, nearest, weighted and average.  If no method is provided then average is assumed.\n";       
@@ -132,7 +133,14 @@ ClusterDoturCommand::ClusterDoturCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
                                 }
-
+                
+                it = parameters.find("count");
+                               //user has given a count file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
                         
                         //initialize outputTypes
@@ -159,10 +167,17 @@ ClusterDoturCommand::ClusterDoturCommand(string option)  {
                 
                         //check for optional parameter and set defaults
                         namefile = validParameter.validFile(parameters, "name", true);
-                       if (namefile == "not open") { abort = true; }   
+                       if (namefile == "not open") { abort = true; namefile = ""; }    
                         else if (namefile == "not found") { namefile = ""; }
                         else { m->setNameFile(namefile); }
+            
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not open") { abort = true; countfile = ""; }  
+                       else if (countfile == "not found") { countfile = ""; }
+                       else { m->setCountTableFile(countfile); }
                         
+            if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.classic command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
+            
                         string temp;
                         temp = validParameter.validFile(parameters, "precision", false);
                         if (temp == "not found") { temp = "100"; }
@@ -204,36 +219,49 @@ int ClusterDoturCommand::execute(){
         
                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
                 
-               if(namefile != ""){     
+        
+        ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
+        
+        NameAssignment* nameMap = NULL;
+        CountTable* ct = NULL;
+        if(namefile != "") {   
                         nameMap = new NameAssignment(namefile);
                         nameMap->readMap();
-               }else{
-                       nameMap = NULL;
-               }
-               
-               //reads phylip file storing data in 2D vector, also fills list and rabund
-               ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
-               cluster->readPhylipFile(phylipfile, nameMap);
-               
-               if (m->control_pressed) { delete cluster; delete list; delete rabund; return 0; }
+            cluster->readPhylipFile(phylipfile, nameMap);
+            delete nameMap;
+               }else if (countfile != "") {
+            ct = new CountTable();
+            ct->readTable(countfile);
+            cluster->readPhylipFile(phylipfile, ct);
+            delete ct;
+        }else {
+            cluster->readPhylipFile(phylipfile, nameMap);
+        }
+        tag = cluster->getTag();
+        
+               if (m->control_pressed) { delete cluster; return 0; }
                 
                 list = cluster->getListVector();
                 rabund = cluster->getRAbundVector();
-                                               
+                                                               
                 if (outputDir == "") { outputDir += m->hasPath(phylipfile); }
                 fileroot = outputDir + m->getRootName(m->getSimpleName(phylipfile));
                         
          string sabundFileName = fileroot+ tag + "." + getOutputFileNameTag("sabund");
          string rabundFileName = fileroot+ tag + "." + getOutputFileNameTag("rabund");
-        string listFileName = fileroot+ tag + "." + getOutputFileNameTag("list");
+        string listFileName = fileroot+ tag + ".";
+        if (countfile != "") { listFileName += "unique_"; }
+        listFileName += getOutputFileNameTag("list");
          
-               m->openOutputFile(sabundFileName,       sabundFile);
-               m->openOutputFile(rabundFileName,       rabundFile);
+        if (countfile == "") {
+            m->openOutputFile(sabundFileName,  sabundFile);
+            m->openOutputFile(rabundFileName,  rabundFile);
+            outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
+            outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
+            
+        }
                 m->openOutputFile(listFileName, listFile);
-               
-               outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
-               outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
-               outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
+        outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
                 
                 float previousDist = 0.00000;
                 float rndPreviousDist = 0.00000;
@@ -245,7 +273,8 @@ int ClusterDoturCommand::execute(){
                 int estart = time(NULL);
         
                 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
-                       if (m->control_pressed) { delete cluster; delete list; delete rabund; sabundFile.close();rabundFile.close();listFile.close();  for (int i = 0; i < outputNames.size(); i++) {   m->mothurRemove(outputNames[i]);        } outputTypes.clear();  return 0;  }
+                       if (m->control_pressed) { delete cluster; delete list; delete rabund; if(countfile == "") {rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); }
+                listFile.close(); m->mothurRemove((fileroot+ tag + ".list")); outputTypes.clear();  return 0;  }
                 
                         cluster->update(cutoff);
         
@@ -276,18 +305,14 @@ int ClusterDoturCommand::execute(){
                 else if(rndPreviousDist<cutoff){
                         printData(toString(rndPreviousDist, length-1));
                 }
-                                       
-               sabundFile.close();
-               rabundFile.close();
+               
+        if (countfile == "") {
+            sabundFile.close();
+            rabundFile.close();
+        }
                 listFile.close();
                 
-               delete cluster; delete nameMap; delete list; delete rabund;
-       
-               //if (saveCutoff != cutoff) { 
-               //      if (hard)       {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
-               //      else            {       saveCutoff = m->roundDist(saveCutoff, precision);  }
-               //      m->mothurOut("changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); 
-               //}
+               delete cluster;  delete list; delete rabund;
                 
                 //set list file as new current listfile
                 string current = "";
@@ -327,11 +352,12 @@ int ClusterDoturCommand::execute(){
  
  void ClusterDoturCommand::printData(string label){
         try {
-       
-               oldRAbund.setLabel(label);
-               oldRAbund.print(rabundFile);
-               oldRAbund.getSAbundVector().print(sabundFile);
-               
+        oldRAbund.setLabel(label);
+        if (countfile == "") {
+            oldRAbund.print(rabundFile);
+            oldRAbund.getSAbundVector().print(sabundFile);
+        }
+
                 oldRAbund.getSAbundVector().print(cout);
                 
                 oldList.setLabel(label);
diff --git a/clusterdoturcommand.h b/clusterdoturcommand.h

index 09ee8229984a488f5eeecbcf269922a01c5d9500..dd61a35bdcf3e45a72a590ec9b430584cc30376d 100644 (file)
--- a/clusterdoturcommand.h
+++ b/clusterdoturcommand.h
@@ -37,7 +37,7 @@ public:
         
  private:
         bool abort, hard, sim;
-       string method, fileroot, tag, outputDir, phylipfile, namefile;
+       string method, fileroot, tag, outputDir, phylipfile, namefile, countfile;
         double cutoff;
         int precision, length;
         ofstream sabundFile, rabundFile, listFile;
diff --git a/clusterfragmentscommand.cpp b/clusterfragmentscommand.cpp

index 4a338419af8900f8b7ebd693225a7e9f4d193481..f785c506db502c32c52209badf1af3258b97f1c9 100644 (file)
--- a/clusterfragmentscommand.cpp
+++ b/clusterfragmentscommand.cpp
@@ -29,7 +29,8 @@ inline bool comparePriority(seqRNode first, seqRNode second) {
  vector<string> ClusterFragmentsCommand::setParameters(){       
         try {
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
+               CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount);
                 CommandParameter pdiffs("diffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pdiffs);
                 CommandParameter ppercent("percent", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppercent);
                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
@@ -49,8 +50,8 @@ string ClusterFragmentsCommand::getHelpString(){
         try {
                 string helpString = "";
                 helpString += "The cluster.fragments command groups sequences that are part of a larger sequence.\n";
-               helpString += "The cluster.fragments command outputs a new fasta and name file.\n";
-               helpString += "The cluster.fragments command parameters are fasta, name, diffs and percent. The fasta parameter is required, unless you have a valid current file. \n";
+               helpString += "The cluster.fragments command outputs a new fasta and name or count file.\n";
+               helpString += "The cluster.fragments command parameters are fasta, name, count, diffs and percent. The fasta parameter is required, unless you have a valid current file. \n";
                 helpString += "The names parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n";
                 helpString += "The diffs parameter allows you to set the number of differences allowed, default=0. \n";
                 helpString += "The percent parameter allows you to set percentage of differences allowed, default=0. percent=2 means if the number of difference is less than or equal to two percent of the length of the fragment, then cluster.\n";
@@ -78,6 +79,7 @@ string ClusterFragmentsCommand::getOutputFileNameTag(string type, string inputNa
          else {
              if (type == "fasta") {  outputFileName =  "fragclust.fasta"; }
              else if (type == "name") {  outputFileName =  "fragclust.names"; }
+            else if (type == "count") {  outputFileName =  "fragclust.count_table"; }
              else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
          }
          return outputFileName;
@@ -96,6 +98,7 @@ ClusterFragmentsCommand::ClusterFragmentsCommand(){
                 vector<string> tempOutNames;
                 outputTypes["fasta"] = tempOutNames;
                 outputTypes["name"] = tempOutNames;
+        outputTypes["count"] = tempOutNames;
         }
         catch(exception& e) {
                 m->errorOut(e, "ClusterFragmentsCommand", "ClusterFragmentsCommand");
@@ -129,6 +132,7 @@ ClusterFragmentsCommand::ClusterFragmentsCommand(string option) {
                         vector<string> tempOutNames;
                         outputTypes["fasta"] = tempOutNames;
                         outputTypes["name"] = tempOutNames;
+            outputTypes["count"] = tempOutNames;
                         
                         //if the user changes the input directory command factory will send this info to us in the output parameter 
                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
@@ -150,6 +154,14 @@ ClusterFragmentsCommand::ClusterFragmentsCommand(string option) {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a count file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
  
                         //check for required parameters
@@ -171,6 +183,13 @@ ClusterFragmentsCommand::ClusterFragmentsCommand(string option) {
                         if (namefile == "not found") { namefile =  "";  }
                         else if (namefile == "not open") { namefile = ""; abort = true; }       
                         else {  readNameFile(); m->setNameFile(namefile); }
+            
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not open") { abort = true; countfile = ""; }  
+                       else if (countfile == "not found") { countfile = ""; }
+                       else { ct.readTable(countfile); m->setCountTableFile(countfile); }
+                       
+            if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.fragments command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
                         
                         string temp;
                         temp = validParameter.validFile(parameters, "diffs", false);            if (temp == "not found"){       temp = "0";                             }
@@ -179,10 +198,12 @@ ClusterFragmentsCommand::ClusterFragmentsCommand(string option) {
                         temp = validParameter.validFile(parameters, "percent", false);          if (temp == "not found"){       temp = "0";                             }
                         m->mothurConvert(temp, percent);
                         
-                       if (namefile == "") {
-                               vector<string> files; files.push_back(fastafile);
-                               parser.getNameFile(files);
-                       }
+                       if (countfile == "") {
+                if (namefile == "") {
+                    vector<string> files; files.push_back(fastafile);
+                    parser.getNameFile(files);
+                }
+            }
                         
                 }
                                 
@@ -229,10 +250,13 @@ int ClusterFragmentsCommand::execute(){
                                                 string jBases = alignSeqs[j].seq.getUnaligned();
                                                                                                         
                                                 if (isFragment(iBases, jBases)) {
-                                                       //merge
-                                                       alignSeqs[i].names += ',' + alignSeqs[j].names;
-                                                       alignSeqs[i].numIdentical += alignSeqs[j].numIdentical;
-
+                            if (countfile != "") {
+                                ct.mergeCounts(alignSeqs[i].names, alignSeqs[j].names);
+                            }else {
+                                //merge
+                                alignSeqs[i].names += ',' + alignSeqs[j].names;
+                                alignSeqs[i].numIdentical += alignSeqs[j].numIdentical;
+                            }
                                                         alignSeqs[j].active = 0;
                                                         alignSeqs[j].numIdentical = 0;
                                                         count++;
@@ -254,6 +278,7 @@ int ClusterFragmentsCommand::execute(){
                 
                 string newFastaFile = fileroot + getOutputFileNameTag("fasta");
                 string newNamesFile = fileroot + getOutputFileNameTag("name");
+        if (countfile != "") { newNamesFile = fileroot + getOutputFileNameTag("count"); }
                 
                 if (m->control_pressed) { return 0; }
                 
@@ -285,6 +310,11 @@ int ClusterFragmentsCommand::execute(){
                 if (itTypes != outputTypes.end()) {
                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
                 }
+        
+        itTypes = outputTypes.find("count");
+               if (itTypes != outputTypes.end()) {
+                       if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
+               }
  
                 return 0;
                 
@@ -372,7 +402,10 @@ int ClusterFragmentsCommand::readFASTA(){
                                         else{
                                                 seqRNode tempNode(itSize->second, seq, names[seq.getName()], seq.getUnaligned().length());
                                                 alignSeqs.push_back(tempNode);
-                                       }       
+                                       }
+                }else if(countfile != "") {
+                    seqRNode tempNode(ct.getNumSeqs(seq.getName()), seq, seq.getName(), seq.getUnaligned().length());
+                    alignSeqs.push_back(tempNode);
                                 }else { //no names file, you are identical to yourself 
                                         seqRNode tempNode(1, seq, seq.getName(), seq.getUnaligned().length());
                                         alignSeqs.push_back(tempNode);
@@ -396,17 +429,18 @@ void ClusterFragmentsCommand::printData(string newfasta, string newname){
                 ofstream outNames;
                 
                 m->openOutputFile(newfasta, outFasta);
-               m->openOutputFile(newname, outNames);
+               if (countfile == "") {  m->openOutputFile(newname, outNames); }
                 
                 for (int i = 0; i < alignSeqs.size(); i++) {
                         if (alignSeqs[i].numIdentical != 0) {
                                 alignSeqs[i].seq.printSequence(outFasta); 
-                               outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl;
+                               if (countfile == "") {  outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl;  }
                         }
                 }
                 
                 outFasta.close();
-               outNames.close();
+               if (countfile == "") {  outNames.close(); }
+        else { ct.printTable(newname); }
         }
         catch(exception& e) {
                 m->errorOut(e, "ClusterFragmentsCommand", "printData");
@@ -438,6 +472,5 @@ void ClusterFragmentsCommand::readNameFile(){
                 exit(1);
         }
  }
-
  /**************************************************************************************************/
  
diff --git a/clusterfragmentscommand.h b/clusterfragmentscommand.h

index c322529f3bedf95600a8b30e0ac013be2870d290..e3d861a8a1477c0edf71ae7208c86560ceb56860 100644 (file)
--- a/clusterfragmentscommand.h
+++ b/clusterfragmentscommand.h
@@ -13,6 +13,7 @@
  
  #include "command.hpp"
  #include "sequence.hpp"
+#include "counttable.h"
  
  /************************************************************/
  struct seqRNode {
@@ -46,8 +47,9 @@ public:
         void help() { m->mothurOut(getHelpString()); }  
         
  private:
+    CountTable ct;
         bool abort;
-       string fastafile, namefile, outputDir;
+       string fastafile, namefile, countfile, outputDir;
         int diffs, percent;
         vector<seqRNode> alignSeqs; 
         map<string, string> names; //represents the names file first column maps to second column
diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp

index b097f382024d5be9c56bd346723e398b10ef3725..b3ce0f938c90d55fcfb86360677e3fd5e93a01b5 100644 (file)
--- a/clustersplitcommand.cpp
+++ b/clustersplitcommand.cpp
@@ -16,7 +16,8 @@ vector<string> ClusterSplitCommand::setParameters(){
                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName",false,false); parameters.push_back(ptaxonomy);
                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none",false,false); parameters.push_back(pphylip);
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName",false,false); parameters.push_back(pfasta);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname);
+               CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "",false,false); parameters.push_back(pcount);
                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName",false,false); parameters.push_back(pcolumn);
                 CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "",false,false); parameters.push_back(ptaxlevel);
                 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "",false,false); parameters.push_back(psplitmethod);
@@ -45,7 +46,7 @@ vector<string> ClusterSplitCommand::setParameters(){
  string ClusterSplitCommand::getHelpString(){   
         try {
                 string helpString = "";
-               helpString += "The cluster.split command parameter options are fasta, phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n";
+               helpString += "The cluster.split command parameter options are fasta, phylip, column, name, count, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Fasta or Phylip or column and name are required.\n";
                 helpString += "The cluster.split command can split your files in 3 ways. Splitting by distance file, by classification, or by classification also using a fasta file. \n";
                 helpString += "For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n";
                 helpString += "For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n";
@@ -54,7 +55,8 @@ string ClusterSplitCommand::getHelpString(){
                 helpString += "You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n";
                 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
                 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
-               helpString += "The name parameter allows you to enter your name file and is required if your distance file is in column format. \n";
+               helpString += "The name parameter allows you to enter your name file. \n";
+        helpString += "The count parameter allows you to enter your count file. A count or name file is required if your distance file is in column format. \n";
                 helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
                 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
                 helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
@@ -196,6 +198,14 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a count file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
                         
                         //check for required parameters
@@ -210,9 +220,14 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
                         else {  distfile = columnfile; format = "column";       m->setColumnFile(columnfile); }
                         
                         namefile = validParameter.validFile(parameters, "name", true);
-                       if (namefile == "not open") { abort = true; }   
+                       if (namefile == "not open") { abort = true; namefile = "";}     
                         else if (namefile == "not found") { namefile = "";  }
                         else { m->setNameFile(namefile); }
+            
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not open") { abort = true; countfile = "";}   
+                       else if (countfile == "not found") { countfile = "";  }
+                       else { m->setCountTableFile(countfile); }
                         
                         fastafile = validParameter.validFile(parameters, "fasta", true);
                         if (fastafile == "not open") { abort = true; }  
@@ -243,14 +258,20 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
                                 }
                         }
                         else if ((phylipfile != "") && (columnfile != "") && (fastafile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: fasta, phylip or column."); m->mothurOutEndLine(); abort = true; }
-               
+            
+            if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
+            
                         if (columnfile != "") {
-                               if (namefile == "") { 
+                               if ((namefile == "") && (countfile == "")) { 
                                         namefile = m->getNameFile(); 
                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
                                         else { 
-                                               m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); 
-                                               abort = true; 
+                                               countfile = m->getCountTableFile();
+                        if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
+                        else { 
+                            m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine(); 
+                            abort = true; 
+                        }      
                                         }       
                                 }
                         }
@@ -265,12 +286,16 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
                                         }       
                                 }
                                 
-                               if (namefile == "") { 
+                               if ((namefile == "") && (countfile == "")) { 
                                         namefile = m->getNameFile(); 
                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
                                         else { 
-                                               m->mothurOut("You need to provide a namefile if you are if you are using a fasta file to generate the split."); m->mothurOutEndLine(); 
-                                               abort = true; 
+                                               countfile = m->getCountTableFile();
+                        if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
+                        else { 
+                            m->mothurOut("You need to provide a namefile or countfile if you are going to use the fasta file to generate the split."); m->mothurOutEndLine(); 
+                            abort = true; 
+                        }      
                                         }       
                                 }
                         }
@@ -379,7 +404,7 @@ int ClusterSplitCommand::execute(){
                 
                         //if no names file given with phylip file, create it
                         ListVector* listToMakeNameFile =  convert->getListVector();
-                       if (namefile == "") {  //you need to make a namefile for split matrix
+                       if ((namefile == "") && (countfile == "")) {  //you need to make a namefile for split matrix
                                 ofstream out;
                                 namefile = phylipfile + ".names";
                                 m->openOutputFile(namefile, out);
@@ -401,9 +426,9 @@ int ClusterSplitCommand::execute(){
                 
                 //split matrix into non-overlapping groups
                 SplitMatrix* split;
-               if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large);                                                       }
-               else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large);                                       }
-               else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir);     }
+               if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, cutoff, splitmethod, large);                                                    }
+               else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, countfile, taxFile, taxLevelCutoff, splitmethod, large);                                    }
+               else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, countfile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir);  }
                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0;             }
                 
                 split->split();
@@ -666,15 +691,21 @@ map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames,
                 
                 //read in singletons
                 if (singleton != "none") {
-                       ifstream in;
-                       m->openInputFile(singleton, in);
+            
+            ifstream in;
+            m->openInputFile(singleton, in);
                                 
                         string firstCol, secondCol;
                         listSingle = new ListVector();
+            
+            if (countfile != "") { m->getline(in); m->gobble(in); }
+            
                         while (!in.eof()) {
-                               in >> firstCol >> secondCol; m->gobble(in);
-                               listSingle->push_back(secondCol);
+                               in >> firstCol >> secondCol; m->getline(in); m->gobble(in);
+                               if (countfile == "") { listSingle->push_back(secondCol); }
+                else { listSingle->push_back(firstCol); }
                         }
+            
                         in.close();
                         m->mothurRemove(singleton);
                         
@@ -775,15 +806,21 @@ int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> us
                 
          string sabundFileName = fileroot+ tag + "." + getOutputFileNameTag("sabund");
          string rabundFileName = fileroot+ tag + "." + getOutputFileNameTag("rabund");
-        string listFileName = fileroot+ tag + "." + getOutputFileNameTag("list");
+        string listFileName = fileroot+ tag + ".";
+        if (countfile != "") { listFileName += "unique_"; }
+        listFileName += getOutputFileNameTag("list");
          
-               m->openOutputFile(sabundFileName,       outSabund);
-               m->openOutputFile(rabundFileName,       outRabund);
+        if (countfile == "") {
+            m->openOutputFile(sabundFileName,  outSabund);
+            m->openOutputFile(rabundFileName,  outRabund);
+            outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
+            outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
+            
+        }
                 m->openOutputFile(listFileName, outList);
+        outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
+               
                 
-               outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
-               outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
-               outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);               
                 map<float, int>::iterator itLabel;
  
                 //for each label needed
@@ -794,22 +831,25 @@ int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> us
                         else { thisLabel = toString(itLabel->first,  length-1);  } 
                         
                         outList << thisLabel << '\t' << itLabel->second << '\t';
-
-                       RAbundVector* rabund = new RAbundVector();
-                       rabund->setLabel(thisLabel);
+            
+            RAbundVector* rabund = NULL;
+            if (countfile == "") {
+                rabund = new RAbundVector();
+                rabund->setLabel(thisLabel);
+            }
  
                         //add in singletons
                         if (listSingle != NULL) {
                                 for (int j = 0; j < listSingle->getNumBins(); j++) {
                                         outList << listSingle->get(j) << '\t';
-                                       rabund->push_back(m->getNumNames(listSingle->get(j)));
+                                       if (countfile == "") { rabund->push_back(m->getNumNames(listSingle->get(j))); }
                                 }
                         }
                         
                         //get the list info from each file
                         for (int k = 0; k < listNames.size(); k++) {
         
-                               if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]);  } delete rabund; return 0; }
+                               if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]);  } if (rabund != NULL) { delete rabund; } return 0; }
                                 
                                 InputData* input = new InputData(listNames[k], "list");
                                 ListVector* list = input->getListVector(thisLabel);
@@ -819,26 +859,28 @@ int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> us
                                 else {          
                                         for (int j = 0; j < list->getNumBins(); j++) {
                                                 outList << list->get(j) << '\t';
-                                               rabund->push_back(m->getNumNames(list->get(j)));
+                                               if (countfile == "") { rabund->push_back(m->getNumNames(list->get(j))); }
                                         }
                                         delete list;
                                 }
                                 delete input;
                         }
                         
-                       SAbundVector sabund = rabund->getSAbundVector();
-                       
-                       sabund.print(outSabund);
-                       rabund->print(outRabund);
+            if (countfile == "") {
+                SAbundVector sabund = rabund->getSAbundVector();
+                sabund.print(outSabund);
+                rabund->print(outRabund);
+            }
                         outList << endl;
                         
-                       delete rabund;
+                       if (rabund != NULL) { delete rabund; }
                 }
                 
                 outList.close();
-               outRabund.close();
-               outSabund.close();
-               
+        if (countfile == "") {
+            outRabund.close();
+            outSabund.close();
+               }
                 if (listSingle != NULL) { delete listSingle;  }
                 
                 for (int i = 0; i < listNames.size(); i++) {  m->mothurRemove(listNames[i]);  }
@@ -993,7 +1035,7 @@ vector<string>  ClusterSplitCommand::createProcesses(vector< map<string, string>
                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
                 //Taking advantage of shared memory to allow both threads to add labels.
                 //////////////////////////////////////////////////////////////////////////////////////////////////////
-               
+               /*
                 vector<clusterData*> pDataArray; 
                 DWORD   dwThreadIdArray[processors-1];
                 HANDLE  hThreadArray[processors-1]; 
@@ -1031,7 +1073,7 @@ vector<string>  ClusterSplitCommand::createProcesses(vector< map<string, string>
                         CloseHandle(hThreadArray[i]);
                         delete pDataArray[i];
                 }
-
+*/
         #endif          
          
          return listFiles;
@@ -1101,16 +1143,25 @@ string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisN
  
          m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
          
-        NameAssignment* nameMap = new NameAssignment(thisNamefile);
-        nameMap->readMap();
-                               
-               //reads phylip file storing data in 2D vector, also fills list and rabund
+        //reads phylip file storing data in 2D vector, also fills list and rabund
          bool sim = false;
                 ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
-               cluster->readPhylipFile(thisDistFile, nameMap);
-               tag = cluster->getTag();
          
-               if (m->control_pressed) { delete cluster; return 0; }
+        NameAssignment* nameMap = NULL;
+        CountTable* ct = NULL;
+        if(namefile != ""){    
+                       nameMap = new NameAssignment(thisNamefile);
+                       nameMap->readMap();
+            cluster->readPhylipFile(thisDistFile, nameMap);
+               }else if (countfile != "") {
+            ct = new CountTable();
+            ct->readTable(thisNamefile);
+            cluster->readPhylipFile(thisDistFile, ct);
+        }
+        tag = cluster->getTag();
+        
+               if (m->control_pressed) { if(namefile != ""){   delete nameMap; }
+            else { delete ct; } delete cluster; return 0; }
                 
                 list = cluster->getListVector();
                 rabund = cluster->getRAbundVector();
@@ -1136,7 +1187,8 @@ string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisN
          m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
          
                 while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
-                       if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close();  return listFileName;  }
+                       if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close();  if(namefile != ""){    delete nameMap; }
+                else { delete ct; } return listFileName;  }
              
                         cluster->update(cutoff);
              
@@ -1179,8 +1231,12 @@ string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisN
          
                 listFile.close();
                 
-               delete cluster; delete nameMap; delete list; delete rabund;
-
+               delete cluster;  delete list; delete rabund;
+        if(namefile != ""){    delete nameMap; }
+        else { delete ct; }
+        
+        m->mothurRemove(thisDistFile);
+        m->mothurRemove(thisNamefile);
          
          return listFileName;
          
@@ -1219,18 +1275,30 @@ string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile
          ReadMatrix* read = new ReadColumnMatrix(thisDistFile);         
          read->setCutoff(cutoff);
          
-        NameAssignment* nameMap = new NameAssignment(thisNamefile);
-        nameMap->readMap();
-        read->read(nameMap);
-        
-        if (m->control_pressed) {  delete read; delete nameMap; return listFileName; }
-        
-        list = read->getListVector();
+        NameAssignment* nameMap = NULL;
+        CountTable* ct = NULL;
+               if(namefile != ""){     
+                       nameMap = new NameAssignment(thisNamefile);
+                       nameMap->readMap();
+            read->read(nameMap);
+               }else if (countfile != "") {
+            ct = new CountTable();
+            ct->readTable(thisNamefile);
+            read->read(ct);
+        }
+               
+               list = read->getListVector();
          oldList = *list;
-        matrix = read->getDMatrix();
+               matrix = read->getDMatrix();
          
+               if(countfile != "") {
+            rabund = new RAbundVector();
+            createRabund(ct, list, rabund); //creates an rabund that includes the counts for the unique list
+            delete ct;
+        }else { rabund = new RAbundVector(list->getRAbundVector()); }
+
          delete read;  read = NULL;
-        delete nameMap; nameMap = NULL;
+        if (namefile != "") { delete nameMap; nameMap = NULL; }
          
          
  #ifdef USE_MPI
@@ -1242,8 +1310,6 @@ string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile
          
          m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
                 
-        rabund = new RAbundVector(list->getRAbundVector());
-        
          //create cluster
          if (method == "furthest")      {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
          else if(method == "nearest"){  cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
@@ -1385,3 +1451,24 @@ int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> >
         }
  }
  //**********************************************************************************************************************
+int ClusterSplitCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
+    try {
+        rabund->setLabel(list->getLabel());        
+        for(int i = 0; i < list->getNumBins(); i++) { 
+            if (m->control_pressed) { break; }
+            vector<string> binNames;
+            string bin = list->get(i);
+            m->splitAtComma(bin, binNames);
+            int total = 0;
+            for (int j = 0; j < binNames.size(); j++) { total += ct->getNumSeqs(binNames[j]);  }
+            rabund->push_back(total);   
+        }
+        return 0;
+    }
+    catch(exception& e) {
+               m->errorOut(e, "ClusterCommand", "createRabund");
+               exit(1);
+       }
+    
+}
+//**********************************************************************************************************************
diff --git a/clustersplitcommand.h b/clustersplitcommand.h

index 59039ea8600a4ac7e4ea30ab2a684679f6c9f51e..936ae6f69ce94877b60b9f99219423db8b526d73 100644 (file)
--- a/clustersplitcommand.h
+++ b/clustersplitcommand.h
@@ -47,7 +47,7 @@ private:
         vector<int> processIDS;   //processid
         vector<string> outputNames;
         
-       string method, fileroot, tag, outputDir, phylipfile, columnfile, namefile, distfile, format, showabund, timing, splitmethod, taxFile, fastafile;
+       string method, fileroot, tag, outputDir, phylipfile, columnfile, namefile, countfile, distfile, format, showabund, timing, splitmethod, taxFile, fastafile;
         double cutoff, splitcutoff;
         int precision, length, processors, taxLevelCutoff;
         bool print_start, abort, hard, large, classic;
@@ -62,6 +62,7 @@ private:
         int mergeLists(vector<string>, map<float, int>, ListVector*);
         map<float, int> completeListFile(vector<string>, string, set<string>&, ListVector*&);
         int createMergedDistanceFile(vector< map<string, string> >);
+    int createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund);
  };
  
  /////////////////not working for Windows////////////////////////////////////////////////////////////
@@ -75,7 +76,7 @@ private:
  // anything to do with mothur's use of copy constructors in many of our data structures. ie. listvector 
  // is copied by nameassignment and passed to read which passes to the thread?  -westcott 2-8-12
  ////////////////////////////////////////////////////////////////////////////////////////////////////
-/**************************************************************************************************/
+/**************************************************************************************************
  //custom data structure for threads to use.
  // This is passed by void pointer so it can be any data type
  // that can be passed using a single void pointer (LPVOID).
@@ -105,7 +106,7 @@ struct clusterData {
         }
  };
  
-/**************************************************************************************************/
+/**************************************************************************************************
  #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
  #else
  static DWORD WINAPI MyClusterThreadFunction(LPVOID lpParam){ 
@@ -257,7 +258,7 @@ static DWORD WINAPI MyClusterThreadFunction(LPVOID lpParam){
  } 
  #endif
  
-
+*/
  
  
  #endif
diff --git a/commandfactory.cpp b/commandfactory.cpp

index 02af6767b00b724459c054839b789798f4b295f7..6d87a68c49c6d9154572adcbc51ebc3b4555a35f 100644 (file)
--- a/commandfactory.cpp
+++ b/commandfactory.cpp
@@ -134,6 +134,8 @@
  #include "removeotulabelscommand.h"
  #include "makecontigscommand.h"
  #include "loadlogfilecommand.h"
+#include "sffmultiplecommand.h"
+#include "classifysharedcommand.h"
  
  /*******************************************************/
  
@@ -290,7 +292,10 @@ CommandFactory::CommandFactory(){
      commands["make.contigs"]        = "make.contigs";
      commands["load.logfile"]        = "load.logfile";
      commands["make.table"]          = "make.table";
+    commands["sff.multiple"]        = "sff.multiple";
         commands["quit"]                                = "MPIEnabled"; 
+    commands["classify.shared"]                = "classify.shared"; 
+    
  
  }
  /***********************************************************/
@@ -503,6 +508,8 @@ Command* CommandFactory::getCommand(string commandName, string optionString){
          else if(commandName == "remove.otulabels")      {      command = new RemoveOtuLabelsCommand(optionString);         }
          else if(commandName == "make.contigs")          {      command = new MakeContigsCommand(optionString);             }
          else if(commandName == "load.logfile")          {      command = new LoadLogfileCommand(optionString);             }
+        else if(commandName == "sff.multiple")          {      command = new SffMultipleCommand(optionString);             }
+        else if(commandName == "classify.shared")       {      command = new ClassifySharedCommand(optionString);          }
                 else                                                                                    {       command = new NoCommand(optionString);                                          }
  
                 return command;
@@ -657,6 +664,8 @@ Command* CommandFactory::getCommand(string commandName, string optionString, str
          else if(commandName == "remove.otulabels")      {      pipecommand = new RemoveOtuLabelsCommand(optionString);         }
          else if(commandName == "make.contigs")          {      pipecommand = new MakeContigsCommand(optionString);             }
          else if(commandName == "load.logfile")          {      pipecommand = new LoadLogfileCommand(optionString);             }
+        else if(commandName == "sff.multiple")          {      pipecommand = new SffMultipleCommand(optionString);             }
+        else if(commandName == "classify.shared")       {      pipecommand = new ClassifySharedCommand(optionString);          }
                 else                                                                                    {       pipecommand = new NoCommand(optionString);                                              }
  
                 return pipecommand;
@@ -797,6 +806,8 @@ Command* CommandFactory::getCommand(string commandName){
          else if(commandName == "remove.otulabels")      {      shellcommand = new RemoveOtuLabelsCommand();        }
          else if(commandName == "make.contigs")          {      shellcommand = new MakeContigsCommand();            }
          else if(commandName == "load.logfile")          {      shellcommand = new LoadLogfileCommand();            }
+        else if(commandName == "sff.multiple")          {      shellcommand = new SffMultipleCommand();            }
+        else if(commandName == "classify.shared")       {      shellcommand = new ClassifySharedCommand();         }
                 else                                                                                    {       shellcommand = new NoCommand();                                         }
  
                 return shellcommand;
diff --git a/consensus.cpp b/consensus.cpp

index 1be052f3aee81f6f0aded053de8249dd958118dd..d45a395604148c398f0f7c5a3d379476e3bdda97 100644 (file)
--- a/consensus.cpp
+++ b/consensus.cpp
@@ -21,7 +21,7 @@ Tree* Consensus::getTree(vector<Tree*>& t){
                 
                 if (m->control_pressed) { return 0; }
                 
-               consensusTree = new Tree(t[0]->getTreeMap());
+               consensusTree = new Tree(t[0]->getCountTable());
                 
                 it2 = nodePairs.find(treeSet);
                 
@@ -37,8 +37,7 @@ Tree* Consensus::getTree(vector<Tree*>& t){
                 
                 if (m->control_pressed) {  delete consensusTree; return 0; }
                 
-        map<string, string> empty;
-               consensusTree->assembleTree(empty);
+               consensusTree->assembleTree();
                 
                 if (m->control_pressed) {  delete consensusTree; return 0; }
                                 
diff --git a/consensusseqscommand.cpp b/consensusseqscommand.cpp

index d6158ba3ed3bc147027d54098aa71398182dc2f6..94d66827068338c9b254f8e3aa30d9ec65301db9 100644 (file)
--- a/consensusseqscommand.cpp
+++ b/consensusseqscommand.cpp
@@ -15,7 +15,8 @@
  vector<string> ConsensusSeqsCommand::setParameters(){  
         try {
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
+               CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount);
                 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plist);
                 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
                 CommandParameter pcutoff("cutoff", "Number", "", "100", "", "", "",false,false); parameters.push_back(pcutoff);
@@ -36,7 +37,7 @@ string ConsensusSeqsCommand::getHelpString(){
         try {
                 string helpString = "";
                 helpString += "The consensus.seqs command can be used in 2 ways: create a consensus sequence from a fastafile, or with a listfile create a consensus sequence for each otu. Sequences must be aligned.\n";
-               helpString += "The consensus.seqs command parameters are fasta, list, name, cutoff and label.\n";
+               helpString += "The consensus.seqs command parameters are fasta, list, name, count, cutoff and label.\n";
                 helpString += "The fasta parameter allows you to enter the fasta file containing your sequences, and is required, unless you have a valid current fasta file. \n";
                 helpString += "The list parameter allows you to enter a your list file. \n";
                 helpString += "The name parameter allows you to enter a names file associated with the fasta file. \n";
@@ -65,6 +66,7 @@ string ConsensusSeqsCommand::getOutputFileNameTag(string type, string inputName=
          else {
              if (type == "fasta") {  outputFileName =  "cons.fasta"; }
              else if (type == "name") {  outputFileName =  "cons.names"; }
+            else if (type == "count") {  outputFileName =  "cons.count_table"; }
              else if (type == "summary") {  outputFileName =  "cons.summary"; }
              else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
          }
@@ -84,6 +86,7 @@ ConsensusSeqsCommand::ConsensusSeqsCommand(){
                 vector<string> tempOutNames;
                 outputTypes["fasta"] = tempOutNames;
                 outputTypes["name"] = tempOutNames;
+        outputTypes["count"] = tempOutNames;
                 outputTypes["summary"] = tempOutNames;
         }
         catch(exception& e) {
@@ -120,6 +123,7 @@ ConsensusSeqsCommand::ConsensusSeqsCommand(string option)  {
                         vector<string> tempOutNames;
                         outputTypes["fasta"] = tempOutNames;
                         outputTypes["name"] = tempOutNames;
+            outputTypes["count"] = tempOutNames;
                         outputTypes["summary"] = tempOutNames;
                         
                                                 
@@ -151,6 +155,14 @@ ConsensusSeqsCommand::ConsensusSeqsCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a count file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
                         
                         
@@ -168,6 +180,13 @@ ConsensusSeqsCommand::ConsensusSeqsCommand(string option)  {
                         else if (namefile == "not found") { namefile = ""; }
                         else { m->setNameFile(namefile); }
                         
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not open") { abort = true; countfile = ""; }  
+                       else if (countfile == "not found") { countfile = ""; }
+                       else { m->setCountTableFile(countfile); }
+                       
+            if ((countfile != "") && (namefile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
+            
                         listfile = validParameter.validFile(parameters, "list", true);
                         if (listfile == "not open") { abort = true; }
                         else if (listfile == "not found") { listfile = "";  }   
@@ -186,10 +205,12 @@ ConsensusSeqsCommand::ConsensusSeqsCommand(string option)  {
                         //if the user changes the output directory command factory will send this info to us in the output parameter 
                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(fastafile);      }
                         
-                       if (namefile == ""){
-                               vector<string> files; files.push_back(fastafile); 
-                               parser.getNameFile(files);
-                       }
+            if (countfile == "") {
+                if (namefile == ""){
+                    vector<string> files; files.push_back(fastafile); 
+                    parser.getNameFile(files);
+                }
+            }
                 }
         }
         catch(exception& e) {
@@ -209,6 +230,7 @@ int ConsensusSeqsCommand::execute(){
                 if (m->control_pressed) { return 0; }
                 
                 if (namefile != "") { readNames(); }
+        if (countfile != "") { ct.readTable(countfile);  }
                 
                 if (m->control_pressed) { return 0; }
                 
@@ -227,25 +249,12 @@ int ConsensusSeqsCommand::execute(){
                         string outputFastaFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("fasta");
                         m->openOutputFile(outputFastaFile, outFasta);
                         outputNames.push_back(outputFastaFile); outputTypes["fasta"].push_back(outputFastaFile);
-                       
-                       vector<string> seqs;
-                       int seqLength = 0;
-                       for (map<string, string>::iterator it = nameMap.begin(); it != nameMap.end(); it++) {
-                               
-                               if (m->control_pressed) { outSummary.close(); outFasta.close(); for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } return 0; }
-                               
-                               string seq = fastaMap[it->second];
-                               seqs.push_back(seq);
-                               
-                               if (seqLength == 0) { seqLength = seq.length(); }
-                               else if (seqLength != seq.length()) { m->mothurOut("[ERROR]: sequence are not the same length, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
-
-                       }
-                       
+        
                         vector< vector<float> > percentages; percentages.resize(5);
                         for (int j = 0; j < percentages.size(); j++) { percentages[j].resize(seqLength, 0.0); }
                         
                         string consSeq = "";
+            int thisCount;
                         //get counts
                         for (int j = 0; j < seqLength; j++) {
                                 
@@ -253,41 +262,55 @@ int ConsensusSeqsCommand::execute(){
                                 
                                 vector<int> counts; counts.resize(5, 0); //A,T,G,C,Gap
                                 int numDots = 0;
-                               
-                               for (int i = 0; i < seqs.size(); i++) {
+                               thisCount = 0;
+                               for (map<string, string>::iterator it = fastaMap.begin(); it != fastaMap.end(); it++) {
                                         
-                                       if (seqs[i][j] == '.') { numDots++; }
-                                       
-                                       char base = toupper(seqs[i][j]);
-                                       if (base == 'A') { counts[0]++; }
-                                       else if (base == 'T') { counts[1]++; }
-                                       else if (base == 'G') { counts[2]++; }
-                                       else if (base == 'C') { counts[3]++; }
-                                       else { counts[4]++; }
+                    string thisSeq = it->second;
+                    int size = 0;
+                    
+                    if (countfile != "") { size = ct.getNumSeqs(it->first); }
+                    else {
+                        map<string, int>::iterator itCount = nameFileMap.find(it->first);
+                        if (itCount != nameFileMap.end()) {
+                            size = itCount->second;
+                        }else { m->mothurOut("[ERROR]: file mismatch, aborting.\n"); m->control_pressed = true; break; }
+                    }
+                    
+                    for (int k = 0; k < size; k++) {
+                        if (thisSeq[j] == '.') { numDots++; }
+                        
+                        char base = toupper(thisSeq[j]);
+                        if (base == 'A') { counts[0]++; }
+                        else if (base == 'T') { counts[1]++; }
+                        else if (base == 'G') { counts[2]++; }
+                        else if (base == 'C') { counts[3]++; }
+                        else { counts[4]++; }
+                        thisCount++;
+                    }
                                 }
                                 
                                 char conBase = '.';
-                               if (numDots != seqs.size()) { conBase = getBase(counts, seqs.size()); }
+                               if (numDots != thisCount) { conBase = getBase(counts, thisCount); }
                                 
                                 consSeq += conBase;
                                 
-                               percentages[0][j] = counts[0] / (float) seqs.size();
-                               percentages[1][j] = counts[1] / (float) seqs.size();
-                               percentages[2][j] = counts[2] / (float) seqs.size();
-                               percentages[3][j] = counts[3] / (float) seqs.size();
-                               percentages[4][j] = counts[4] / (float) seqs.size();
-                               
+                               percentages[0][j] = counts[0] / (float) thisCount;
+                               percentages[1][j] = counts[1] / (float) thisCount;
+                               percentages[2][j] = counts[2] / (float) thisCount;
+                               percentages[3][j] = counts[3] / (float) thisCount;
+                               percentages[4][j] = counts[4] / (float) thisCount;
                         }
                         
                         for (int j = 0; j < seqLength; j++) { 
-                               outSummary << (j+1) << '\t' << percentages[0][j] << '\t'<< percentages[1][j] << '\t'<< percentages[2][j] << '\t' << percentages[3][j] << '\t' << percentages[4][j] << '\t' << seqs.size() << '\t' << consSeq[j] << endl;
+                               outSummary << (j+1) << '\t' << percentages[0][j] << '\t'<< percentages[1][j] << '\t'<< percentages[2][j] << '\t' << percentages[3][j] << '\t' << percentages[4][j] << '\t' << thisCount << '\t' << consSeq[j] << endl;
                         }
                         
                                 
                         outFasta << ">conseq" << endl << consSeq << endl;
                         
                         outSummary.close(); outFasta.close();
-                       
+            
+                       if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {       m->mothurRemove(outputNames[i]); } return 0; }
                 
                 }else {
                         
@@ -414,12 +437,10 @@ int ConsensusSeqsCommand::processList(ListVector*& list){
                         if (m->control_pressed) { outSummary.close(); outName.close(); outFasta.close(); return 0; }
                         
                         string bin = list->get(i);
-                       
-                       string newName = "";
-                       string consSeq = getConsSeq(bin, outSummary, newName, i);
+                       string consSeq = getConsSeq(bin, outSummary, i);
                         
                         outFasta << ">seq" << (i+1) << endl << consSeq << endl;
-                       outName << "seq" << (i+1) << '\t' << "seq" << (i+1) << "," << newName << endl;
+                       outName << "seq" << (i+1) << '\t' << "seq" << (i+1) << "," << bin << endl;
                 }
                 
                 outSummary.close(); outName.close(); outFasta.close();
@@ -434,96 +455,127 @@ int ConsensusSeqsCommand::processList(ListVector*& list){
  }
  
  //***************************************************************************************************************
-//made this smart enough to owrk with unique or non unique list file
-string ConsensusSeqsCommand::getConsSeq(string bin, ofstream& outSummary, string& name, int binNumber){
+string ConsensusSeqsCommand::getConsSeq(string bin, ofstream& outSummary, int binNumber){
         try{
                 
                 string consSeq = "";
                 bool error = false;
-               
-               //the whole bin is the second column if no names file, otherwise build it
-               name = bin;
-               if (namefile != "") { name = ""; }
-               
+        int totalSize=0;
+                               
                 vector<string> binNames;
                 m->splitAtComma(bin, binNames);
-               
-               //get sequence strings for each name in the bin
-               vector<string> seqs;
-               
-               set<string> addedAlready;
-               int seqLength = 0;
-               for (int i = 0; i < binNames.size(); i++) {
-                       
-                       map<string, string>::iterator it;
-                       
-                       it = nameMap.find(binNames[i]);
-                       if (it == nameMap.end()) { 
-                               if (namefile == "") { m->mothurOut("[ERROR]: " + binNames[i] + " is not in your fasta file, please correct."); m->mothurOutEndLine(); error = true; }
-                               else { m->mothurOut("[ERROR]: " + binNames[i] + " is not in your fasta or name file, please correct."); m->mothurOutEndLine(); error = true; }
-                               break;
-                       }else {
-                               
-                               //add sequence string to seqs vector to process below
-                               string seq = fastaMap[it->second];
-                               seqs.push_back(seq);
-                               
-                               if (seqLength == 0) { seqLength = seq.length(); }
-                               else if (seqLength != seq.length()) { m->mothurOut("[ERROR]: sequence are not the same length, please correct."); m->mothurOutEndLine(); error = true; break; }
-                               
-                               if (namefile != "") { 
-                                       //did we add this line from name file already?
-                                       if (addedAlready.count(it->second) == 0) {
-                                               name += "," + nameFileMap[it->second];
-                                               addedAlready.insert(it->second);
-                                       }
-                               }
-                               
-                       }
-               }
-               
-               if (error) { m->control_pressed = true; return consSeq; }
-               
-               if (namefile != "") { name = name.substr(1); }
-               
-               vector< vector<float> > percentages; percentages.resize(5);
+        
+        vector< vector<float> > percentages; percentages.resize(5);
                 for (int j = 0; j < percentages.size(); j++) { percentages[j].resize(seqLength, 0.0); }
-               
-               //get counts
-               for (int j = 0; j < seqLength; j++) {
-                       
-                       if (m->control_pressed) { return consSeq; }
-                       
-                       vector<int> counts; counts.resize(5, 0); //A,T,G,C,Gap
-                       int numDots = 0;
-                       
-                       for (int i = 0; i < seqs.size(); i++) {
-                               
-                               if (seqs[i][j] == '.') { numDots++; }
-                               
-                               char base = toupper(seqs[i][j]);
-                               if (base == 'A') { counts[0]++; }
-                               else if (base == 'T') { counts[1]++; }
-                               else if (base == 'G') { counts[2]++; }
-                               else if (base == 'C') { counts[3]++; }
-                               else { counts[4]++; }
-                       }
-                       
-                       char conBase = '.';
-                       if (numDots != seqs.size()) { conBase = getBase(counts, seqs.size()); }
-                       
-                       consSeq += conBase;
-                       
-                       percentages[0][j] = counts[0] / (float) seqs.size();
-                       percentages[1][j] = counts[1] / (float) seqs.size();
-                       percentages[2][j] = counts[2] / (float) seqs.size();
-                       percentages[3][j] = counts[3] / (float) seqs.size();
-                       percentages[4][j] = counts[4] / (float) seqs.size();
-                       
+
+        if (countfile != "") {
+            //get counts
+            for (int j = 0; j < seqLength; j++) {
+                
+                if (m->control_pressed) { return consSeq; }
+                
+                vector<int> counts; counts.resize(5, 0); //A,T,G,C,Gap
+                int numDots = 0;
+                totalSize = 0;
+                 for (int i = 0; i < binNames.size(); i++) {
+                     if (m->control_pressed) { return consSeq; }
+                     
+                     string thisSeq = "";
+                     map<string, string>::iterator itFasta = fastaMap.find(binNames[i]);
+                     if (itFasta != fastaMap.end()) {
+                         thisSeq = itFasta->second;
+                     }else { m->mothurOut("[ERROR]: " + binNames[i] + " is not in your fasta file, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
+                     
+                     int size = ct.getNumSeqs(binNames[i]);
+                     if (size != 0) {
+                         for (int k = 0; k < size; k++) {
+                             if (thisSeq[j] == '.') { numDots++; }
+                             
+                             char base = toupper(thisSeq[j]);
+                             if (base == 'A') { counts[0]++; }
+                             else if (base == 'T') { counts[1]++; }
+                             else if (base == 'G') { counts[2]++; }
+                             else if (base == 'C') { counts[3]++; }
+                             else { counts[4]++; }
+                             totalSize++;
+                         }
+                     }else { m->mothurOut("[ERROR]: " + binNames[i] + " is not in your count file, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
+                 }
+                char conBase = '.';
+                if (numDots != totalSize) { conBase = getBase(counts, totalSize); }
+                
+                consSeq += conBase;
+                
+                percentages[0][j] = counts[0] / (float) totalSize;
+                percentages[1][j] = counts[1] / (float) totalSize;
+                percentages[2][j] = counts[2] / (float) totalSize;
+                percentages[3][j] = counts[3] / (float) totalSize;
+                percentages[4][j] = counts[4] / (float) totalSize;
+            }
+
+        }else {
+               
+            //get sequence strings for each name in the bin
+            vector<string> seqs;
+            for (int i = 0; i < binNames.size(); i++) {
+                
+                map<string, string>::iterator it;
+                it = nameMap.find(binNames[i]);
+                if (it == nameMap.end()) { 
+                    if (namefile == "") { m->mothurOut("[ERROR]: " + binNames[i] + " is not in your fasta file, please correct."); m->mothurOutEndLine(); error = true; }
+                    else { m->mothurOut("[ERROR]: " + binNames[i] + " is not in your fasta or name file, please correct."); m->mothurOutEndLine(); error = true; }
+                    break;
+                }else {
+                    //add sequence string to seqs vector to process below
+                    map<string, string>::iterator itFasta = fastaMap.find(it->second);
+                    
+                    if (itFasta != fastaMap.end()) {
+                        string seq = itFasta->second;
+                        seqs.push_back(seq);
+                    }else { m->mothurOut("[ERROR]: file mismatch, aborting. \n"); }
+                }
+            }
+            
+            if (error) { m->control_pressed = true; return consSeq; }
+            totalSize = seqs.size();
+            //get counts
+            for (int j = 0; j < seqLength; j++) {
+                
+                if (m->control_pressed) { return consSeq; }
+                
+                vector<int> counts; counts.resize(5, 0); //A,T,G,C,Gap
+                int numDots = 0;
+                
+                for (int i = 0; i < seqs.size(); i++) {
+                    
+                    if (seqs[i][j] == '.') { numDots++; }
+                    
+                    char base = toupper(seqs[i][j]);
+                    if (base == 'A') { counts[0]++; }
+                    else if (base == 'T') { counts[1]++; }
+                    else if (base == 'G') { counts[2]++; }
+                    else if (base == 'C') { counts[3]++; }
+                    else { counts[4]++; }
+                }
+                
+                char conBase = '.';
+                if (numDots != seqs.size()) { conBase = getBase(counts, seqs.size()); }
+                
+                consSeq += conBase;
+                
+                percentages[0][j] = counts[0] / (float) seqs.size();
+                percentages[1][j] = counts[1] / (float) seqs.size();
+                percentages[2][j] = counts[2] / (float) seqs.size();
+                percentages[3][j] = counts[3] / (float) seqs.size();
+                percentages[4][j] = counts[4] / (float) seqs.size();
+                
+            }
                 }
-               
+        
+        
+        
                 for (int j = 0; j < seqLength; j++) { 
-                       outSummary << (binNumber + 1) << '\t' << (j+1) << '\t' << percentages[0][j] << '\t'<< percentages[1][j] << '\t'<< percentages[2][j] << '\t' << percentages[3][j] << '\t' << percentages[4][j] << '\t' << seqs.size() << '\t' << consSeq[j] << endl;
+                       outSummary << (binNumber + 1) << '\t' << (j+1) << '\t' << percentages[0][j] << '\t'<< percentages[1][j] << '\t'<< percentages[2][j] << '\t' << percentages[3][j] << '\t' << percentages[4][j] << '\t' << totalSize << '\t' << consSeq[j] << endl;
                 }
                 
                 return consSeq;
@@ -646,7 +698,8 @@ int ConsensusSeqsCommand::readFasta(){
                 
                 ifstream in;
                 m->openInputFile(fastafile, in);
-               
+               seqLength = 0;
+        
                 while (!in.eof()) {
                         
                         if (m->control_pressed) { break; }
@@ -657,7 +710,10 @@ int ConsensusSeqsCommand::readFasta(){
                         if (name != "") {
                                 fastaMap[name] = seq.getAligned();
                                 nameMap[name] = name; //set nameMap incase no names file
-                               nameFileMap[name] = name;
+                               nameFileMap[name] = 1;
+                
+                if (seqLength == 0) { seqLength = seq.getAligned().length(); }
+                               else if (seqLength != seq.getAligned().length()) { m->mothurOut("[ERROR]: sequence are not the same length, please correct."); m->mothurOutEndLine(); m->control_pressed = true; break; }
                         }
                 }
                 
@@ -688,7 +744,7 @@ int ConsensusSeqsCommand::readNames(){
               
               it = nameMap.find(thisname);
                          if (it != nameMap.end()) { //then this sequence was in the fastafile
-                                nameFileMap[thisname] = repnames;      //for later when outputting the new namesFile if the list file is unique
+                                nameFileMap[thisname] = m->getNumNames(repnames);      //for later when outputting the new namesFile if the list file is unique
                   
                                  vector<string> splitRepNames;
                                  m->splitAtComma(repnames, splitRepNames);
diff --git a/consensusseqscommand.h b/consensusseqscommand.h

index 1459b43d25bd54b525459721b5de55a5a1d6d9f6..e0c97150bfb0bc3e5baafcea911c4ba1c6c60dcb 100644 (file)
--- a/consensusseqscommand.h
+++ b/consensusseqscommand.h
@@ -13,6 +13,7 @@
  
  #include "command.hpp"
  #include "listvector.hpp"
+#include "counttable.h"
  
  class ConsensusSeqsCommand : public Command {
  public:
@@ -34,19 +35,20 @@ public:
         
  private:
         
+    CountTable ct;
         bool abort, allLines;
-       string fastafile, listfile, namefile, label, outputDir;
+       string fastafile, listfile, namefile, countfile, label, outputDir;
         set<string> labels;
         vector<string> outputNames;
         map<string, string> fastaMap;
         map<string, string> nameMap;
-       map<string, string> nameFileMap;
-       int cutoff;
+       map<string, int> nameFileMap;
+       int cutoff, seqLength;
         
         int readFasta();
         int readNames();
         int processList(ListVector*&);
-       string getConsSeq(string, ofstream&, string&, int);
+       string getConsSeq(string, ofstream&, int);
         char getBase(vector<int>, int);
  };
  
diff --git a/countgroupscommand.cpp b/countgroupscommand.cpp

index ccf8988e39308fd2816f6eb0a6ff0454ab944d19..716dc90693969140d3af32403d3818c04224d7f9 100644 (file)
--- a/countgroupscommand.cpp
+++ b/countgroupscommand.cpp
@@ -16,6 +16,7 @@ vector<string> CountGroupsCommand::setParameters(){
         try {
                 CommandParameter pshared("shared", "InputTypes", "", "", "sharedGroup", "sharedGroup", "none",false,false); parameters.push_back(pshared);
                 CommandParameter pgroup("group", "InputTypes", "", "", "sharedGroup", "sharedGroup", "none",false,false); parameters.push_back(pgroup);
+        CommandParameter pcount("count", "InputTypes", "", "", "sharedGroup", "sharedGroup", "none",false,false); parameters.push_back(pcount);
                 CommandParameter paccnos("accnos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(paccnos);
                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
@@ -34,7 +35,7 @@ vector<string> CountGroupsCommand::setParameters(){
  string CountGroupsCommand::getHelpString(){    
         try {
                 string helpString = "";
-               helpString += "The count.groups command counts sequences from a specific group or set of groups from the following file types: group or shared file.\n";
+               helpString += "The count.groups command counts sequences from a specific group or set of groups from the following file types: group, count or shared file.\n";
                 helpString += "The count.groups command parameters are accnos, group, shared and groups. You must provide a group or shared file.\n";
                 helpString += "The accnos parameter allows you to provide a file containing the list of groups.\n";
                 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like.  You can separate group names with dashes.\n";
@@ -114,6 +115,14 @@ CountGroupsCommand::CountGroupsCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["shared"] = inputDir + it->second;           }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
                         
                         
@@ -138,9 +147,23 @@ CountGroupsCommand::CountGroupsCommand(string option)  {
                         groupfile = validParameter.validFile(parameters, "group", true);
                         if (groupfile == "not open") { groupfile = ""; abort = true; }
                         else if (groupfile == "not found") {    groupfile = ""; }
-                       else { m->setGroupFile(groupfile); }    
+                       else { m->setGroupFile(groupfile); }
+            
+            countfile = validParameter.validFile(parameters, "count", true);
+            if (countfile == "not open") { countfile = ""; abort = true; }
+            else if (countfile == "not found") { countfile = "";  }    
+            else { 
+                m->setCountTableFile(countfile); 
+                CountTable ct;
+                if (!ct.testGroups(countfile)) { m->mothurOut("[ERROR]: Your count file does not have any group information, aborting."); m->mothurOutEndLine(); abort=true; }
+            }
+            
+            if ((groupfile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
+            }
+
                         
-                       if ((sharedfile == "") && (groupfile == "")) { 
+                       if ((sharedfile == "") && (groupfile == "") && (countfile == "")) { 
                                 //give priority to shared, then group
                                 sharedfile = m->getSharedFile(); 
                                 if (sharedfile != "") {  m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); }
@@ -148,7 +171,11 @@ CountGroupsCommand::CountGroupsCommand(string option)  {
                                         groupfile = m->getGroupFile(); 
                                         if (groupfile != "") { m->mothurOut("Using " + groupfile + " as input file for the group parameter."); m->mothurOutEndLine(); }
                                         else { 
-                                               m->mothurOut("You have no current groupfile or sharedfile and one is required."); m->mothurOutEndLine(); abort = true;
+                                               countfile = m->getCountTableFile(); 
+                        if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
+                        else { 
+                            m->mothurOut("You have no current groupfile, countfile or sharedfile and one is required."); m->mothurOutEndLine(); abort = true;
+                        }
                                         }
                                 }
                         }
@@ -182,9 +209,36 @@ int CountGroupsCommand::execute(){
                         vector<string> nameGroups = groupMap.getNamesOfGroups();
                         util.setGroups(Groups, nameGroups);
                         
+            int total = 0;
+                       for (int i = 0; i < Groups.size(); i++) {
+                int num = groupMap.getNumSeqs(Groups[i]);
+                total += num;
+                               m->mothurOut(Groups[i] + " contains " + toString(num) + "."); m->mothurOutEndLine();
+                       }
+            
+            m->mothurOut("\nTotal seqs: " + toString(total) + "."); m->mothurOutEndLine();
+               }
+        
+        if (m->control_pressed) { return 0; }
+        
+        if (countfile != "") {
+                       CountTable ct;
+                       ct.readTable(countfile);
+            
+                       //make sure groups are valid
+                       //takes care of user setting groupNames that are invalid or setting groups=all
+                       SharedUtil util;
+                       vector<string> nameGroups = ct.getNamesOfGroups();
+                       util.setGroups(Groups, nameGroups);
+                       
+            int total = 0;
                         for (int i = 0; i < Groups.size(); i++) {
-                               m->mothurOut(Groups[i] + " contains " + toString(groupMap.getNumSeqs(Groups[i])) + "."); m->mothurOutEndLine();
+                int num = ct.getGroupCount(Groups[i]);
+                total += num;
+                               m->mothurOut(Groups[i] + " contains " + toString(num) + "."); m->mothurOutEndLine();
                         }
+            
+            m->mothurOut("\nTotal seqs: " + toString(total) + "."); m->mothurOutEndLine();
                 }
                 
                 if (m->control_pressed) { return 0; }
@@ -193,10 +247,15 @@ int CountGroupsCommand::execute(){
                         InputData input(sharedfile, "sharedfile");
                         vector<SharedRAbundVector*> lookup = input.getSharedRAbundVectors();
                         
+            int total = 0;
                         for (int i = 0; i < lookup.size(); i++) {
-                               m->mothurOut(lookup[i]->getGroup() + " contains " + toString(lookup[i]->getNumSeqs()) + "."); m->mothurOutEndLine();
+                int num = lookup[i]->getNumSeqs();
+                total += num;
+                               m->mothurOut(lookup[i]->getGroup() + " contains " + toString(num) + "."); m->mothurOutEndLine();
                                 delete lookup[i];
-                       }                       
+                       }
+                       
+            m->mothurOut("\nTotal seqs: " + toString(total) + "."); m->mothurOutEndLine();
                 }
                                 
                 return 0;               
diff --git a/countgroupscommand.h b/countgroupscommand.h

index dd0e0a23eb3ccb9a5f4072aa9bb076228f2fb8b2..d27a7f8f94ce1577421061e862696de4a4856a2a 100644 (file)
--- a/countgroupscommand.h
+++ b/countgroupscommand.h
@@ -33,7 +33,7 @@ public:
         
         
  private:
-       string sharedfile, groupfile, outputDir, groups, accnosfile;
+       string sharedfile, groupfile, countfile, outputDir, groups, accnosfile;
         bool abort;
         vector<string> Groups;
  };
diff --git a/countseqscommand.cpp b/countseqscommand.cpp

index 210dd9641a1736f8b0ea509f96b210dbbe3bd2e1..fa6fd4f09a2c0c1703a4f12b65943d6df1ac06a2 100644 (file)
--- a/countseqscommand.cpp
+++ b/countseqscommand.cpp
@@ -10,6 +10,7 @@
  #include "countseqscommand.h"
  #include "groupmap.h"
  #include "sharedutilities.h"
+#include "counttable.h"
  
  //**********************************************************************************************************************
  vector<string> CountSeqsCommand::setParameters(){      
@@ -34,7 +35,7 @@ vector<string> CountSeqsCommand::setParameters(){
  string CountSeqsCommand::getHelpString(){      
         try {
                 string helpString = "";
-               helpString += "The count.seqs aka. make.table command reads a name file and outputs a .count.table file.  You may also provide a group file to get the counts broken down by group.\n";
+               helpString += "The count.seqs aka. make.table command reads a name file and outputs a .count_table file.  You may also provide a group file to get the counts broken down by group.\n";
                 helpString += "The groups parameter allows you to indicate which groups you want to include in the counts, by default all groups in your groupfile are used.\n";
          helpString += "The large parameter indicates the name and group files are too large to fit in RAM.\n";
                 helpString += "When you use the groups parameter and a sequence does not represent any sequences from the groups you specify it is not included in the .count.summary file.\n";
@@ -58,7 +59,7 @@ string CountSeqsCommand::getOutputFileNameTag(string type, string inputName=""){
          it = outputTypes.find(type);
          if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
          else {
-            if (type == "counttable") {  outputFileName =  "count.table"; }
+            if (type == "counttable") {  outputFileName =  "count_table"; }
              else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
          }
          return outputFileName;
@@ -175,7 +176,7 @@ int CountSeqsCommand::execute(){
          int total = 0;
          if (!large) { total = processSmall(outputFileName); }
          else { total = processLarge(outputFileName);  }
-                               
+        
                 if (m->control_pressed) { m->mothurRemove(outputFileName); return 0; }
                 
          //set rabund file as new current rabundfile
@@ -450,6 +451,26 @@ map<int, string> CountSeqsCommand::processNameFile(string name) {
                 in.close();
          out.close();
                 
+        if (rest != "") {
+            vector<string> pieces = m->splitWhiteSpace(rest);
+            
+            for (int i = 0; i < pieces.size(); i++) {
+                if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
+                else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
+                
+                if (pairDone) { 
+                    //parse names into vector
+                    vector<string> theseNames;
+                    m->splitAtComma(secondCol, theseNames);
+                    for (int i = 0; i < theseNames.size(); i++) {  out << theseNames[i] << '\t' << count << endl;  }
+                    indexToNames[count] = firstCol;
+                    pairDone = false; 
+                    count++;
+                }
+            }
+
+        }
+        
          return indexToNames;
      }
         catch(exception& e) {
@@ -502,6 +523,26 @@ map<int, string> CountSeqsCommand::getGroupNames(string filename, set<string>& n
                 }
                 in.close();
          out.close();
+        
+        if (rest != "") {
+            vector<string> pieces = m->splitWhiteSpace(rest);
+            
+            for (int i = 0; i < pieces.size(); i++) {
+                if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
+                else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
+                
+                if (pairDone) { 
+                    it = groupIndex.find(secondCol);
+                    if (it == groupIndex.end()) { //add group, assigning the group and number so we can use vectors above
+                        groupIndex[secondCol] = count;
+                        count++;
+                    }
+                    out << firstCol << '\t' << groupIndex[secondCol] << endl; 
+                    namesOfGroups.insert(secondCol);
+                    pairDone = false; 
+                }
+            }
+        }
                 
          for (it = groupIndex.begin(); it != groupIndex.end(); it++) {  indexToGroups[it->second] = it->first;  }
          
diff --git a/counttable.cpp b/counttable.cpp

index a664228b17c35fff4ce48ca0ea5255b5b99bb1f3..2ab0e345dbd110cc5c0402d0caa36becc1820bd2 100644 (file)
--- a/counttable.cpp
+++ b/counttable.cpp
@@ -8,7 +8,199 @@
  
  #include "counttable.h"
  
+/************************************************************/
+//Builds the count table from in-memory data, giving every sequence an abundance of 1.
+//  n  - names of the sequences to add
+//  g  - maps a sequence name to the name of its group
+//  gs - names of the groups that become the table's columns
+//Always returns 0; problems are reported through m->mothurOut.
+int CountTable::createTable(set<string>& n, map<string, string>& g, set<string>& gs) {
+    try {
+        //start from an empty table. totals must be emptied along with the maps,
+        //because rows are addressed by indexes that restart at 0 below.
+        groups.clear();
+        totalGroups.clear();
+        indexGroupMap.clear();
+        indexNameMap.clear();
+        counts.clear();
+        totals.clear();
+        
+        for (set<string>::iterator it = gs.begin(); it != gs.end(); it++) { groups.push_back(*it);  hasGroups = true; }
+        int numGroups = groups.size();
+        totalGroups.resize(numGroups, 0);
+        
+        //sort groups to keep consistent with how we store the groups in groupmap
+        sort(groups.begin(), groups.end());
+        for (int i = 0; i < groups.size(); i++) {  indexGroupMap[groups[i]] = i; }
+        m->setAllGroups(groups);
+        
+        uniques = 0;
+        total = 0;
+        for (set<string>::iterator it = n.begin(); it != n.end(); it++) {
+            
+            if (m->control_pressed) { break; }
+            
+            string seqName = *it;
+            
+            vector<int> groupCounts; groupCounts.resize(numGroups, 0);
+            map<string, string>::iterator itGroup = g.find(seqName);
+            
+            if (itGroup == g.end()) {
+                m->mothurOut("[ERROR]: Your group file does not contain " + seqName + ". Please correct."); m->mothurOutEndLine();
+            }else {
+                //look the group up with find() rather than operator[]: operator[] would silently
+                //insert an unknown group as index 0 and credit the sequence to the wrong column
+                //(or write past the end of groupCounts when there are no groups at all).
+                map<string, int>::iterator itIndex = indexGroupMap.find(itGroup->second);
+                if (itIndex == indexGroupMap.end()) {
+                    m->mothurOut("[ERROR]: " + itGroup->second + " is not in your count table. Please correct."); m->mothurOutEndLine();
+                }else {
+                    groupCounts[itIndex->second] = 1;
+                    totalGroups[itIndex->second]++;
+                }
+            }
+            
+            map<string, int>::iterator it2 = indexNameMap.find(seqName);
+            if (it2 == indexNameMap.end()) {
+                if (hasGroups) {  counts.push_back(groupCounts);  }
+                indexNameMap[seqName] = uniques;
+                totals.push_back(1);
+                total++;
+                uniques++;
+            }
+        }
+        
+        //drop groups that ended up with no sequences.
+        //NOTE(review): the i-- presumably compensates for removeGroup erasing entry i from groups/totalGroups - confirm.
+        if (hasGroups) {
+            for (int i = 0; i < totalGroups.size(); i++) {
+                if (totalGroups[i] == 0) { m->mothurOut("\nRemoving group: " + groups[i] + " because all sequences have been removed.\n"); removeGroup(groups[i]); i--; }
+            }
+        }
  
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "CountTable", "createTable");
+        exit(1);
+    }
+}
+/************************************************************/
+//Reports whether a count file carries per-group columns, by inspecting only its header line.
+//The header written by this class is "Representative_Sequence", "total", then one column per group,
+//so more than two header columns means group information is present.
+//Side effects: (re)acquires the MothurOut instance, sets hasGroups and resets total to 0.
+bool CountTable::testGroups(string file) {
+    try {
+        m = MothurOut::getInstance(); hasGroups = false; total = 0;
+        ifstream in;
+        m->openInputFile(file, in);
+    
+        string headers = m->getline(in); m->gobble(in);
+        in.close(); //only the header is needed
+        
+        vector<string> columnHeaders = m->splitWhiteSpace(headers);
+        if (columnHeaders.size() > 2) { hasGroups = true;   }
+        return hasGroups;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "CountTable", "testGroups");
+        exit(1);
+    }
+}
+/************************************************************/
+//Builds the count table from a name file, optionally breaking the counts down by group.
+//  namefile    - read as pairs of columns: a representative name, then the comma-separated names it stands for. Must not be blank.
+//  groupfile   - when not blank, every name is looked up in it and tallied under its group.
+//  createGroup - when true and no groupfile is given, all names are tallied under a single group named "Group1".
+//Always returns 0; problems are reported through m->mothurOut and by setting m->control_pressed.
+int CountTable::createTable(string namefile, string groupfile, bool createGroup) {
+    try {
+        //start from an empty table. totals must be emptied along with the maps,
+        //because rows are addressed by indexes that restart at 0 below.
+        groups.clear();
+        totalGroups.clear();
+        indexGroupMap.clear();
+        indexNameMap.clear();
+        counts.clear();
+        totals.clear();
+        uniques = 0;
+        total = 0;
+        
+        //nothing can be read from a blank file name, so stop here instead of trying to open it
+        if (namefile == "") { m->mothurOut("[ERROR]: namefile cannot be blank when creating a count table.\n"); m->control_pressed = true; return 0; }
+        
+        GroupMap* groupMap = 0; //stays null unless a group file is loaded
+        int numGroups = 0;
+        
+        if (groupfile != "") { 
+            hasGroups = true;
+            groupMap = new GroupMap(groupfile); groupMap->readMap();
+            numGroups = groupMap->getNumGroups();
+            groups = groupMap->getNamesOfGroups();
+            totalGroups.resize(numGroups, 0);
+        }else if(createGroup) {
+            hasGroups = true;
+            numGroups = 1;
+            groups.push_back("Group1");
+            totalGroups.resize(numGroups, 0);
+        }
+        //sort groups to keep consistent with how we store the groups in groupmap
+        sort(groups.begin(), groups.end());
+        for (int i = 0; i < groups.size(); i++) {  indexGroupMap[groups[i]] = i; }
+        m->setAllGroups(groups);
+        
+        bool error = false;
+        
+        //open input file
+        ifstream in;
+        m->openInputFile(namefile, in);
+        
+        while (!in.eof()) {
+            if (m->control_pressed) { break; }
+            
+            string firstCol, secondCol;
+            in >> firstCol; m->gobble(in); in >> secondCol; m->gobble(in);
+            
+            vector<string> names;
+            m->splitAtChar(secondCol, names, ',');
+            
+            map<string, int> groupCounts;
+            int thisTotal = 0;
+            if (groupfile != "") {
+                //set to 0
+                for (int i = 0; i < groups.size(); i++) { groupCounts[groups[i]] = 0; }
+                
+                //get counts for each of the users groups
+                for (int i = 0; i < names.size(); i++) {
+                    string group = groupMap->getGroup(names[i]);
+                    
+                    if (group == "not found") { m->mothurOut("[ERROR]: " + names[i] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); error=true; }
+                    else {
+                        map<string, int>::iterator it = groupCounts.find(group);
+                        
+                        //if not found, then this sequence is not from a group we care about
+                        if (it != groupCounts.end()) {
+                            it->second++;
+                            thisTotal++;
+                        }
+                    }
+                }
+            }else if (createGroup) {
+                //every name belongs to the single artificial group
+                groupCounts["Group1"] = names.size();
+                thisTotal = names.size();
+            }else { thisTotal = names.size();  }
+            
+            //copy this sequence's per-group counts into table column order and add them to the group totals
+            vector<int> thisGroupsCount; thisGroupsCount.resize(numGroups, 0);
+            for (int i = 0; i < numGroups; i++) {  
+                thisGroupsCount[i] = groupCounts[groups[i]]; 
+                totalGroups[i] += thisGroupsCount[i]; 
+            }
+            
+            map<string, int>::iterator it = indexNameMap.find(firstCol);
+            if (it == indexNameMap.end()) {
+                if (hasGroups) {  counts.push_back(thisGroupsCount);  }
+                indexNameMap[firstCol] = uniques;
+                totals.push_back(thisTotal);
+                total += thisTotal; //the member grand total; a local of the same name must not be declared in this function
+                uniques++;
+            }else {
+                error = true;
+                m->mothurOut("[ERROR]: Your count table contains more than 1 sequence named " + firstCol + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); 
+            }
+        }
+        in.close();
+        
+        if (error) { m->control_pressed = true; }
+        else { //check for zero groups
+            //NOTE(review): the i-- presumably compensates for removeGroup erasing entry i from groups/totalGroups - confirm.
+            if (hasGroups) {
+                for (int i = 0; i < totalGroups.size(); i++) {
+                    if (totalGroups[i] == 0) { m->mothurOut("\nRemoving group: " + groups[i] + " because all sequences have been removed.\n"); removeGroup(groups[i]); i--; }
+                }
+            }
+        }
+        delete groupMap; //deleting a null pointer is a no-op
+        
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "CountTable", "createTable");
+        exit(1);
+    }
+}
  /************************************************************/
  int CountTable::readTable(string file) {
      try {
@@ -64,6 +256,13 @@ int CountTable::readTable(string file) {
          in.close();
          
          if (error) { m->control_pressed = true; }
+        else { //check for zero groups
+            if (hasGroups) {
+                for (int i = 0; i < totalGroups.size(); i++) {
+                    if (totalGroups[i] == 0) { m->mothurOut("\nRemoving group: " + groups[i] + " because all sequences have been removed.\n"); removeGroup(groups[i]); i--; }
+                }
+            }
+        }
          
          return 0;
      }
@@ -73,6 +272,68 @@ int CountTable::readTable(string file) {
         }
  }
  /************************************************************/
+//Writes the whole table to the named file: a header row, then one tab-separated row per sequence
+//holding its name, its total and, when the table has group information, its count in every group.
+//Always returns 0.
+int CountTable::printTable(string file) {
+    try {
+        ofstream out;
+        m->openOutputFile(file, out); 
+        
+        const int numColumns = groups.size();
+        
+        //header row
+        out << "Representative_Sequence\ttotal\t";
+        for (int col = 0; col < numColumns; col++) { out << groups[col] << '\t'; }
+        out << endl;
+        
+        //one row per sequence, in the iteration order of indexNameMap
+        map<string, int>::iterator row = indexNameMap.begin();
+        while (row != indexNameMap.end()) {
+            const int index = row->second;
+            
+            out << row->first << '\t' << totals[index] << '\t';
+            if (hasGroups) {
+                for (int col = 0; col < numColumns; col++) { out << counts[index][col] << '\t'; }
+            }
+            out << endl;
+            
+            ++row;
+        }
+        
+        out.close();
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "CountTable", "printTable");
+        exit(1);
+    }
+}
+/************************************************************/
+//Writes the table's header row to an already open stream: the two fixed column titles
+//followed by every group name, each terminated by a tab, then a newline.
+//Always returns 0.
+int CountTable::printHeaders(ofstream& out) {
+    try {
+        out << "Representative_Sequence\ttotal\t";
+        
+        const int numColumns = groups.size();
+        int col = 0;
+        while (col < numColumns) {
+            out << groups[col] << '\t';
+            ++col;
+        }
+        out << endl;
+        
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "CountTable", "printHeaders");
+        exit(1);
+    }
+}
+/************************************************************/
+//Writes the row for a single sequence to an already open stream: its name, its total and,
+//when the table has group information, its count in every group (tab-separated).
+//An unknown name is reported and m->control_pressed is set; nothing is written in that case.
+//Always returns 0.
+int CountTable::printSeq(ofstream& out, string seqName) {
+    try {
+        map<string, int>::iterator found = indexNameMap.find(seqName);
+        
+        //unknown sequence: report it and leave the stream untouched
+        if (found == indexNameMap.end()) {
+            m->mothurOut("[ERROR]: " + seqName + " is not in your count table. Please correct.\n"); m->control_pressed = true;
+            return 0;
+        }
+        
+        const int row = found->second;
+        out << found->first << '\t' << totals[row] << '\t';
+        if (hasGroups) {
+            const int numColumns = groups.size();
+            for (int col = 0; col < numColumns; col++) { out << counts[row][col] << '\t'; }
+        }
+        out << endl;
+        
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "CountTable", "printSeq");
+        exit(1);
+    }
+}
+/************************************************************/
  //group counts for a seq
  vector<int> CountTable::getGroupCounts(string seqName) {
      try {
@@ -138,6 +399,179 @@ int CountTable::getGroupCount(string seqName, string groupName) {
                 exit(1);
         }
  }
+/************************************************************/
+//set the number of sequences for the seq for the group
+int CountTable::setAbund(string seqName, string groupName, int num) {
+    try {
+        if (hasGroups) {
+            map<string, int>::iterator it = indexGroupMap.find(groupName);
+            if (it == indexGroupMap.end()) {
+                m->mothurOut("[ERROR]: " + groupName + " is not in your count table. Please correct.\n"); m->control_pressed = true;
+            }else { 
+                map<string, int>::iterator it2 = indexNameMap.find(seqName);
+                if (it2 == indexNameMap.end()) {
+                    m->mothurOut("[ERROR]: " + seqName + " is not in your count table. Please correct.\n"); m->control_pressed = true;
+                }else { 
+                    int oldCount = counts[it2->second][it->second];
+                    counts[it2->second][it->second] = num;
+                    totalGroups[it->second] += (num - oldCount);
+                    total += (num - oldCount);
+                    totals[it2->second] += (num - oldCount);
+                }
+            }
+        }else{  m->mothurOut("[ERROR]: Your count table does not have group info. Please correct.\n");  m->control_pressed = true; }
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "CountTable", "set");
+               exit(1);
+       }
+}
+/************************************************************/
+//add group
+int CountTable::addGroup(string groupName) {
+    try {        
+        bool sanity = m->inUsersGroups(groupName, groups);
+        if (sanity) { m->mothurOut("[ERROR]: " + groupName + " is already in the count table, cannot add again.\n"); m->control_pressed = true;  return 0; }
+        
+        groups.push_back(groupName);
+        if (!hasGroups) { counts.resize(uniques);  }
+        
+        for (int i = 0; i < counts.size(); i++) { counts[i].push_back(0); }
+        totalGroups.push_back(0);
+        indexGroupMap[groupName] = groups.size()-1;
+        map<string, int> originalGroupMap = indexGroupMap;
+        
+        //important to play well with others, :)
+        sort(groups.begin(), groups.end());
+        
+        //fix indexGroupMap && totalGroups
+        vector<int> newTotals; newTotals.resize(groups.size(), 0);
+        for (int i = 0; i < groups.size(); i++) {  
+            indexGroupMap[groups[i]] = i;  
+            //find original spot of group[i]
+            int index = originalGroupMap[groups[i]];
+            newTotals[i] = totalGroups[index];
+        }
+        totalGroups = newTotals;
+        
+        //fix counts vectors
+        for (int i = 0; i < counts.size(); i++) {
+            vector<int> newCounts; newCounts.resize(groups.size(), 0);
+            for (int j = 0; j < groups.size(); j++) {  
+                //find original spot of group[i]
+                int index = originalGroupMap[groups[j]];
+                newCounts[j] = counts[i][index];
+            }
+            counts[i] = newCounts;
+        }
+        hasGroups = true;
+        m->setAllGroups(groups);
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "CountTable", "addGroup");
+               exit(1);
+       }
+}
+/************************************************************/
+//remove group
+int CountTable::removeGroup(string groupName) {
+    try {        
+        if (hasGroups) {
+            //save for later in case removing a group means we need to remove a seq.
+            map<int, string> reverse;
+            for (map<string, int>::iterator it = indexNameMap.begin(); it !=indexNameMap.end(); it++) { reverse[it->second] = it->first;  }
+            
+            map<string, int>::iterator it = indexGroupMap.find(groupName);
+            if (it == indexGroupMap.end()) {
+                m->mothurOut("[ERROR]: " + groupName + " is not in your count table. Please correct.\n"); m->control_pressed = true;
+            }else { 
+                int indexOfGroupToRemove = it->second;
+                map<string, int> currentGroupIndex = indexGroupMap;
+                vector<string> newGroups;
+                for (int i = 0; i < groups.size(); i++) {
+                    if (groups[i] != groupName) { 
+                        newGroups.push_back(groups[i]);
+                        indexGroupMap[groups[i]] = newGroups.size()-1;
+                    }
+                }
+                indexGroupMap.erase(groupName);
+                groups = newGroups;
+                totalGroups.erase(totalGroups.begin()+indexOfGroupToRemove);
+                
+                int thisIndex = 0;
+                map<string, int> newIndexNameMap;
+                for (int i = 0; i < counts.size(); i++) {
+                    int num = counts[i][indexOfGroupToRemove];
+                    counts[i].erase(counts[i].begin()+indexOfGroupToRemove);
+                    totals[i] -= num;
+                    total -= num;
+                    if (totals[i] == 0) { //your sequences are only from the group we want to remove, then remove you.
+                        counts.erase(counts.begin()+i);
+                        totals.erase(totals.begin()+i);
+                        uniques--;
+                        i--;
+                    }
+                    newIndexNameMap[reverse[thisIndex]] = i;
+                    thisIndex++;
+                }
+                indexNameMap = newIndexNameMap;
+                
+                if (groups.size() == 0) { hasGroups = false; }
+            }
+        }else { m->mothurOut("[ERROR]: your count table does not contain group information, can not remove group " + groupName + ".\n"); m->control_pressed = true; }
+    
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "CountTable", "removeGroup");
+               exit(1);
+       }
+}
+/************************************************************/
+//vector of groups for the seq
+vector<string> CountTable::getGroups(string seqName) {
+    try {
+        vector<string> thisGroups;
+        if (hasGroups) {
+            vector<int> thisCounts = getGroupCounts(seqName);
+            for (int i = 0; i < thisCounts.size(); i++) {  
+                if (thisCounts[i] != 0) {  thisGroups.push_back(groups[i]); }
+            } 
+        }else{  m->mothurOut("[ERROR]: Your count table does not have group info. Please correct.\n");  m->control_pressed = true; }
+        
+        return thisGroups;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "CountTable", "getGroups");
+               exit(1);
+       }
+}
+/************************************************************/
+//total number of seqs represented by seq
+int CountTable::renameSeq(string oldSeqName, string newSeqName) {
+    try {
+        
+        map<string, int>::iterator it = indexNameMap.find(oldSeqName);
+        if (it == indexNameMap.end()) {
+            m->mothurOut("[ERROR]: " + oldSeqName + " is not in your count table. Please correct.\n"); m->control_pressed = true;
+        }else {  
+            int index = it->second;
+            indexNameMap.erase(it);
+            indexNameMap[newSeqName] = index;
+        }
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "CountTable", "renameSeq");
+               exit(1);
+       }
+}
+
  /************************************************************/
  //total number of seqs represented by seq
  int CountTable::getNumSeqs(string seqName) {
@@ -174,6 +608,100 @@ int CountTable::get(string seqName) {
                 exit(1);
         }
  }
+/************************************************************/
+//add seqeunce without group info
+int CountTable::push_back(string seqName) {
+    try {
+        map<string, int>::iterator it = indexNameMap.find(seqName);
+        if (it == indexNameMap.end()) {
+            if (hasGroups) {  m->mothurOut("[ERROR]: Your count table has groups and I have no group information for " + seqName + "."); m->mothurOutEndLine(); m->control_pressed = true;  }
+            indexNameMap[seqName] = uniques;
+            totals.push_back(1);
+            total++;
+            uniques++;
+        }else {
+            m->mothurOut("[ERROR]: Your count table contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); m->control_pressed = true;
+        }
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "CountTable", "push_back");
+               exit(1);
+       }
+}
+/************************************************************/
+//remove sequence
+int CountTable::remove(string seqName) {
+    try {
+        map<string, int>::iterator it = indexNameMap.find(seqName);
+        if (it != indexNameMap.end()) {
+            uniques--;
+            if (hasGroups){ //remove this sequences counts from group totals
+                for (int i = 0; i < totalGroups.size(); i++) {  totalGroups[i] -= counts[it->second][i];  counts[it->second][i] = 0; }
+            }
+            int thisTotal = totals[it->second]; totals[it->second] = 0;
+            total -= thisTotal;
+            indexNameMap.erase(it);
+        }else {
+            m->mothurOut("[ERROR]: Your count table contains does not include " + seqName + ", cannot remove."); m->mothurOutEndLine(); m->control_pressed = true;
+        }
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "CountTable", "push_back");
+               exit(1);
+       }
+}
+/************************************************************/
+//add seqeunce without group info
+int CountTable::push_back(string seqName, int thisTotal) {
+    try {
+        map<string, int>::iterator it = indexNameMap.find(seqName);
+        if (it == indexNameMap.end()) {
+            if (hasGroups) {  m->mothurOut("[ERROR]: Your count table has groups and I have no group information for " + seqName + "."); m->mothurOutEndLine(); m->control_pressed = true;  }
+            indexNameMap[seqName] = uniques;
+            totals.push_back(thisTotal);
+            total+=thisTotal;
+            uniques++;
+        }else {
+            m->mothurOut("[ERROR]: Your count table contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); m->control_pressed = true;
+        }
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "CountTable", "push_back");
+               exit(1);
+       }
+}
+/************************************************************/
+//add sequence with group info
+int CountTable::push_back(string seqName, vector<int> groupCounts) {
+    try {
+        map<string, int>::iterator it = indexNameMap.find(seqName);
+        if (it == indexNameMap.end()) {
+            if ((hasGroups) && (groupCounts.size() != getNumGroups())) {  m->mothurOut("[ERROR]: Your count table has a " + toString(getNumGroups()) + " groups and " + seqName + " has " + toString(groupCounts.size()) + ", please correct."); m->mothurOutEndLine(); m->control_pressed = true;  }
+            int thisTotal = 0;
+            for (int i = 0; i < getNumGroups(); i++) {   totalGroups[i] += groupCounts[i];  thisTotal += groupCounts[i]; }
+            if (hasGroups) {  counts.push_back(groupCounts);  }
+            indexNameMap[seqName] = uniques;
+            totals.push_back(thisTotal);
+            total+= thisTotal;
+            uniques++;
+        }else {
+            m->mothurOut("[ERROR]: Your count table contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); m->control_pressed = true;
+        }
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "CountTable", "push_back");
+               exit(1);
+       }
+}
+
  /************************************************************/
  //create ListVector from uniques
  ListVector CountTable::getListVector() {
@@ -208,7 +736,46 @@ vector<string> CountTable::getNamesOfSeqs() {
         }
  }
  /************************************************************/
-//returns names of seqs
+//returns the names of all unique sequences in file mapped to their seqCounts
+map<string, int> CountTable::getNameMap() {
+    try {
+        map<string, int> names;
+        for (map<string, int>::iterator it = indexNameMap.begin(); it != indexNameMap.end(); it++) {
+            names[it->first] = totals[it->second];
+        }
+        
+        return names;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "CountTable", "getNameMap");
+               exit(1);
+       }
+}
+/************************************************************/
+//returns the names of all unique sequences in file
+vector<string> CountTable::getNamesOfSeqs(string group) {
+    try {
+        vector<string> names;
+        if (hasGroups) {
+            map<string, int>::iterator it = indexGroupMap.find(group);
+            if (it == indexGroupMap.end()) {
+                m->mothurOut("[ERROR]: " + group + " is not in your count table. Please correct.\n"); m->control_pressed = true;
+            }else { 
+                for (map<string, int>::iterator it2 = indexNameMap.begin(); it2 != indexNameMap.end(); it2++) {
+                    if (counts[it2->second][it->second] != 0) {  names.push_back(it2->first); }
+                }
+            }
+        }else{  m->mothurOut("[ERROR]: Your count table does not have group info. Please correct.\n");  m->control_pressed = true; }
+        
+        return names;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "CountTable", "getNamesOfSeqs");
+               exit(1);
+       }
+}
+/************************************************************/
+//merges counts of seq1 and seq2, saving in seq1
  int CountTable::mergeCounts(string seq1, string seq2) {
      try {
          map<string, int>::iterator it = indexNameMap.find(seq1);
@@ -220,17 +787,12 @@ int CountTable::mergeCounts(string seq1, string seq2) {
                  m->mothurOut("[ERROR]: " + seq2 + " is not in your count table. Please correct.\n"); m->control_pressed = true;
              }else { 
                  //merge data
-                for (int i = 0; i < groups.size(); i++) {
-                    counts[it->second][i] += counts[it2->second][i];
-                    counts[it2->second][i] = 0;
-                }
+                for (int i = 0; i < groups.size(); i++) { counts[it->second][i] += counts[it2->second][i]; }
                  totals[it->second] += totals[it2->second];
-                totals[it2->second] = 0;
                  uniques--;
                  indexNameMap.erase(it2); 
              }
          }
-        
          return 0;
      }
         catch(exception& e) {
@@ -238,6 +800,25 @@ int CountTable::mergeCounts(string seq1, string seq2) {
                 exit(1);
         }
  }
+/************************************************************/
+int CountTable::copy(CountTable* ct) {
+    try {
+        vector<string> thisGroups = ct->getNamesOfGroups();
+        for (int i = 0; i < thisGroups.size(); i++) { addGroup(thisGroups[i]); }
+        vector<string> names = ct->getNamesOfSeqs();
+                                                               
+        for (int i = 0; i < names.size(); i++) {
+            vector<int> thisCounts = ct->getGroupCounts(names[i]);
+            push_back(names[i], thisCounts);
+        }
+                                                               
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "CountTable", "copy");
+               exit(1);
+       }
+}
  
  /************************************************************/
  
diff --git a/counttable.h b/counttable.h

index 8baff3080a5f5124214e0b6b6834a899b356f0f9..34c941bece659b832afd9f2ff8866a16d37c14f1 100644 (file)
--- a/counttable.h
+++ b/counttable.h
@@ -38,32 +38,55 @@
  
  #include "mothurout.h"
  #include "listvector.hpp"
+#include "groupmap.h"
  
  class CountTable {
      
      public:
      
-        CountTable() { m = MothurOut::getInstance(); hasGroups = false; total = 0; }
+        CountTable() { m = MothurOut::getInstance(); hasGroups = false; total = 0; uniques = 0; }
          ~CountTable() {}
      
-        int readTable(string);
+        //reads and creates smart enough to eliminate groups with zero counts 
+        int createTable(set<string>&, map<string, string>&, set<string>&); //seqNames, seqName->group, groupNames 
+        int createTable(string, string, bool); //namefile, groupfile, createGroup
+        int readTable(string); 
+    
+        int printTable(string);
+        int printHeaders(ofstream&);
+        int printSeq(ofstream&, string);
+        bool testGroups(string file); //used to check if file has group data without reading it.
+        int copy(CountTable*);
      
          bool hasGroupInfo() { return hasGroups; }
          int getNumGroups() { return groups.size(); }
          vector<string> getNamesOfGroups() {  return groups;   }  //returns group names, if no group info vector is blank.
+        int addGroup(string);
+        int removeGroup(string);
+        
+        int renameSeq(string, string); //used to change name of sequence for use with trees
+        int setAbund(string, string, int); //set abundance number of seqs for that group for that seq
+        int push_back(string); //add a sequence 
+        int push_back(string, int); //add a sequence 
+        int push_back(string, vector<int>); //add a sequence with group info
+        int remove(string); //remove seq
+        int get(string); //returns unique sequence index for reading distance matrices like NameAssignment
+        int size() { return indexNameMap.size(); }
      
+        vector<string> getGroups(string); //returns vector of groups represented by this sequences
          vector<int> getGroupCounts(string);  //returns group counts for a seq passed in, if no group info is in file vector is blank. Order is the same as the groups returned by getGroups function.
          int getGroupCount(string, string); //returns number of seqs for that group for that seq
          int getGroupCount(string); // returns total seqs for that group
-        int getNumSeqs(string); //returns total seqs for that seq
+        int getNumSeqs(string); //returns total seqs for that seq, 0 if not found 
          int getNumSeqs() { return total; } //return total number of seqs
          int getNumUniqueSeqs() { return uniques; } //return number of unique/representative seqs
          int getGroupIndex(string); //returns index in getGroupCounts vector of specific group
+    
          vector<string> getNamesOfSeqs();
+        vector<string> getNamesOfSeqs(string);
          int mergeCounts(string, string); //combines counts for 2 seqs, saving under the first name passed in.
-        int get(string); //returns unique sequence index for reading distance matrices like NameAssignment
          ListVector getListVector();
-        int size() { return indexNameMap.size(); }
+        map<string, int> getNameMap();
      
      private:
          string filename;
diff --git a/decisiontree.cpp b/decisiontree.cpp

new file mode 100644 (file)

index 0000000..99853f3
--- /dev/null
+++ b/decisiontree.cpp
@@ -0,0 +1,399 @@
+//
+//  decisiontree.cpp
+//  Mothur
+//
+//  Created by Sarah Westcott on 10/1/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "decisiontree.hpp"
+
+DecisionTree::DecisionTree(vector< vector<int> > baseDataSet,
+             vector<int> globalDiscardedFeatureIndices,
+             OptimumFeatureSubsetSelector optimumFeatureSubsetSelector,
+             string treeSplitCriterion) : AbstractDecisionTree(baseDataSet,
+                       globalDiscardedFeatureIndices,
+                       optimumFeatureSubsetSelector,
+                       treeSplitCriterion), variableImportanceList(numFeatures, 0){
+    try {
+        m = MothurOut::getInstance();
+        createBootStrappedSamples();
+        buildDecisionTree();
+    }
+       catch(exception& e) {
+               m->errorOut(e, "DecisionTree", "DecisionTree");
+               exit(1);
+       } 
+}
+
+/***********************************************************************/
+
+int DecisionTree::calcTreeVariableImportanceAndError() {
+    try {
+        
+        int numCorrect;
+        double treeErrorRate;
+        calcTreeErrorRate(numCorrect, treeErrorRate);
+        
+        if (m->control_pressed) {return 0; }
+                
+        for (int i = 0; i < numFeatures; i++) {
+            if (m->control_pressed) {return 0; }
+            // NOTE: only shuffle the features, never shuffle the output vector
+            // so i = 0 and i will be alwaays <= (numFeatures - 1) as the index at numFeatures will denote
+            // the feature vector
+            vector< vector<int> > randomlySampledTestData = randomlyShuffleAttribute(bootstrappedTestSamples, i);
+            
+            int numCorrectAfterShuffle = 0;
+            for (int j = 0; j < randomlySampledTestData.size(); j++) {
+                if (m->control_pressed) {return 0; }
+                vector<int> shuffledSample = randomlySampledTestData[j];
+                int actualSampleOutputClass = shuffledSample[numFeatures];
+                int predictedSampleOutputClass = evaluateSample(shuffledSample);
+                if (actualSampleOutputClass == predictedSampleOutputClass) { numCorrectAfterShuffle++; }
+            }
+            variableImportanceList[i] += (numCorrect - numCorrectAfterShuffle);
+        }
+        
+        // TODO: do we need to save the variableRanks in the DecisionTree, do we need it later?
+        vector< vector<int> > variableRanks;
+        for (int i = 0; i < variableImportanceList.size(); i++) {
+            if (m->control_pressed) {return 0; }
+            if (variableImportanceList[i] > 0) {
+                // TODO: is there a way to optimize the follow line's code?
+                vector<int> variableRank(2, 0);
+                variableRank[0] = i; variableRank[1] = variableImportanceList[i];
+                variableRanks.push_back(variableRank);
+            }
+        }
+        VariableRankDescendingSorter variableRankDescendingSorter;
+        sort(variableRanks.begin(), variableRanks.end(), variableRankDescendingSorter);
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "DecisionTree", "calcTreeVariableImportanceAndError");
+               exit(1);
+       } 
+
+}
+/***********************************************************************/
+
+// TODO: there must be a way to optimize this function
+int DecisionTree::evaluateSample(vector<int> testSample) {
+    try {
+        RFTreeNode *node = rootNode;
+        while (true) {
+            if (m->control_pressed) {return 0; }
+            if (node->checkIsLeaf()) { return node->getOutputClass(); }
+            int sampleSplitFeatureValue = testSample[node->getSplitFeatureIndex()];
+            if (sampleSplitFeatureValue < node->getSplitFeatureValue()) { node = node->getLeftChildNode(); }
+            else { node = node->getRightChildNode(); } 
+        }
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "DecisionTree", "evaluateSample");
+               exit(1);
+       } 
+
+}
+/***********************************************************************/
+
+int DecisionTree::calcTreeErrorRate(int& numCorrect, double& treeErrorRate){
+    try {
+        numCorrect = 0;
+        for (int i = 0; i < bootstrappedTestSamples.size(); i++) {
+             if (m->control_pressed) {return 0; }
+            
+            vector<int> testSample = bootstrappedTestSamples[i];
+            int testSampleIndex = bootstrappedTestSampleIndices[i];
+            
+            int actualSampleOutputClass = testSample[numFeatures];
+            int predictedSampleOutputClass = evaluateSample(testSample);
+            
+            if (actualSampleOutputClass == predictedSampleOutputClass) { numCorrect++; } 
+            
+            outOfBagEstimates[testSampleIndex] = predictedSampleOutputClass;
+        }
+        
+        treeErrorRate = 1 - ((double)numCorrect / (double)bootstrappedTestSamples.size());   
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "DecisionTree", "calcTreeErrorRate");
+               exit(1);
+       } 
+}
+
+/***********************************************************************/
+
+// TODO: optimize the algo, instead of transposing two time, we can extarct the feature,
+// shuffle it and then re-insert in the original place, thus iproving runnting time
+//This function randomize abundances for a given OTU/feature.
+vector< vector<int> > DecisionTree::randomlyShuffleAttribute(vector< vector<int> > samples, int featureIndex) {
+    try {
+        // NOTE: we need (numFeatures + 1) featureVecotors, the last extra vector is actually outputVector
+        vector< vector<int> > shuffledSample = samples;
+        vector<int> featureVectors(samples.size(), 0);
+        
+        for (int j = 0; j < samples.size(); j++) {
+            if (m->control_pressed) { return shuffledSample; }
+            featureVectors[j] = samples[j][featureIndex];
+        }
+        
+        random_shuffle(featureVectors.begin(), featureVectors.end());
+
+        for (int j = 0; j < samples.size(); j++) {
+            if (m->control_pressed) {return shuffledSample; }
+            shuffledSample[j][featureIndex] = featureVectors[j];
+        }
+        
+        return shuffledSample;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "DecisionTree", "randomlyShuffleAttribute");
+               exit(1);
+       } 
+}
+/***********************************************************************/
+
+int DecisionTree::purgeTreeNodesDataRecursively(RFTreeNode* treeNode) {
+    try {
+        treeNode->bootstrappedTrainingSamples.clear();
+        treeNode->bootstrappedFeatureVectors.clear();
+        treeNode->bootstrappedOutputVector.clear();
+        treeNode->localDiscardedFeatureIndices.clear();
+        treeNode->globalDiscardedFeatureIndices.clear();
+        
+        if (treeNode->leftChildNode != NULL) { purgeTreeNodesDataRecursively(treeNode->leftChildNode); }
+        if (treeNode->rightChildNode != NULL) { purgeTreeNodesDataRecursively(treeNode->rightChildNode); }
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "DecisionTree", "purgeTreeNodesDataRecursively");
+               exit(1);
+       } 
+}
+/***********************************************************************/
+
+void DecisionTree::buildDecisionTree(){
+    try {
+    
+    int generation = 0;
+    rootNode = new RFTreeNode(bootstrappedTrainingSamples, globalDiscardedFeatureIndices, numFeatures, numSamples, numOutputClasses, generation);
+    
+    splitRecursively(rootNode);
+        }
+       catch(exception& e) {
+               m->errorOut(e, "DecisionTree", "buildDecisionTree");
+               exit(1);
+       } 
+}
+
+/***********************************************************************/
+
+int DecisionTree::splitRecursively(RFTreeNode* rootNode) {
+    try {
+       
+        if (rootNode->getNumSamples() < 2){
+            rootNode->setIsLeaf(true);
+            rootNode->setOutputClass(rootNode->getBootstrappedTrainingSamples()[0][rootNode->getNumFeatures()]);
+            return 0;
+        }
+        
+        int classifiedOutputClass;
+        bool isAlreadyClassified = checkIfAlreadyClassified(rootNode, classifiedOutputClass);    
+        if (isAlreadyClassified == true){
+            rootNode->setIsLeaf(true);
+            rootNode->setOutputClass(classifiedOutputClass);
+            return 0;
+        }
+        if (m->control_pressed) {return 0;}
+        vector<int> featureSubsetIndices = selectFeatureSubsetRandomly(globalDiscardedFeatureIndices, rootNode->getLocalDiscardedFeatureIndices());
+        rootNode->setFeatureSubsetIndices(featureSubsetIndices);
+        if (m->control_pressed) {return 0;}
+      
+        findAndUpdateBestFeatureToSplitOn(rootNode);
+        
+        if (m->control_pressed) {return 0;}
+        
+        vector< vector<int> > leftChildSamples;
+        vector< vector<int> > rightChildSamples;
+        getSplitPopulation(rootNode, leftChildSamples, rightChildSamples);
+        
+        if (m->control_pressed) {return 0;}
+        
+        // TODO: need to write code to clear this memory
+        RFTreeNode* leftChildNode = new RFTreeNode(leftChildSamples, globalDiscardedFeatureIndices, numFeatures, (int)leftChildSamples.size(), numOutputClasses, rootNode->getGeneration() + 1);
+        RFTreeNode* rightChildNode = new RFTreeNode(rightChildSamples, globalDiscardedFeatureIndices, numFeatures, (int)rightChildSamples.size(), numOutputClasses, rootNode->getGeneration() + 1);
+        
+        rootNode->setLeftChildNode(leftChildNode);
+        leftChildNode->setParentNode(rootNode);
+        
+        rootNode->setRightChildNode(rightChildNode);
+        rightChildNode->setParentNode(rootNode);
+        
+        // TODO: This recursive split can be parrallelized later
+        splitRecursively(leftChildNode);
+        if (m->control_pressed) {return 0;}
+        
+        splitRecursively(rightChildNode);
+        return 0;
+        
+    }
+       catch(exception& e) {
+               m->errorOut(e, "DecisionTree", "splitRecursively");
+               exit(1);
+       } 
+}
+/***********************************************************************/
+
+int DecisionTree::findAndUpdateBestFeatureToSplitOn(RFTreeNode* node){
+    try {
+
+        vector< vector<int> > bootstrappedFeatureVectors = node->getBootstrappedFeatureVectors();
+        if (m->control_pressed) {return 0;}
+        vector<int> bootstrappedOutputVector = node->getBootstrappedOutputVector();
+        if (m->control_pressed) {return 0;}
+        vector<int> featureSubsetIndices = node->getFeatureSubsetIndices();
+        if (m->control_pressed) {return 0;}
+        
+        vector<double> featureSubsetEntropies;
+        vector<int> featureSubsetSplitValues;
+        vector<double> featureSubsetIntrinsicValues;
+        vector<double> featureSubsetGainRatios;
+        
+        for (int i = 0; i < featureSubsetIndices.size(); i++) {
+            if (m->control_pressed) {return 0;}
+            
+            int tryIndex = featureSubsetIndices[i];
+                       
+            double featureMinEntropy;
+            int featureSplitValue;
+            double featureIntrinsicValue;
+            
+            getMinEntropyOfFeature(bootstrappedFeatureVectors[tryIndex], bootstrappedOutputVector, featureMinEntropy, featureSplitValue, featureIntrinsicValue);
+            if (m->control_pressed) {return 0;}
+            
+            featureSubsetEntropies.push_back(featureMinEntropy);
+            featureSubsetSplitValues.push_back(featureSplitValue);
+            featureSubsetIntrinsicValues.push_back(featureIntrinsicValue);
+            
+            double featureInformationGain = node->getOwnEntropy() - featureMinEntropy;
+            double featureGainRatio = (double)featureInformationGain / (double)featureIntrinsicValue;
+            featureSubsetGainRatios.push_back(featureGainRatio);
+            
+        }
+        
+        vector<double>::iterator minEntropyIterator = min_element(featureSubsetEntropies.begin(), featureSubsetEntropies.end());
+        vector<double>::iterator maxGainRatioIterator = max_element(featureSubsetGainRatios.begin(), featureSubsetGainRatios.end());
+        double featureMinEntropy = *minEntropyIterator;
+        //double featureMaxGainRatio = *maxGainRatioIterator;
+        
+        double bestFeatureSplitEntropy = featureMinEntropy;
+        int bestFeatureToSplitOnIndex = -1;
+        if (treeSplitCriterion == "gainRatio"){ 
+            bestFeatureToSplitOnIndex = (int)(maxGainRatioIterator - featureSubsetGainRatios.begin());
+            // if using 'gainRatio' measure, then featureMinEntropy must be re-updated, as the index
+            // for 'featureMaxGainRatio' would be different
+            bestFeatureSplitEntropy = featureSubsetEntropies[bestFeatureToSplitOnIndex];
+        }
+        else { bestFeatureToSplitOnIndex = (int)(minEntropyIterator - featureSubsetEntropies.begin()); }
+        
+        int bestFeatureSplitValue = featureSubsetSplitValues[bestFeatureToSplitOnIndex];
+        
+        node->setSplitFeatureIndex(featureSubsetIndices[bestFeatureToSplitOnIndex]);
+        node->setSplitFeatureValue(bestFeatureSplitValue);
+        node->setSplitFeatureEntropy(bestFeatureSplitEntropy);
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "DecisionTree", "findAndUpdateBestFeatureToSplitOn");
+               exit(1);
+       } 
+}
+/***********************************************************************/
+vector<int> DecisionTree::selectFeatureSubsetRandomly(vector<int> globalDiscardedFeatureIndices, vector<int> localDiscardedFeatureIndices){
+    try {
+
+        vector<int> featureSubsetIndices;
+        
+        vector<int> combinedDiscardedFeatureIndices;
+        combinedDiscardedFeatureIndices.insert(combinedDiscardedFeatureIndices.end(), globalDiscardedFeatureIndices.begin(), globalDiscardedFeatureIndices.end());
+        combinedDiscardedFeatureIndices.insert(combinedDiscardedFeatureIndices.end(), localDiscardedFeatureIndices.begin(), localDiscardedFeatureIndices.end());
+        
+        sort(combinedDiscardedFeatureIndices.begin(), combinedDiscardedFeatureIndices.end());
+        
+        int numberOfRemainingSuitableFeatures = (int)(numFeatures - combinedDiscardedFeatureIndices.size());
+        int currentFeatureSubsetSize = numberOfRemainingSuitableFeatures < optimumFeatureSubsetSize ? numberOfRemainingSuitableFeatures : optimumFeatureSubsetSize;
+        
+        while (featureSubsetIndices.size() < currentFeatureSubsetSize) {
+            
+            if (m->control_pressed) { return featureSubsetIndices; }
+            
+            // TODO: optimize rand() call here
+            int randomIndex = rand() % numFeatures;
+            vector<int>::iterator it = find(featureSubsetIndices.begin(), featureSubsetIndices.end(), randomIndex);
+            if (it == featureSubsetIndices.end()){    // NOT FOUND
+                vector<int>::iterator it2 = find(combinedDiscardedFeatureIndices.begin(), combinedDiscardedFeatureIndices.end(), randomIndex);
+                if (it2 == combinedDiscardedFeatureIndices.end()){  // NOT FOUND AGAIN
+                    featureSubsetIndices.push_back(randomIndex);
+                }
+            }
+        }
+        sort(featureSubsetIndices.begin(), featureSubsetIndices.end());
+        
+        //#ifdef DEBUG_LEVEL_3
+        //    PRINT_VAR(featureSubsetIndices);
+        //#endif
+        
+        return featureSubsetIndices;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "DecisionTree", "selectFeatureSubsetRandomly");
+               exit(1);
+       } 
+}
+/***********************************************************************/
+
+// TODO: verify that the output of printTree() is correct
+int DecisionTree::printTree(RFTreeNode* treeNode, string caption){
+    try { 
+        string tabs = "";
+        for (int i = 0; i < treeNode->getGeneration(); i++) { tabs += "   "; }
+        //    for (int i = 0; i < treeNode->getGeneration() - 1; i++) { tabs += "|  "; }
+        //    if (treeNode->getGeneration() != 0) { tabs += "|--"; }
+        
+        if (treeNode != NULL && treeNode->checkIsLeaf() == false){
+            m->mothurOut(tabs + caption + " [ gen: " + toString(treeNode->getGeneration()) + " ] ( " + toString(treeNode->getSplitFeatureValue()) + " < X" + toString(treeNode->getSplitFeatureIndex()) +" )\n");
+            
+            printTree(treeNode->getLeftChildNode(), "leftChild");
+            printTree(treeNode->getRightChildNode(), "rightChild");
+        }else {
+            m->mothurOut(tabs + caption + " [ gen: " + toString(treeNode->getGeneration()) + " ] ( classified to: " + toString(treeNode->getOutputClass()) + ", samples: " + toString(treeNode->getNumSamples()) + " )\n");
+        }
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "DecisionTree", "printTree");
+               exit(1);
+       } 
+}
+/***********************************************************************/
+void DecisionTree::deleteTreeNodesRecursively(RFTreeNode* treeNode) {
+    try {
+        if (treeNode == NULL) { return; }
+        deleteTreeNodesRecursively(treeNode->leftChildNode);
+        deleteTreeNodesRecursively(treeNode->rightChildNode);
+        delete treeNode;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "DecisionTree", "deleteTreeNodesRecursively");
+               exit(1);
+       } 
+}
+/***********************************************************************/
+
diff --git a/decisiontree.hpp b/decisiontree.hpp

new file mode 100755 (executable)

index 0000000..d4441ed
--- /dev/null
+++ b/decisiontree.hpp
@@ -0,0 +1,59 @@
+  //
+  //  decisiontree.hpp
+  //  rrf-fs-prototype
+  //
+  //  Created by Abu Zaher Faridee on 5/28/12.
+  //  Copyright (c) 2012 Schloss Lab. All rights reserved.
+  //
+
+#ifndef rrf_fs_prototype_decisiontree_hpp
+#define rrf_fs_prototype_decisiontree_hpp
+
+#include "macros.h"
+#include "rftreenode.hpp"
+#include "abstractdecisiontree.hpp"
+
+/***********************************************************************/
+
+// Strict-weak-ordering comparators that order rows by their second element (index 1),
+// largest first. Both rows must contain at least two elements.
+// Arguments are taken by const reference so that sorting does not copy two vectors per comparison.
+struct VariableRankDescendingSorter {
+    bool operator() (const vector<int>& first, const vector<int>& second) const { return first[1] > second[1]; }
+};
+struct VariableRankDescendingSorterDouble {
+    bool operator() (const vector<double>& first, const vector<double>& second) const { return first[1] > second[1]; }
+};
+/***********************************************************************/
+
+// A single decision tree, derived from AbstractDecisionTree.
+// The tree's nodes are RFTreeNode objects reachable from rootNode; they are released by the
+// destructor through deleteTreeNodesRecursively().
+class DecisionTree: public AbstractDecisionTree{
+    
+    // RandomForest is granted access to the private members below.
+    friend class RandomForest;
+    
+public:
+    
+    // NOTE(review): all arguments are taken by value; baseDataSet is presumably the full
+    // sample-by-feature table - confirm against the definition in decisiontree.cpp.
+    DecisionTree(vector< vector<int> > baseDataSet,
+                 vector<int> globalDiscardedFeatureIndices,
+                 OptimumFeatureSubsetSelector optimumFeatureSubsetSelector,
+                 string treeSplitCriterion);
+    // Deletes every node of the tree, starting from rootNode.
+    virtual ~DecisionTree(){ deleteTreeNodesRecursively(rootNode); }
+    
+    int calcTreeVariableImportanceAndError();
+    int evaluateSample(vector<int> testSample);
+    // Results are reported through the two reference parameters.
+    int calcTreeErrorRate(int& numCorrect, double& treeErrorRate);
+    vector< vector<int> > randomlyShuffleAttribute(vector< vector<int> > samples, int featureIndex);  
+    // Runs purgeTreeNodesDataRecursively() over the whole tree, starting from rootNode.
+    void purgeDataSetsFromTree() { purgeTreeNodesDataRecursively(rootNode); }
+    int purgeTreeNodesDataRecursively(RFTreeNode* treeNode);
+    
+    
+private:
+    
+    void buildDecisionTree();
+    int splitRecursively(RFTreeNode* rootNode);
+    // Chooses the best feature of the node's feature subset and stores the split on the node.
+    int findAndUpdateBestFeatureToSplitOn(RFTreeNode* node);
+    // Returns a sorted, random selection of feature indices that excludes both discarded lists.
+    vector<int> selectFeatureSubsetRandomly(vector<int> globalDiscardedFeatureIndices, vector<int> localDiscardedFeatureIndices);
+    // Recursively logs the subtree rooted at treeNode, prefixing each line with caption.
+    int printTree(RFTreeNode* treeNode, string caption);
+    // Deletes treeNode and everything below it; NULL is ignored.
+    void deleteTreeNodesRecursively(RFTreeNode* treeNode);
+    
+    // NOTE(review): presumably one importance score per feature index - confirm.
+    vector<int> variableImportanceList;
+    // NOTE(review): presumably maps an out-of-bag sample index to its predicted class - confirm.
+    map<int, int> outOfBagEstimates;
+};
+
+#endif
diff --git a/deconvolutecommand.cpp b/deconvolutecommand.cpp

index bab5a634cd101507957dabc22e34e429c5ec5b1b..90a40ce8e86dbe6a6e3cba8cb9d070de374805cc 100644 (file)
--- a/deconvolutecommand.cpp
+++ b/deconvolutecommand.cpp
@@ -14,7 +14,8 @@
  vector<string> DeconvoluteCommand::setParameters(){    
         try {
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
+               CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount);
                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
                 
@@ -31,7 +32,7 @@ vector<string> DeconvoluteCommand::setParameters(){
  string DeconvoluteCommand::getHelpString(){    
         try {
                 string helpString = "";
-               helpString += "The unique.seqs command reads a fastafile and creates a namesfile.\n";
+               helpString += "The unique.seqs command reads a fastafile and creates a name or count file.\n";
                 helpString += "It creates a file where the first column is the groupname and the second column is a list of sequence names who have the same sequence. \n";
                 helpString += "If the sequence is unique the second column will just contain its name. \n";
                 helpString += "The unique.seqs command parameters are fasta and name.  fasta is required, unless there is a valid current fasta file.\n";
@@ -56,6 +57,7 @@ string DeconvoluteCommand::getOutputFileNameTag(string type, string inputName=""
          else {
              if (type == "fasta") {  outputFileName =  "unique" + m->getExtension(inputName); }
              else if (type == "name") {  outputFileName =  "names"; }
+            else if (type == "count") {  outputFileName =  "count_table"; }
              else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
          }
          return outputFileName;
@@ -73,6 +75,7 @@ DeconvoluteCommand::DeconvoluteCommand(){
                 vector<string> tempOutNames;
                 outputTypes["fasta"] = tempOutNames;
                 outputTypes["name"] = tempOutNames;
+        outputTypes["count"] = tempOutNames;
         }
         catch(exception& e) {
                 m->errorOut(e, "DeconvoluteCommand", "DeconvoluteCommand");
@@ -106,6 +109,7 @@ DeconvoluteCommand::DeconvoluteCommand(string option)  {
                         vector<string> tempOutNames;
                         outputTypes["fasta"] = tempOutNames;
                         outputTypes["name"] = tempOutNames;
+            outputTypes["count"] = tempOutNames;
                 
                         //if the user changes the input directory command factory will send this info to us in the output parameter 
                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
@@ -127,6 +131,14 @@ DeconvoluteCommand::DeconvoluteCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
  
                         
@@ -149,11 +161,21 @@ DeconvoluteCommand::DeconvoluteCommand(string option)  {
                         if (oldNameMapFName == "not open") { oldNameMapFName = ""; abort = true; }
                         else if (oldNameMapFName == "not found"){       oldNameMapFName = "";   }
                         else { m->setNameFile(oldNameMapFName); }
+            
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not open") { abort = true; countfile = ""; }  
+                       else if (countfile == "not found") { countfile = ""; }
+                       else { m->setCountTableFile(countfile); }
                         
-                       if (oldNameMapFName == "") {
-                               vector<string> files; files.push_back(inFastaName);
-                               parser.getNameFile(files);
-                       }
+            if ((countfile != "") && (oldNameMapFName != "")) { m->mothurOut("When executing a unique.seqs command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
+                       
+
+                       if (countfile == "") {
+                if (oldNameMapFName == "") {
+                    vector<string> files; files.push_back(inFastaName);
+                    parser.getNameFile(files);
+                }
+            }
                         
                 }
  
@@ -171,6 +193,7 @@ int DeconvoluteCommand::execute() {
  
                 //prepare filenames and open files
                 string outNameFile = outputDir + m->getRootName(m->getSimpleName(inFastaName)) + getOutputFileNameTag("name");
+        string outCountFile = outputDir + m->getRootName(m->getSimpleName(inFastaName)) + getOutputFileNameTag("count");
                 string outFastaFile = outputDir + m->getRootName(m->getSimpleName(inFastaName)) + getOutputFileNameTag("fasta", inFastaName);
                 
                 map<string, string> nameMap;
@@ -179,6 +202,11 @@ int DeconvoluteCommand::execute() {
              m->readNames(oldNameMapFName, nameMap); 
              if (oldNameMapFName == outNameFile){ outNameFile = outputDir + m->getRootName(m->getSimpleName(inFastaName)) + "unique." + getOutputFileNameTag("name");   }
          }
+        CountTable ct;
+        if (countfile != "")  {  
+            ct.readTable(countfile);
+            if (countfile == outCountFile){ outCountFile = outputDir + m->getRootName(m->getSimpleName(inFastaName)) + "unique." + getOutputFileNameTag("count");   }
+        }
                 
                 if (m->control_pressed) { return 0; }
                 
@@ -222,7 +250,10 @@ int DeconvoluteCommand::execute() {
                                                         sequenceStrings[seq.getAligned()] = itNames->second;
                                                         nameFileOrder.push_back(seq.getAligned());
                                                 }
-                                       }else { sequenceStrings[seq.getAligned()] = seq.getName();      nameFileOrder.push_back(seq.getAligned()); }
+                                       }else if (countfile != "") { 
+                        ct.getNumSeqs(seq.getName()); //checks to make sure seq is in table
+                        sequenceStrings[seq.getAligned()] = seq.getName();     nameFileOrder.push_back(seq.getAligned());
+                    }else {    sequenceStrings[seq.getAligned()] = seq.getName();      nameFileOrder.push_back(seq.getAligned()); }
                                 }else { //this is a dup
                                         if (oldNameMapFName != "") {
                                                 itNames = nameMap.find(seq.getName());
@@ -232,7 +263,12 @@ int DeconvoluteCommand::execute() {
                                                 }else {
                                                         sequenceStrings[seq.getAligned()] += "," + itNames->second;
                                                 }
-                                       }else { sequenceStrings[seq.getAligned()] += "," + seq.getName();       }
+                    }else if (countfile != "") { 
+                        int num = ct.getNumSeqs(seq.getName()); //checks to make sure seq is in table
+                        if (num != 0) { //its in the table
+                            ct.mergeCounts(itStrings->second, seq.getName()); //merges counts and saves in uniques name
+                        }
+                    }else {    sequenceStrings[seq.getAligned()] += "," + seq.getName();       }
                                 }
                                 
                                 count++;
@@ -252,34 +288,35 @@ int DeconvoluteCommand::execute() {
                 
                 //print new names file
                 ofstream outNames;
-               m->openOutputFile(outNameFile, outNames);
+               if (countfile == "") { m->openOutputFile(outNameFile, outNames); outputNames.push_back(outNameFile); outputTypes["name"].push_back(outNameFile);  }
+        else { m->openOutputFile(outCountFile, outNames); ct.printHeaders(outNames); outputTypes["count"].push_back(outCountFile); outputNames.push_back(outCountFile); }
                 
                 for (int i = 0; i < nameFileOrder.size(); i++) {
-               //for (itStrings = sequenceStrings.begin(); itStrings != sequenceStrings.end(); itStrings++) {
-                       if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outFastaFile); outNames.close(); m->mothurRemove(outNameFile); return 0; }
+                       if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outFastaFile); outNames.close(); for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
                         
                         itStrings = sequenceStrings.find(nameFileOrder[i]);
                         
                         if (itStrings != sequenceStrings.end()) {
-                               //get rep name
-                               int pos = (itStrings->second).find_first_of(',');
-                       
-                               if (pos == string::npos) { // only reps itself
-                                       outNames << itStrings->second << '\t' << itStrings->second << endl;
-                               }else {
-                                       outNames << (itStrings->second).substr(0, pos) << '\t' << itStrings->second << endl;
-                               }
+                if (countfile == "") {
+                    //get rep name
+                    int pos = (itStrings->second).find_first_of(',');
+                    
+                    if (pos == string::npos) { // only reps itself
+                        outNames << itStrings->second << '\t' << itStrings->second << endl;
+                    }else {
+                        outNames << (itStrings->second).substr(0, pos) << '\t' << itStrings->second << endl;
+                    }
+                }else {  ct.printSeq(outNames, itStrings->second);  }
                         }else{ m->mothurOut("[ERROR]: mismatch in namefile print."); m->mothurOutEndLine(); m->control_pressed = true; }
                 }
                 outNames.close();
                 
-               if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outFastaFile); m->mothurRemove(outNameFile); return 0; }
+               if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outFastaFile); for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); }  return 0; }
                 
                 m->mothurOutEndLine();
                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
-               m->mothurOut(outFastaFile); m->mothurOutEndLine();      
-               m->mothurOut(outNameFile); m->mothurOutEndLine();
-               outputNames.push_back(outFastaFile);  outputNames.push_back(outNameFile); outputTypes["fasta"].push_back(outFastaFile);  outputTypes["name"].push_back(outNameFile); 
+               outputNames.push_back(outFastaFile);   outputTypes["fasta"].push_back(outFastaFile);  
+        for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
                 m->mothurOutEndLine();
  
                 //set fasta file as new current fastafile
@@ -293,6 +330,11 @@ int DeconvoluteCommand::execute() {
                 if (itTypes != outputTypes.end()) {
                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
                 }
+        
+        itTypes = outputTypes.find("count");
+               if (itTypes != outputTypes.end()) {
+                       if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
+               }
                 
                 return 0;
         }
diff --git a/deconvolutecommand.h b/deconvolutecommand.h

index 7d4cb50023eb6312b652a8368d2d862882b3ffd4..673ffc9e4d2e689cd32c6b16867089fe43f216d1 100644 (file)
--- a/deconvolutecommand.h
+++ b/deconvolutecommand.h
@@ -11,6 +11,7 @@
  
  #include "command.hpp"
  #include "fastamap.h"
+#include "counttable.h"
  
  /* The unique.seqs command reads a fasta file, finds the duplicate sequences and outputs a names file
         containing 2 columns.  The first being the groupname and the second the list of identical sequence names. */ 
@@ -37,7 +38,7 @@ public:
         
         
  private:
-       string inFastaName, oldNameMapFName, outputDir;
+       string inFastaName, oldNameMapFName, outputDir, countfile;
         vector<string> outputNames;
  
         bool abort;
diff --git a/deuniquetreecommand.cpp b/deuniquetreecommand.cpp

index 662282b1d05d57e25ee2c259f178d00c5ecf96ca..d334f8f4ef107d12a57eaa4aff4d71b84808edb9 100644 (file)
--- a/deuniquetreecommand.cpp
+++ b/deuniquetreecommand.cpp
@@ -161,7 +161,8 @@ int DeuniqueTreeCommand::execute() {
                 
                 TreeReader* reader = new TreeReader(treefile, "", namefile);
          vector<Tree*> T = reader->getTrees();
-        map<string, string> nameMap = reader->getNameMap();
+        map<string, string> nameMap;
+        m->readNames(namefile, nameMap);
          delete reader;         
                 
                 //print new Tree
@@ -172,7 +173,7 @@ int DeuniqueTreeCommand::execute() {
                 T[0]->print(out, nameMap);
                 out.close();
                 
-        delete (T[0]->getTreeMap());
+        delete (T[0]->getCountTable());
                 for (int i = 0; i < T.size(); i++) { delete T[i]; }
                                 
                 //set phylip file as new current phylipfile
diff --git a/flowdata.cpp b/flowdata.cpp

index 1420f84b992a54ed6dfb6d3a28d8452a083691de..1fe7d7faf1e31ee486890980c4731ddebfdca8fb 100644 (file)
--- a/flowdata.cpp
+++ b/flowdata.cpp
@@ -43,13 +43,15 @@ bool FlowData::getNext(ifstream& flowFile){
         
         try {
                 flowFile >> seqName >> endFlow; 
-               //cout << "in Flowdata " + seqName << endl;
-               for(int i=0;i<numFlows;i++)     {       flowFile >> flowData[i];        }
-               //cout << "in Flowdata read " << seqName + " done" << endl;
-               updateEndFlow(); 
-               translateFlow();
-               
-               m->gobble(flowFile);
+        if (seqName.length() != 0) {
+            //cout << "in Flowdata " + seqName << endl;
+            for(int i=0;i<numFlows;i++)        {       flowFile >> flowData[i];        }
+            //cout << "in Flowdata read " << seqName + " done" << endl;
+            updateEndFlow(); 
+            translateFlow();
+            m->gobble(flowFile);
+               }else{ m->mothurOut("Error in reading your flowfile, at position " + toString(flowFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
+            
                 if(flowFile){   return 1;       }
                 else            {       return 0;       }
         }
diff --git a/getgroupscommand.cpp b/getgroupscommand.cpp

index fe6f571669bf820d1a153ef3f44327611666b8a1..69f4403ca0039dab9d3c4f5aa626a566fa0805d9 100644 (file)
--- a/getgroupscommand.cpp
+++ b/getgroupscommand.cpp
@@ -18,8 +18,9 @@ vector<string> GetGroupsCommand::setParameters(){
         try {
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "FNGLT",false,false); parameters.push_back(pfasta);
                 CommandParameter pshared("shared", "InputTypes", "", "", "none", "sharedGroup", "none",false,false); parameters.push_back(pshared);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "sharedGroup", "FNGLT",false,false); parameters.push_back(pgroup);
+        CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "sharedGroup", "FNGLT",false,false); parameters.push_back(pgroup);         
          CommandParameter pdesign("design", "InputTypes", "", "", "none", "sharedGroup", "FNGLT",false,false); parameters.push_back(pdesign);
                 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "FNGLT",false,false); parameters.push_back(plist);
                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FNGLT",false,false); parameters.push_back(ptaxonomy);
@@ -43,7 +44,7 @@ string GetGroupsCommand::getHelpString(){
                 string helpString = "";
                 helpString += "The get.groups command selects sequences from a specfic group or set of groups from the following file types: fasta, name, group, list, taxonomy, design or shared file.\n";
                 helpString += "It outputs a file containing the sequences in the those specified groups, or a sharedfile containing only those groups.\n";
-               helpString += "The get.groups command parameters are accnos, fasta, name, group, list, taxonomy, shared, design and groups. The group parameter is required, unless you have a current group file, or are using a shared file.\n";
+               helpString += "The get.groups command parameters are accnos, fasta, name, group, list, taxonomy, shared, design and groups. The group or count parameter is required, unless you have a current group or count file, or are using a shared file.\n";
                 helpString += "You must also provide an accnos containing the list of groups to get or set the groups parameter to the groups you wish to select.\n";
                 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like.  You can separate group names with dashes.\n";
                 helpString += "The get.groups command should be in the following format: get.groups(accnos=yourAccnos, fasta=yourFasta, group=yourGroupFile).\n";
@@ -71,6 +72,7 @@ string GetGroupsCommand::getOutputFileNameTag(string type, string inputName=""){
              else if (type == "taxonomy")    {   outputFileName =  "pick" + m->getExtension(inputName);   }
              else if (type == "name")        {   outputFileName =  "pick" + m->getExtension(inputName);   }
              else if (type == "group")       {   outputFileName =  "pick" + m->getExtension(inputName);   }
+            else if (type == "count")       {   outputFileName =  "pick.count_table";   }
              else if (type == "list")        {   outputFileName =  "pick" + m->getExtension(inputName);   }
              else if (type == "shared")      {   outputFileName =  "pick" + m->getExtension(inputName);   }
              else if (type == "design")      {   outputFileName =  "pick" + m->getExtension(inputName);   }
@@ -97,6 +99,7 @@ GetGroupsCommand::GetGroupsCommand(){
                 outputTypes["list"] = tempOutNames;
                 outputTypes["shared"] = tempOutNames;
          outputTypes["design"] = tempOutNames;
+        outputTypes["count"] = tempOutNames;
         }
         catch(exception& e) {
                 m->errorOut(e, "GetGroupsCommand", "GetGroupsCommand");
@@ -135,6 +138,7 @@ GetGroupsCommand::GetGroupsCommand(string option)  {
                         outputTypes["list"] = tempOutNames;
                         outputTypes["shared"] = tempOutNames;
              outputTypes["design"] = tempOutNames;
+            outputTypes["count"] = tempOutNames;
                         
                         
                         //if the user changes the output directory command factory will send this info to us in the output parameter 
@@ -208,6 +212,14 @@ GetGroupsCommand::GetGroupsCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["design"] = inputDir + it->second;           }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
                         
                         
@@ -227,11 +239,6 @@ GetGroupsCommand::GetGroupsCommand(string option)  {
                         else if (namefile == "not found") {  namefile = "";  }  
                         else { m->setNameFile(namefile); }
                         
-                       groupfile = validParameter.validFile(parameters, "group", true);
-                       if (groupfile == "not open") { groupfile = ""; abort = true; }
-                       else if (groupfile == "not found") {  groupfile = "";                   }
-                       else { m->setGroupFile(groupfile); }    
-                       
                         listfile = validParameter.validFile(parameters, "list", true);
                         if (listfile == "not open") { abort = true; }
                         else if (listfile == "not found") {  listfile = "";  }
@@ -263,8 +270,22 @@ GetGroupsCommand::GetGroupsCommand(string option)  {
                         if (designfile == "not open") { designfile = ""; abort = true; }
                         else if (designfile == "not found") {   designfile = "";        }
                         else { m->setDesignFile(designfile); }
+            
+            countfile = validParameter.validFile(parameters, "count", true);
+            if (countfile == "not open") { countfile = ""; abort = true; }
+            else if (countfile == "not found") { countfile = "";  }    
+            else { m->setCountTableFile(countfile); }
+            
+            if ((namefile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
+            }
+            
+            if ((groupfile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
+            }
+
                         
-                       if ((sharedfile == "") && (groupfile == "") && (designfile == "")) { 
+                       if ((sharedfile == "") && (groupfile == "") && (designfile == "") && (countfile == "")) { 
                                 //is there are current file available for any of these?
                                 if ((namefile != "") || (fastafile != "") || (listfile != "") || (taxfile != "")) {
                                         //give priority to group, then shared
@@ -274,7 +295,11 @@ GetGroupsCommand::GetGroupsCommand(string option)  {
                                                 sharedfile = m->getSharedFile(); 
                                                 if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); }
                                                 else { 
-                                                       m->mothurOut("You have no current groupfile or sharedfile and one is required."); m->mothurOutEndLine(); abort = true;
+                                                       countfile = m->getCountTableFile(); 
+                            if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
+                            else { 
+                                m->mothurOut("You have no current groupfile, countfile or sharedfile and one is required."); m->mothurOutEndLine(); abort = true;
+                            }
                                                 }
                                         }
                                 }else {
@@ -288,7 +313,12 @@ GetGroupsCommand::GetGroupsCommand(string option)  {
                                                         designfile = m->getDesignFile(); 
                              if (designfile != "") { m->mothurOut("Using " + designfile + " as input file for the design parameter."); m->mothurOutEndLine(); }
                              else { 
-                                m->mothurOut("You have no current groupfile or sharedfile or designfile and one is required."); m->mothurOutEndLine(); abort = true;
+                                countfile = m->getCountTableFile(); 
+                                if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
+                                else { 
+                                    m->mothurOut("You have no current groupfile, designfile, countfile or sharedfile and one is required."); m->mothurOutEndLine(); abort = true;
+                                }
+
                              }
                                                 }
                                         }
@@ -297,13 +327,15 @@ GetGroupsCommand::GetGroupsCommand(string option)  {
                         
                         if ((accnosfile == "") && (Groups.size() == 0)) { m->mothurOut("You must provide an accnos file or specify groups using the groups parameter."); m->mothurOutEndLine(); abort = true; }
                         
-                       if ((fastafile == "") && (namefile == "") && (groupfile == "")  && (designfile == "") && (sharedfile == "") && (listfile == "") && (taxfile == ""))  { m->mothurOut("You must provide at least one of the following: fasta, name, taxonomy, group, shared, design or list."); m->mothurOutEndLine(); abort = true; }
-                       if ((groupfile == "") && ((namefile != "") || (fastafile != "") || (listfile != "") || (taxfile != "")))  { m->mothurOut("If using a fasta, name, taxonomy, group or list, then you must provide a group file."); m->mothurOutEndLine(); abort = true; }
-
-                       if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){
-                               vector<string> files; files.push_back(fastafile); files.push_back(taxfile);
-                               parser.getNameFile(files);
-                       }
+                       if ((fastafile == "") && (namefile == "") && (countfile == "") && (groupfile == "")  && (designfile == "") && (sharedfile == "") && (listfile == "") && (taxfile == ""))  { m->mothurOut("You must provide at least one of the following: fasta, name, taxonomy, group, shared, design, count or list."); m->mothurOutEndLine(); abort = true; }
+                       if (((groupfile == "") && (countfile == "")) && ((namefile != "") || (fastafile != "") || (listfile != "") || (taxfile != "")))  { m->mothurOut("If using a fasta, name, taxonomy, group or list, then you must provide a group or count file."); m->mothurOutEndLine(); abort = true; }
+            
+            if (countfile == "") {
+                if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){
+                    vector<string> files; files.push_back(fastafile); files.push_back(taxfile);
+                    parser.getNameFile(files);
+                }
+            }
                 }
                 
         }
@@ -331,6 +363,7 @@ int GetGroupsCommand::execute(){
                         SharedUtil* util = new SharedUtil();
                         vector<string> gNamesOfGroups = groupMap->getNamesOfGroups();
                         util->setGroups(Groups, gNamesOfGroups);
+            m->setGroups(Groups);
                         groupMap->setNamesOfGroups(gNamesOfGroups);
                         delete util;
                         
@@ -338,7 +371,23 @@ int GetGroupsCommand::execute(){
                         fillNames();
                         
                         delete groupMap;
-               }
+               }else if (countfile != ""){
+            if ((fastafile != "") || (listfile != "") || (taxfile != "")) { 
+                m->mothurOut("\n[NOTE]: The count file should contain only unique names, so mothur assumes your fasta, list and taxonomy files also contain only uniques.\n\n");
+            }
+            CountTable ct;
+            ct.readTable(countfile);
+            if (!ct.hasGroupInfo()) { m->mothurOut("[ERROR]: your count file does not contain group info, aborting.\n"); return 0; }
+                
+            vector<string> gNamesOfGroups = ct.getNamesOfGroups();
+            SharedUtil util;
+            util.setGroups(Groups, gNamesOfGroups);
+            m->setGroups(Groups);
+            for (int i = 0; i < Groups.size(); i++) {
+                vector<string> thisGroupsSeqs = ct.getNamesOfSeqs(Groups[i]);
+                for (int j = 0; j < thisGroupsSeqs.size(); j++) { names.insert(thisGroupsSeqs[j]); }
+            }
+        }
                 
                 if (m->control_pressed) { return 0; }
                 
@@ -346,6 +395,7 @@ int GetGroupsCommand::execute(){
                 if (namefile != "")                     {               readName();             }
                 if (fastafile != "")            {               readFasta();    }
                 if (groupfile != "")            {               readGroup();    }
+        if (countfile != "")           {               readCount();    }
                 if (listfile != "")                     {               readList();             }
                 if (taxfile != "")                      {               readTax();              }
                 if (sharedfile != "")           {               readShared();   }
@@ -396,6 +446,11 @@ int GetGroupsCommand::execute(){
                         if (itTypes != outputTypes.end()) {
                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setDesignFile(current); }
                         }
+            
+            itTypes = outputTypes.find("count");
+                       if (itTypes != outputTypes.end()) {
+                               if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
+                       }
                 }
                 
                 return 0;               
@@ -742,6 +797,82 @@ int GetGroupsCommand::readGroup(){
         }
  }
  //**********************************************************************************************************************
+// Writes a filtered copy of countfile: keeps only sequences listed in 'names' and only the columns of
+// the groups in 'Groups' (sorted in place here), recomputing each row's total from the kept columns.
+// Kept counts are written in sorted group order so they match the header even if the input is unsorted.
+int GetGroupsCommand::readCount(){
+       try {
+               string thisOutputDir = outputDir;
+               if (outputDir == "") {  thisOutputDir += m->hasPath(countfile);  }
+               string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile);
+               ofstream out;
+               m->openOutputFile(outputFileName, out);
+               ifstream in;
+               m->openInputFile(countfile, in);
+
+               bool wroteSomething = false;
+               int selectedCount = 0;
+
+               //first line holds the column headers: name, total, then one column per group
+               string headers = m->getline(in); m->gobble(in);
+               vector<string> columnHeaders = m->splitWhiteSpace(headers);
+
+               vector<string> groups;                 //group names, sorted below
+               map<int, string> originalGroupIndexes; //file column position -> group name
+               map<string, int> GroupIndexes;         //group name -> position in sorted order
+               set<int> indexOfGroupsChosen;          //sorted positions of the requested groups
+               for (int i = 2; i < columnHeaders.size(); i++) {  groups.push_back(columnHeaders[i]);  originalGroupIndexes[i-2] = columnHeaders[i]; }
+               //sort groups to keep consistent with how we store the groups in groupmap
+               sort(groups.begin(), groups.end());
+               for (int i = 0; i < groups.size(); i++) {  GroupIndexes[groups[i]] = i; }
+               sort(Groups.begin(), Groups.end());
+               out << "Representative_Sequence\ttotal\t";
+               for (int i = 0; i < Groups.size(); i++) { out << Groups[i] << '\t'; indexOfGroupsChosen.insert(GroupIndexes[Groups[i]]); }
+               out << endl;
+
+               string name; int oldTotal;
+               while (!in.eof()) {
+                   if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
+                   in >> name; m->gobble(in); in >> oldTotal;
+                   //a failed extraction never reaches eof, so stop instead of looping forever on a malformed row
+                   if (in.fail()) { m->mothurOut("[ERROR]: unable to read a name and total from " + countfile + ", stopping.\n"); break; }
+                   m->gobble(in);
+                   if (m->debug) { m->mothurOut("[DEBUG]: " + name + '\t' + toString(oldTotal) + "\n"); }
+                   if (names.count(name) != 0) {
+                       //counts arrive in file column order; key them by sorted position so that
+                       //iterating the map reproduces the (sorted) order used for the header above
+                       map<int, int> selectedCounts; int thisTotal = 0; int temp;
+                       for (int i = 0; i < groups.size(); i++) {
+                           int thisIndex = GroupIndexes[originalGroupIndexes[i]];
+                           in >> temp;  m->gobble(in);
+                           if (indexOfGroupsChosen.count(thisIndex) != 0) { //we want this group
+                               selectedCounts[thisIndex] += temp; thisTotal += temp;
+                           }
+                       }
+                       out << name << '\t' << thisTotal << '\t';
+                       for (map<int, int>::iterator it = selectedCounts.begin(); it != selectedCounts.end(); it++) {  out << it->second << '\t'; }
+                       out << endl;
+                       wroteSomething = true;
+                       selectedCount+= thisTotal;
+                   }else {  m->getline(in); }
+                   m->gobble(in);
+               }
+               in.close();
+               out.close();
+
+               if (wroteSomething == false) {  m->mothurOut("Your file does NOT contain sequences from the groups you wish to get."); m->mothurOutEndLine();  }
+               outputTypes["count"].push_back(outputFileName); outputNames.push_back(outputFileName);
+
+               m->mothurOut("Selected " + toString(selectedCount) + " sequences from your count file."); m->mothurOutEndLine();
+
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "GetGroupsCommand", "readCount");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
  int GetGroupsCommand::readDesign(){
         try {
                 string thisOutputDir = outputDir;
diff --git a/getgroupscommand.h b/getgroupscommand.h

index 80230b49b9872ab4633dbd2d4120826b3cc1ee82..6bb608833a418f2c0854c2b8bf0ac9c16a66f790 100644 (file)
--- a/getgroupscommand.h
+++ b/getgroupscommand.h
@@ -40,7 +40,7 @@ private:
         map<string, string> uniqueToRedundant; //if a namefile is given and the first column name is not selected
                                                                                    //then the other files need to change the unique name in their file to match.
                                                                                    //only add the names that need to be changed to keep the map search quick
-       string accnosfile, fastafile, namefile, groupfile, listfile, designfile, taxfile, outputDir, groups, sharedfile;
+       string accnosfile, countfile, fastafile, namefile, groupfile, listfile, designfile, taxfile, outputDir, groups, sharedfile;
         bool abort;
         vector<string> outputNames, Groups;
         GroupMap* groupMap;
@@ -48,6 +48,7 @@ private:
         int readFasta();
         int readName();
         int readGroup();
+    int readCount();
         int readList();
         int readTax();
         int fillNames();
diff --git a/getlineagecommand.cpp b/getlineagecommand.cpp

index 1aba0fed4e6e772de07718a544335b6e6bb58db9..645655d03f72bbebac45cfa108c499cc1daf8e3b 100644 (file)
--- a/getlineagecommand.cpp
+++ b/getlineagecommand.cpp
@@ -10,13 +10,15 @@
  #include "getlineagecommand.h"
  #include "sequence.hpp"
  #include "listvector.hpp"
+#include "counttable.h"
  
  //**********************************************************************************************************************
  vector<string> GetLineageCommand::setParameters(){     
         try {
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pfasta);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pname);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pgroup);
+        CommandParameter pname("name", "InputTypes", "", "", "NameCount", "FNGLT", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "FNGLT", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "FNGLT", "none",false,false); parameters.push_back(pgroup);
                 CommandParameter plist("list", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(plist);
                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "FNGLT", "none",false,true); parameters.push_back(ptaxonomy);
                 CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(palignreport);
@@ -38,9 +40,9 @@ vector<string> GetLineageCommand::setParameters(){
  string GetLineageCommand::getHelpString(){     
         try {
                 string helpString = "";
-               helpString += "The get.lineage command reads a taxonomy file and any of the following file types: fasta, name, group, list or alignreport file.\n";
+               helpString += "The get.lineage command reads a taxonomy file and any of the following file types: fasta, name, group, count, list or alignreport file.\n";
                 helpString += "It outputs a file containing only the sequences from the taxonomy file that are from the taxon requested.\n";
-               helpString += "The get.lineage command parameters are taxon, fasta, name, group, list, taxonomy, alignreport and dups.  You must provide taxonomy unless you have a valid current taxonomy file.\n";
+               helpString += "The get.lineage command parameters are taxon, fasta, name, group, count, list, taxonomy, alignreport and dups.  You must provide taxonomy unless you have a valid current taxonomy file.\n";
                 helpString += "The dups parameter allows you to add the entire line from a name file if you add any name from the line. default=false. \n";
                 helpString += "The taxon parameter allows you to select the taxons you would like to get and is required.\n";
                 helpString += "You may enter your taxons with confidence scores, doing so will get only those sequences that belong to the taxonomy and whose cofidence scores is above the scores you give.\n";
@@ -70,6 +72,7 @@ string GetLineageCommand::getOutputFileNameTag(string type, string inputName="")
              if (type == "fasta")            {   outputFileName =  "pick" + m->getExtension(inputName);   }
              else if (type == "taxonomy")    {   outputFileName =  "pick" + m->getExtension(inputName);   }
              else if (type == "name")        {   outputFileName =  "pick" + m->getExtension(inputName);   }
+            else if (type == "count")       {   outputFileName =  "pick.count_table";                    }  
              else if (type == "group")       {   outputFileName =  "pick" + m->getExtension(inputName);   }
              else if (type == "list")        {   outputFileName =  "pick" + m->getExtension(inputName);   }
              else if (type == "alignreport") {   outputFileName =  "pick.align.report";   }
@@ -94,6 +97,7 @@ GetLineageCommand::GetLineageCommand(){
                 outputTypes["group"] = tempOutNames;
                 outputTypes["alignreport"] = tempOutNames;
                 outputTypes["list"] = tempOutNames;
+        outputTypes["count"] = tempOutNames;
         }
         catch(exception& e) {
                 m->errorOut(e, "GetLineageCommand", "GetLineageCommand");
@@ -131,6 +135,7 @@ GetLineageCommand::GetLineageCommand(string option)  {
                         outputTypes["group"] = tempOutNames;
                         outputTypes["alignreport"] = tempOutNames;
                         outputTypes["list"] = tempOutNames;
+            outputTypes["count"] = tempOutNames;
  
                         //if the user changes the output directory command factory will send this info to us in the output parameter 
                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
@@ -187,6 +192,14 @@ GetLineageCommand::GetLineageCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
  
                         
@@ -230,6 +243,19 @@ GetLineageCommand::GetLineageCommand(string option)  {
                                 else                            {  temp = "false"; usedDups = "";       }
                         }
                         dups = m->isTrue(temp);
+            
+            countfile = validParameter.validFile(parameters, "count", true);
+            if (countfile == "not open") { countfile = ""; abort = true; }
+            else if (countfile == "not found") { countfile = "";  }    
+            else { m->setCountTableFile(countfile); }
+            
+            if ((namefile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
+            }
+            
+            if ((groupfile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
+            }
                         
                         taxons = validParameter.validFile(parameters, "taxon", false);  
                         if (taxons == "not found") { taxons = "";  m->mothurOut("No taxons given, please correct."); m->mothurOutEndLine();  abort = true;  }
@@ -240,12 +266,14 @@ GetLineageCommand::GetLineageCommand(string option)  {
                         }
                         m->splitAtChar(taxons, listOfTaxons, '-');
                         
-                       if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == ""))  { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy or listfile."); m->mothurOutEndLine(); abort = true; }
+                       if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (countfile == ""))  { m->mothurOut("You must provide one of the following: fasta, name, group, count, alignreport, taxonomy or listfile."); m->mothurOutEndLine(); abort = true; }
                 
-                       if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){
-                               vector<string> files; files.push_back(fastafile); files.push_back(taxfile);
-                               parser.getNameFile(files);
-                       }
+            if (countfile == "") {
+                if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){
+                    vector<string> files; files.push_back(fastafile); files.push_back(taxfile);
+                    parser.getNameFile(files);
+                }
+            }
                 }
  
         }
@@ -262,11 +290,18 @@ int GetLineageCommand::execute(){
                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
                 
                 if (m->control_pressed) { return 0; }
+        
+        if (countfile != "") {
+            if ((fastafile != "") || (listfile != "") || (taxfile != "")) { 
+                m->mothurOut("\n[NOTE]: The count file should contain only unique names, so mothur assumes your fasta, list and taxonomy files also contain only uniques.\n\n");
+            }
+        }
                 
                 //read through the correct file and output lines you want to keep
                 if (taxfile != "")                      {               readTax();              }  //fills the set of names to get
                 if (namefile != "")                     {               readName();             }
                 if (fastafile != "")            {               readFasta();    }
+        if (countfile != "")           {               readCount();    }
                 if (groupfile != "")            {               readGroup();    }
                 if (alignfile != "")            {               readAlign();    }
                 if (listfile != "")                     {               readList();             }
@@ -305,7 +340,12 @@ int GetLineageCommand::execute(){
                         itTypes = outputTypes.find("taxonomy");
                         if (itTypes != outputTypes.end()) {
                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
-                       }                       
+                       }
+                       
+            itTypes = outputTypes.find("count");
+                       if (itTypes != outputTypes.end()) {
+                               if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
+                       }
                 }
                 
                 return 0;               
@@ -353,7 +393,7 @@ int GetLineageCommand::readFasta(){
                 in.close();     
                 out.close();
                 
-               if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine();  }
+               if (wroteSomething == false) { m->mothurOut("Your file contains does not contain any sequences from " + taxons + "."); m->mothurOutEndLine();  }
                 outputNames.push_back(outputFileName);  outputTypes["fasta"].push_back(outputFileName);
                 
                 return 0;
@@ -365,6 +405,60 @@ int GetLineageCommand::readFasta(){
         }
  }
  //**********************************************************************************************************************
+// Copies the header of countfile and every row whose sequence name is in 'names' to the output count
+// file, then has CountTable re-read and re-print that output when testGroups() returns true for it.
+// Records the output file under outputTypes["count"] and outputNames. Returns 0.
+int GetLineageCommand::readCount(){
+       try {
+               string thisOutputDir = outputDir;
+               if (outputDir == "") {  thisOutputDir += m->hasPath(countfile);  }
+               string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile);
+               ofstream out;
+               m->openOutputFile(outputFileName, out);
+               ifstream in;
+               m->openInputFile(countfile, in);
+               bool wroteSomething = false;
+
+               //the header line is copied through unchanged
+               string headers = m->getline(in); m->gobble(in);
+               out << headers << endl;
+
+               string name, rest; int thisTotal;
+               while (!in.eof()) {
+                   if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
+                   in >> name; m->gobble(in);
+                   in >> thisTotal;
+                   //a failed extraction never reaches eof, so stop instead of looping forever on a malformed row
+                   if (in.fail()) { m->mothurOut("[ERROR]: unable to read a name and total from " + countfile + ", stopping.\n"); break; }
+                   m->gobble(in);
+                   //NOTE(review): if gobble also eats newlines, a row with no group columns pulls the next row into rest - confirm
+                   rest = m->getline(in); m->gobble(in);
+                   if (m->debug) { m->mothurOut("[DEBUG]: " + name + '\t' + rest + "\n"); }
+                   if (names.count(name) != 0) {
+                       out << name << '\t' << thisTotal << '\t' << rest << endl;
+                       wroteSomething = true;
+                   }
+               }
+               in.close();
+               out.close();
+
+               //check for groups that have been eliminated
+               CountTable ct;
+               if (ct.testGroups(outputFileName)) {
+                   ct.readTable(outputFileName);
+                   ct.printTable(outputFileName);
+               }
+
+               if (wroteSomething == false) {  m->mothurOut("Your file does not contain any sequences from " + taxons + "."); m->mothurOutEndLine();  }
+               outputTypes["count"].push_back(outputFileName); outputNames.push_back(outputFileName);
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "GetLineageCommand", "readCount");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
  int GetLineageCommand::readList(){
         try {
                 string thisOutputDir = outputDir;
@@ -425,7 +519,7 @@ int GetLineageCommand::readList(){
                 in.close();     
                 out.close();
                 
-               if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine();  }
+               if (wroteSomething == false) { m->mothurOut("Your file contains does not contain any sequences from " + taxons + "."); m->mothurOutEndLine();  }
                 outputNames.push_back(outputFileName); outputTypes["list"].push_back(outputFileName);
                 
                 return 0;
@@ -510,7 +604,7 @@ int GetLineageCommand::readName(){
                 in.close();
                 out.close();
                 
-               if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine();  }
+               if (wroteSomething == false) { m->mothurOut("Your file contains does not contain any sequences from " + taxons + "."); m->mothurOutEndLine();  }
                 outputNames.push_back(outputFileName);  outputTypes["name"].push_back(outputFileName);
                 
                 return 0;
@@ -558,7 +652,7 @@ int GetLineageCommand::readGroup(){
                 in.close();
                 out.close();
                 
-               if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine();  }
+               if (wroteSomething == false) { m->mothurOut("Your file contains does not contain any sequences from " + taxons + "."); m->mothurOutEndLine();  }
                 outputNames.push_back(outputFileName);  outputTypes["group"].push_back(outputFileName);
                 
                 return 0;
@@ -606,15 +700,17 @@ int GetLineageCommand::readTax(){
                         in >> name;                             //read from first column
                         in >> tax;                      //read from second column
                         
+            string noQuotesTax = m->removeQuotes(tax);
+            
                         for (int j = 0; j < listOfTaxons.size(); j++) {
                                                         
-                               string newtax = tax;
+                               string newtax = noQuotesTax;
                         
                                 //if the users file contains confidence scores we want to ignore them when searching for the taxons, unless the taxon has them
                                 if (!taxonsHasConfidence[j]) {
-                                       int hasConfidences = tax.find_first_of('(');
+                                       int hasConfidences = noQuotesTax.find_first_of('(');
                                         if (hasConfidences != string::npos) { 
-                                               newtax = tax;
+                                               newtax = noQuotesTax;
                                                 m->removeConfidences(newtax);
                                         }
                                 
@@ -627,7 +723,7 @@ int GetLineageCommand::readTax(){
                                                 break;
                                         }
                                 }else{//if listOfTaxons[i] has them and you don't them remove taxons
-                                       int hasConfidences = tax.find_first_of('(');
+                                       int hasConfidences = noQuotesTax.find_first_of('(');
                                         if (hasConfidences == string::npos) { 
                                         
                                                 int pos = newtax.find(noConfidenceTaxons[j]);
@@ -641,10 +737,10 @@ int GetLineageCommand::readTax(){
                                         }else { //both have confidences so we want to make sure the users confidences are greater then or equal to the taxons
                                                 //first remove confidences from both and see if the taxonomy exists
                                         
-                                               string noNewTax = tax;
-                                               int hasConfidences = tax.find_first_of('(');
+                                               string noNewTax = noQuotesTax;
+                                               int hasConfidences = noQuotesTax.find_first_of('(');
                                                 if (hasConfidences != string::npos) { 
-                                                       noNewTax = tax;
+                                                       noNewTax = noQuotesTax;
                                                         m->removeConfidences(noNewTax);
                                                 }
                                         
@@ -814,7 +910,7 @@ int GetLineageCommand::readAlign(){
                 in.close();
                 out.close();
                 
-               if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine();  }
+               if (wroteSomething == false) { m->mothurOut("Your file contains does not contain any sequences from " + taxons + "."); m->mothurOutEndLine();  }
                 outputNames.push_back(outputFileName); outputTypes["alignreport"].push_back(outputFileName);
                 
                 return 0;
diff --git a/getlineagecommand.h b/getlineagecommand.h

index 0ab042bac365b64840e32212a6705f62d46d5ff1..99bc0fa347313bb023e8bd1b41a5c3e2be5c93ed 100644 (file)
--- a/getlineagecommand.h
+++ b/getlineagecommand.h
@@ -36,11 +36,12 @@ class GetLineageCommand : public Command {
         private:
                 set<string> names;
                 vector<string> outputNames, listOfTaxons;
-               string fastafile, namefile, groupfile, alignfile, listfile, taxfile, outputDir, taxons;
+               string fastafile, namefile, groupfile, alignfile, countfile, listfile, taxfile, outputDir, taxons;
                 bool abort, dups;
                 
                 int readFasta();
                 int readName();
+        int readCount();
                 int readGroup();
                 int readAlign();
                 int readList();
diff --git a/getoturepcommand.cpp b/getoturepcommand.cpp

index 4967f245fb11c2ff37195a83baf39d29667fc582..9f4dd54491bab042ae243998de20559b03285dfb 100644 (file)
--- a/getoturepcommand.cpp
+++ b/getoturepcommand.cpp
@@ -41,9 +41,10 @@ vector<string> GetOTURepCommand::setParameters(){
         try {
                 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pfasta);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pphylip);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname);
+        CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "ColumnName",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "ColumnName",false,false); parameters.push_back(pcolumn);
                 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
@@ -68,9 +69,9 @@ vector<string> GetOTURepCommand::setParameters(){
  string GetOTURepCommand::getHelpString(){      
         try {
                 string helpString = "";
-               helpString += "The get.oturep command parameters are phylip, column, list, fasta, name, group, large, weighted, cutoff, precision, groups, sorted and label.  The list parameter is required, as well as phylip or column and name, unless you have valid current files.\n";
+               helpString += "The get.oturep command parameters are phylip, column, list, fasta, name, group, count, large, weighted, cutoff, precision, groups, sorted and label.  The list parameter is required, as well as phylip or column and name, unless you have valid current files.\n";
                 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
-               helpString += "The phylip or column parameter is required, but only one may be used.  If you use a column file the name filename is required. \n";
+               helpString += "The phylip or column parameter is required, but only one may be used.  If you use a column file the name or count filename is required. \n";
                 helpString += "If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n";
                 helpString += "The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n";
                 helpString += "Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n";
@@ -106,6 +107,7 @@ string GetOTURepCommand::getOutputFileNameTag(string type, string inputName=""){
          else {
              if (type == "fasta")            {   outputFileName =  "rep.fasta";   }
              else if (type == "name")        {   outputFileName =  "rep.names";   }
+            else if (type == "count")        {   outputFileName =  "rep.count_table";   }
              else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
          }
          return outputFileName;
@@ -123,6 +125,7 @@ GetOTURepCommand::GetOTURepCommand(){
                 vector<string> tempOutNames;
                 outputTypes["fasta"] = tempOutNames;
                 outputTypes["name"] = tempOutNames;
+        outputTypes["count"] = tempOutNames;
         }
         catch(exception& e) {
                 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
@@ -157,6 +160,7 @@ GetOTURepCommand::GetOTURepCommand(string option)  {
                         vector<string> tempOutNames;
                         outputTypes["fasta"] = tempOutNames;
                         outputTypes["name"] = tempOutNames;
+            outputTypes["count"] = tempOutNames;
                         
                         //if the user changes the input directory command factory will send this info to us in the output parameter 
                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
@@ -210,6 +214,14 @@ GetOTURepCommand::GetOTURepCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
  
                         
@@ -245,6 +257,24 @@ GetOTURepCommand::GetOTURepCommand(string option)  {
                         if (namefile == "not open") { abort = true; }   
                         else if (namefile == "not found") { namefile = ""; }
                         else { m->setNameFile(namefile); }
+            
+            hasGroups = false;
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not found") { countfile =  "";   }
+                       else if (countfile == "not open") { abort = true; countfile =  ""; }    
+                       else {   
+                m->setCountTableFile(countfile); 
+                ct.readTable(countfile);
+                if (ct.hasGroupInfo()) { hasGroups = true; }
+            }
+            
+            if ((namefile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
+            }
+            
+            if ((groupfile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
+            }
                         
                         if ((phylipfile == "") && (columnfile == "")) { //is there are current file available for either of these?
                                 //give priority to column, then phylip
@@ -261,14 +291,18 @@ GetOTURepCommand::GetOTURepCommand(string option)  {
                         }else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
                 
                         if (columnfile != "") {  
-                               if (namefile == "") {  
+                               if ((namefile == "") && (countfile == "")) { 
                                         namefile = m->getNameFile(); 
                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
                                         else { 
-                                               m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); 
-                                               abort = true; 
+                                               countfile = m->getCountTableFile();
+                        if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
+                        else { 
+                            m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine(); 
+                            abort = true; 
+                        }      
                                         }       
-                               } 
+                               }
                         }
  
                         //check for optional parameter and set defaults
@@ -292,15 +326,15 @@ GetOTURepCommand::GetOTURepCommand(string option)  {
                                 sorted = "";
                         }
                         
-                       if ((sorted == "group") && (groupfile == "")) {
-                               m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
+                       if ((sorted == "group") && ((groupfile == "")&& !hasGroups)) {
+                               m->mothurOut("You must provide a groupfile or have a count file with group info to sort by group. I will not sort."); m->mothurOutEndLine();
                                 sorted = "";
                         }
                         
                         groups = validParameter.validFile(parameters, "groups", false);                 
                         if (groups == "not found") { groups = ""; }
                         else { 
-                               if (groupfile == "") {
+                               if ((groupfile == "") && (!hasGroups)) {
                                         m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
                                         abort = true;
                                 }else { 
@@ -340,106 +374,9 @@ int GetOTURepCommand::execute(){
                 int error;
                 list = NULL;
                 
-               if (!large) {
-                       //read distance files
-                       if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }        
-                       else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
-                       else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
-                       
-                       readMatrix->setCutoff(cutoff);
-       
-                       if(namefile != ""){     
-                               nameMap = new NameAssignment(namefile);
-                               nameMap->readMap();
-                       }else{  nameMap = NULL;         }
-                       
-                       readMatrix->read(nameMap);
-                       
-                       if (m->control_pressed) { delete readMatrix; return 0; }
-
-                       list = readMatrix->getListVector();
-
-                       SparseDistanceMatrix* matrix = readMatrix->getDMatrix();
-                       
-                       // Create a data structure to quickly access the distance information.
-                       // It consists of a vector of distance maps, where each map contains
-                       // all distances of a certain sequence. Vector and maps are accessed
-                       // via the index of a sequence in the distance matrix
-                       seqVec = vector<SeqMap>(list->size()); 
-            for (int i = 0; i < matrix->seqVec.size(); i++) {
-                for (int j = 0; j < matrix->seqVec[i].size(); j++) {
-                    if (m->control_pressed) { delete readMatrix; return 0; }
-                    //already added everyone else in row
-                    if (i < matrix->seqVec[i][j].index) {  seqVec[i][matrix->seqVec[i][j].index] = matrix->seqVec[i][j].dist;  }
-                }
-                       }
-                       //add dummy map for unweighted calc
-                       SeqMap dummy;
-                       seqVec.push_back(dummy);
-                       
-                       delete matrix;
-                       delete readMatrix;
-                       delete nameMap;
-                       
-                       if (m->control_pressed) { return 0; }
-               }else {
-                       //process file and set up indexes
-                       if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }    
-                       else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
-                       else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
-                       
-                       formatMatrix->setCutoff(cutoff);
-       
-                       if(namefile != ""){     
-                               nameMap = new NameAssignment(namefile);
-                               nameMap->readMap();
-                       }else{  nameMap = NULL;         }
-                       
-                       formatMatrix->read(nameMap);
-                       
-                       if (m->control_pressed) { delete formatMatrix;  return 0; }
-
-                       list = formatMatrix->getListVector();
-                       
-                       distFile = formatMatrix->getFormattedFileName();
-                       
-                       //positions in file where the distances for each sequence begin
-                       //rowPositions[1] = position in file where distance related to sequence 1 start.
-                       rowPositions = formatMatrix->getRowPositions();
-                       rowPositions.push_back(-1); //dummy row for unweighted calc
-                       
-                       delete formatMatrix;
-                       delete nameMap;
-                       
-                       //openfile for getMap to use
-                       m->openInputFile(distFile, inRow);
-                       
-                       if (m->control_pressed) { inRow.close(); m->mothurRemove(distFile); return 0; }
-               }
-               
-               
-               //list bin 0 = first name read in distance matrix, list bin 1 = second name read in distance matrix
-               if (list != NULL) {
-                       vector<string> names;
-                       string binnames;
-                       //map names to rows in sparsematrix
-                       for (int i = 0; i < list->size(); i++) {
-                               names.clear();
-                               binnames = list->get(i);
-                               
-                               m->splitAtComma(binnames, names);
-                               
-                               for (int j = 0; j < names.size(); j++) {
-                                       nameToIndex[names[j]] = i;
-                               }
-                       }
-               } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
-               
+               readDist();             
                                 
-               if (m->control_pressed) { 
-                       if (large) {  inRow.close(); m->mothurRemove(distFile);  }
-                       return 0; 
-               }
+               if (m->control_pressed) { if (large) {  inRow.close(); m->mothurRemove(distFile);  } return 0; }
                 
                 if (groupfile != "") {
                         //read in group map info.
@@ -448,13 +385,18 @@ int GetOTURepCommand::execute(){
                         if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = "";  }
                         
                         if (Groups.size() != 0) {
-                               SharedUtil* util = new SharedUtil();
+                               SharedUtil util;
                                 vector<string> gNamesOfGroups = groupMap->getNamesOfGroups();
-                               util->setGroups(Groups, gNamesOfGroups, "getoturep");
+                               util.setGroups(Groups, gNamesOfGroups, "getoturep");
                                 groupMap->setNamesOfGroups(gNamesOfGroups);
-                               delete util;
                         }
-               }
+               }else if (hasGroups) {
+            if (Groups.size() != 0) {
+                               SharedUtil util;
+                               vector<string> gNamesOfGroups = ct.getNamesOfGroups();
+                               util.setGroups(Groups, gNamesOfGroups, "getoturep");
+                       }
+        }
                 
                 //done with listvector from matrix
                 if (list != NULL) { delete list; }
@@ -595,6 +537,11 @@ int GetOTURepCommand::execute(){
                 if (itTypes != outputTypes.end()) {
                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
                 }
+        
+        itTypes = outputTypes.find("count");
+               if (itTypes != outputTypes.end()) {
+                       if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
+               }
                 
                 m->mothurOutEndLine();
                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
@@ -608,7 +555,116 @@ int GetOTURepCommand::execute(){
                 exit(1);
         }
  }
+//**********************************************************************************************************************
+int GetOTURepCommand::readDist() {
+       try {
+        //Loads distFile into list and nameToIndex, plus seqVec (!large) or rowPositions/inRow (large). Every path returns 0, so callers detect an interruption through m->control_pressed.
+        if (!large) {
+                       //read distance files
+                       if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }
+                       else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
+                       else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
+                       
+                       readMatrix->setCutoff(cutoff);
+            
+                       NameAssignment* nameMap = NULL;
+            if(namefile != ""){
+                nameMap = new NameAssignment(namefile);
+                nameMap->readMap();
+                readMatrix->read(nameMap);
+            }else if (countfile != "") {
+                readMatrix->read(&ct);
+            }else { readMatrix->read(nameMap); } //no name or count file: still read the matrix with a NULL map, as the code did before the count option existed
+                       
+                       if (m->control_pressed) { delete readMatrix; delete nameMap; return 0; } //delete on NULL is a no-op, so this is safe when no name file was read
+            
+                       list = readMatrix->getListVector();
+                       SparseDistanceMatrix* matrix = readMatrix->getDMatrix();
+                       
+                       // Create a data structure to quickly access the distance information.
+                       // It consists of a vector of distance maps, where each map contains
+                       // all distances of a certain sequence. Vector and maps are accessed
+                       // via the index of a sequence in the distance matrix
+                       seqVec = vector<SeqMap>(list->size()); 
+            for (int i = 0; i < matrix->seqVec.size(); i++) {
+                for (int j = 0; j < matrix->seqVec[i].size(); j++) {
+                    if (m->control_pressed) { delete readMatrix; delete nameMap; return 0; }
+                    //already added everyone else in row
+                    if (i < matrix->seqVec[i][j].index) {  seqVec[i][matrix->seqVec[i][j].index] = matrix->seqVec[i][j].dist;  }
+                }
+                       }
+                       //add dummy map for unweighted calc
+                       SeqMap dummy;
+                       seqVec.push_back(dummy);
+                       
+                       delete matrix;
+                       delete readMatrix;
+                       delete nameMap;
+                       
+                       if (m->control_pressed) { return 0; }
+               }else {
+                       //process file and set up indexes
+                       if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }
+                       else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
+                       else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
+                       
+                       formatMatrix->setCutoff(cutoff);
+            //the large path must read through formatMatrix; readMatrix is not assigned anywhere in this branch
+                       NameAssignment* nameMap = NULL;
+            if(namefile != ""){
+                nameMap = new NameAssignment(namefile);
+                nameMap->readMap();
+                formatMatrix->read(nameMap);
+            }else if (countfile != "") {
+                formatMatrix->read(&ct); //NOTE(review): assumes FormatMatrix declares read(CountTable*) as ReadMatrix does - confirm
+            }else { formatMatrix->read(nameMap); } //no name or count file: read with a NULL map, as the code did before the count option existed
+                       
+                       if (m->control_pressed) { delete formatMatrix; delete nameMap;  return 0; }
+            
+                       list = formatMatrix->getListVector();
+                       distFile = formatMatrix->getFormattedFileName();
+                       
+                       //positions in file where the distances for each sequence begin
+                       //rowPositions[1] = position in file where distance related to sequence 1 start.
+                       rowPositions = formatMatrix->getRowPositions();
+                       rowPositions.push_back(-1); //dummy row for unweighted calc
+                       
+                       delete formatMatrix;
+                       delete nameMap;
+                       
+                       //openfile for getMap to use
+                       m->openInputFile(distFile, inRow);
+                       
+                       if (m->control_pressed) { inRow.close(); m->mothurRemove(distFile); return 0; }
+               }
+               
+               
+               //list bin 0 = first name read in distance matrix, list bin 1 = second name read in distance matrix
+               if (list != NULL) {
+                       vector<string> names;
+                       string binnames;
+                       //map names to rows in sparsematrix
+                       for (int i = 0; i < list->size(); i++) {
+                               names.clear();
+                               binnames = list->get(i);
+                               
+                               m->splitAtComma(binnames, names);
+                               
+                               for (int j = 0; j < names.size(); j++) {
+                                       nameToIndex[names[j]] = i;
+                               }
+                       }
+               } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
  
+        if (m->control_pressed) { if (large) {  inRow.close(); m->mothurRemove(distFile);  }return 0; }
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "GetOTURepCommand", "readDist");
+               exit(1);
+       }
+}
  //**********************************************************************************************************************
  void GetOTURepCommand::readNamesFile() {
         try {
@@ -677,32 +733,38 @@ void GetOTURepCommand::readNamesFile(bool w) {
         }
  }
  //**********************************************************************************************************************
-string GetOTURepCommand::findRep(vector<string> names) {
+string GetOTURepCommand::findRep(vector<string> names, string group) {
         try{
                 // if only 1 sequence in bin or processing the "unique" label, then 
                 // the first sequence of the OTU is the representative one
                 if ((names.size() == 1)) {
                         return names[0];
                 }else{
-                       vector<int> seqIndex(names.size());
-                       vector<float> max_dist(names.size());
-                       vector<float> total_dist(names.size());
+                       vector<int> seqIndex; //(names.size());
                         map<string, string>::iterator itNameFile;
                         map<string, int>::iterator itNameIndex;
  
                         //fill seqIndex and initialize sums
                         for (size_t i = 0; i < names.size(); i++) {
                                 if (weighted) {
-                                       seqIndex[i] = nameToIndex[names[i]];
+                                       seqIndex.push_back(nameToIndex[names[i]]);
+                    if (countfile != "") {  //if countfile is not blank then we can assume the list file contains only uniques, otherwise we assume list file contains everyone.
+                        int numRep = 0;
+                        if (group != "") {  numRep = ct.getGroupCount(names[i], group);  }
+                        else { numRep = ct.getGroupCount(names[i]);  }
+                        for (int j = 1; j < numRep; j++) { //don't add yourself again
+                            seqIndex.push_back(nameToIndex[names[i]]);
+                        }
+                    }
                                 }else { 
                                         if (namefile == "") {
                                                 itNameIndex = nameToIndex.find(names[i]);
                                                 
                                                 if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique
-                                                       if (large) {  seqIndex[i] = (rowPositions.size()-1); }
-                                                       else {  seqIndex[i] = (seqVec.size()-1); }
+                                                       if (large) {  seqIndex.push_back((rowPositions.size()-1)); }
+                                                       else {  seqIndex.push_back((seqVec.size()-1)); }
                                                 }else {
-                                                       seqIndex[i] = itNameIndex->second;
+                                                       seqIndex.push_back(itNameIndex->second);
                                                 }
                                                 
                                         }else {
@@ -715,17 +777,18 @@ string GetOTURepCommand::findRep(vector<string> names) {
                                                         string name2 = itNameFile->second;
                                                         
                                                         if (name1 == name2) { //then you are unique so add your real dists
-                                                               seqIndex[i] = nameToIndex[names[i]];
+                                                               seqIndex.push_back(nameToIndex[names[i]]);
                                                         }else { //add dummy
-                                                               if (large) {  seqIndex[i] = (rowPositions.size()-1); }
-                                                               else {  seqIndex[i] = (seqVec.size()-1); }
+                                                               if (large) {  seqIndex.push_back((rowPositions.size()-1)); }
+                                                               else {  seqIndex.push_back((seqVec.size()-1)); }
                                                         }
                                                 }
                                         }
                                 }
-                               max_dist[i] = 0.0;
-                               total_dist[i] = 0.0;
                         }
+            
+            vector<float> max_dist(seqIndex.size(), 0.0);
+                       vector<float> total_dist(seqIndex.size(), 0.0);
                         
                         // loop through all entries in seqIndex
                         SeqMap::iterator it;
@@ -795,19 +858,33 @@ int GetOTURepCommand::process(ListVector* processList) {
                 map<string, ofstream*> filehandles;
                 
                 if (Groups.size() == 0) { //you don't want to use groups
-                       outputNamesFile  = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + getOutputFileNameTag("name");
-                       m->openOutputFile(outputNamesFile, newNamesOutput);
-                       outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile); 
+                       outputNamesFile  = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".";
+            if (countfile == "") { 
+                outputNamesFile += getOutputFileNameTag("name");
+                outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile); 
+            }else {
+                outputNamesFile += getOutputFileNameTag("count");
+                outputNames.push_back(outputNamesFile); outputTypes["count"].push_back(outputNamesFile); 
+            }
                         outputNameFiles[outputNamesFile] = processList->getLabel();
+            m->openOutputFile(outputNamesFile, newNamesOutput);
+            newNamesOutput << "noGroup" << endl;
                 }else{ //you want to use groups
                         ofstream* temp;
                         for (int i=0; i<Groups.size(); i++) {
                                 temp = new ofstream;
                                 filehandles[Groups[i]] = temp;
-                               outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + "." + getOutputFileNameTag("name");
+                               outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".";
+                if (countfile == "") { 
+                    outputNamesFile += getOutputFileNameTag("name");
+                    outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile); 
+                }else {
+                    outputNamesFile += getOutputFileNameTag("count");
+                    outputNames.push_back(outputNamesFile); outputTypes["count"].push_back(outputNamesFile); 
+                }
                                 
                                 m->openOutputFile(outputNamesFile, *(temp));
-                               outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
+                *(temp) << Groups[i] << endl;
                                 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
                         }
                 }
@@ -832,7 +909,7 @@ int GetOTURepCommand::process(ListVector* processList) {
                         m->splitAtComma(temp, namesInBin);
                         
                         if (Groups.size() == 0) {
-                               nameRep = findRep(namesInBin);
+                               nameRep = findRep(namesInBin, "");
                                 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
                         }else{
                                 map<string, vector<string> > NamesInGroup;
@@ -841,20 +918,25 @@ int GetOTURepCommand::process(ListVector* processList) {
                                 }
                                 
                                 for (int j=0; j<namesInBin.size(); j++) {
-                                       string thisgroup = groupMap->getGroup(namesInBin[j]);
-                                       
-                                       if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
-                                       
-                                       if (m->inUsersGroups(thisgroup, Groups)) { //add this name to correct group
-                                               NamesInGroup[thisgroup].push_back(namesInBin[j]);
-                                       }
+                    if (groupfile != "") {
+                        string thisgroup = groupMap->getGroup(namesInBin[j]);
+                        if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
+                        
+                        //add this name to correct group
+                        if (m->inUsersGroups(thisgroup, Groups)) { NamesInGroup[thisgroup].push_back(namesInBin[j]);  }
+                    }else {
+                        vector<string> thisSeqsGroups = ct.getGroups(namesInBin[j]);
+                        for (int k = 0; k < thisSeqsGroups.size(); k++) {
+                            if (m->inUsersGroups(thisSeqsGroups[k], Groups)) { NamesInGroup[thisSeqsGroups[k]].push_back(namesInBin[j]);  }
+                        }
+                    }
                                 }
                                 
                                 //get rep for each group in otu
                                 for (int j=0; j<Groups.size(); j++) {
                                         if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
                                                 //get rep for each group
-                                               nameRep = findRep(NamesInGroup[Groups[j]]);
+                                               nameRep = findRep(NamesInGroup[Groups[j]], Groups[j]);
                                                 
                                                 //output group rep and other members of this group
                                                 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
@@ -900,28 +982,51 @@ int GetOTURepCommand::processFastaNames(string filename, string label) {
                 ofstream out2;
                 string tempNameFile = filename + ".temp";
                 m->openOutputFile(tempNameFile, out2);
-               
+            
                 ifstream in;
                 m->openInputFile(filename, in);
                 
                 int i = 0;
+        string tempGroup = "";
+        in >> tempGroup; m->gobble(in);
+        
+        CountTable thisCt;
+        if (countfile != "") {
+            thisCt.readTable(countfile);
+            if (tempGroup != "noGroup") { out2 << "Representative_Sequence\ttotal\t" << tempGroup << endl; }
+        }
+    
+        int thistotal = 0;
                 while (!in.eof()) {
                         string rep, binnames;
                         in >> i >> rep >> binnames; m->gobble(in);
-                       out2 << rep << '\t' << binnames << endl;
                         
                         vector<string> names;
                         m->splitAtComma(binnames, names);
                         int binsize = names.size();
-                       
+            
+            if (countfile == "") { out2 << rep << '\t' << binnames << endl; }
+            else {
+                if (tempGroup == "noGroup") {
+                    for (int j = 0; j < names.size(); j++) {
+                        if (names[j] != rep) { thisCt.mergeCounts(rep, names[j]); }
+                    }
+                    binsize = thisCt.getNumSeqs(rep);
+                }else {
+                    int total = 0; 
+                    for (int j = 0; j < names.size(); j++) {  total += thisCt.getGroupCount(names[j], tempGroup);  }
+                    out2 << rep << '\t' << total << '\t' << total << endl;
+                    binsize = total;
+                }
+            }
+                       thistotal += binsize;
                         //if you have a groupfile
                         string group = "";
+            map<string, string> groups;
+            map<string, string>::iterator groupIt;
                         if (groupfile != "") {
-                               map<string, string> groups;
-                               map<string, string>::iterator groupIt;
-                               
                                 //find the groups that are in this bin
-                               for (size_t i = 0; i < names.size(); i++) {
+                               for (int i = 0; i < names.size(); i++) {
                                         string groupName = groupMap->getGroup(names[i]);
                                         if (groupName == "not found") {  
                                                 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
@@ -937,7 +1042,21 @@ int GetOTURepCommand::processFastaNames(string filename, string label) {
                                 }
                                 //rip off last dash
                                 group = group.substr(0, group.length()-1);
-                       }else{ group = ""; }
+                       }else if (hasGroups) {
+                map<string, string> groups;
+                for (int i = 0; i < names.size(); i++) {
+                    vector<string> thisSeqsGroups = ct.getGroups(names[i]);
+                    for (int j = 0; j < thisSeqsGroups.size(); j++) { groups[thisSeqsGroups[j]] = thisSeqsGroups[j]; }
+                }
+                //turn the groups into a string
+                               for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
+                                       group += groupIt->first + "-";
+                               }
+                               //rip off last dash
+                               group = group.substr(0, group.length()-1);
+                //cout << group << endl;
+            }
+            else{ group = ""; }
  
                         
                         //print out name and sequence for that bin
@@ -947,7 +1066,7 @@ int GetOTURepCommand::processFastaNames(string filename, string label) {
                                 if (sorted == "") { //print them out
                                         rep = rep + "\t" + toString(i+1);
                                         rep = rep + "|" + toString(binsize);
-                                       if (groupfile != "") {
+                                       if (group != "") {
                                                 rep = rep + "|" + group;
                                         }
                                         out << ">" << rep << endl;
@@ -973,7 +1092,7 @@ int GetOTURepCommand::processFastaNames(string filename, string label) {
                                 string sequence = fasta->getSequence(reps[i].name);
                                 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
                                 outputName = outputName + "|" + toString(reps[i].size);
-                               if (groupfile != "") {
+                               if (reps[i].group != "") {
                                         outputName = outputName + "|" + reps[i].group;
                                 }
                                 out << ">" << outputName << endl;
@@ -984,9 +1103,11 @@ int GetOTURepCommand::processFastaNames(string filename, string label) {
                 in.close();
                 out.close();
                 out2.close();
-               
+               
                 m->mothurRemove(filename);
                 rename(tempNameFile.c_str(), filename.c_str());
+        
+        if ((countfile != "") && (tempGroup == "noGroup")) { thisCt.printTable(filename); } 
                 
                 return 0;
  
@@ -1012,10 +1133,35 @@ int GetOTURepCommand::processNames(string filename, string label) {
                 
                 int i = 0;
                 string rep, binnames;
+        
+        string tempGroup = "";
+        in >> tempGroup; m->gobble(in);
+        
+        CountTable thisCt;
+        if (countfile != "") {
+            thisCt.readTable(countfile);
+            if (tempGroup != "noGroup") { out2 << "Representative_Sequence\ttotal\t" << tempGroup << endl; }
+        }
+        
                 while (!in.eof()) {
                         if (m->control_pressed) { break; }
                         in >> i >> rep >> binnames; m->gobble(in);
-                       out2 << rep << '\t' << binnames << endl;
+            
+                       if (countfile == "") { out2 << rep << '\t' << binnames << endl; }
+            else {
+                vector<string> names;
+                m->splitAtComma(binnames, names);
+                if (tempGroup == "noGroup") {
+                    for (int j = 0; j < names.size(); j++) {
+                        if (names[j] != rep) { thisCt.mergeCounts(rep, names[j]); }
+                    }
+                }else {
+                    int total = 0; 
+                    for (int j = 0; j < names.size(); j++) {  total += thisCt.getGroupCount(names[j], tempGroup);  }
+                    out2 << rep << '\t' << total << '\t' << total << endl;
+                }
+            }
+
                 }
                 in.close();
                 out2.close();
@@ -1023,6 +1169,8 @@ int GetOTURepCommand::processNames(string filename, string label) {
                 m->mothurRemove(filename);
                 rename(tempNameFile.c_str(), filename.c_str());
                 
+        if ((countfile != "") && (tempGroup == "noGroup")) { thisCt.printTable(filename); } 
+        
                 return 0;
         }
         catch(exception& e) {
diff --git a/getoturepcommand.h b/getoturepcommand.h

index d19a396410f2e2c9850f9b20a93d65c03000bac2..390632975afc85aac5b714c7194c81e74d58e16e 100644 (file)
--- a/getoturepcommand.h
+++ b/getoturepcommand.h
@@ -18,6 +18,7 @@
  #include "groupmap.h"
  #include "readmatrix.hpp"
  #include "formatmatrix.h"
+#include "counttable.h"
  
  typedef map<int, float> SeqMap;
  
@@ -60,10 +61,11 @@ private:
         ReadMatrix* readMatrix;
         FormatMatrix* formatMatrix;
         NameAssignment* nameMap;
-       string filename, fastafile, listfile, namefile, groupfile, label, sorted, phylipfile, columnfile, distFile, format, outputDir, groups;
+    CountTable ct;
+       string filename, fastafile, listfile, namefile, groupfile, label, sorted, phylipfile, countfile, columnfile, distFile, format, outputDir, groups;
         ofstream out;
         ifstream in, inNames, inRow;
-       bool abort, allLines, groupError, large, weighted;
+       bool abort, allLines, groupError, large, weighted, hasGroups;
         set<string> labels; //holds labels to be used
         map<string, int> nameToIndex;  //maps sequence name to index in sparsematrix
         map<string, string> nameFileMap;
@@ -79,9 +81,10 @@ private:
         void readNamesFile(bool);
         int process(ListVector*);
         SeqMap getMap(int);
-       string findRep(vector<string>);         // returns the name of the "representative" sequence of given bin or subset of a bin, for groups
+       string findRep(vector<string>, string);         // returns the name of the "representative" sequence of given bin or subset of a bin, for groups
         int processNames(string, string);
         int processFastaNames(string, string);
+    int readDist();
  };
  
  #endif
diff --git a/getseqscommand.cpp b/getseqscommand.cpp

index ccabafb6d19c008eb6957a35e79b5cd0623021d0..6b16111fdcda83099c0e36396543d40ee345c865 100644 (file)
--- a/getseqscommand.cpp
+++ b/getseqscommand.cpp
@@ -10,13 +10,15 @@
  #include "getseqscommand.h"
  #include "sequence.hpp"
  #include "listvector.hpp"
+#include "counttable.h"
  
  //**********************************************************************************************************************
  vector<string> GetSeqsCommand::setParameters(){        
         try {
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pfasta);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pname);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pgroup);
+        CommandParameter pname("name", "InputTypes", "", "", "NameCount", "FNGLT", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "FNGLT", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "FNGLT", "none",false,false); parameters.push_back(pgroup);
                 CommandParameter plist("list", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(plist);
                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(ptaxonomy);
                 CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(palignreport);
@@ -40,7 +42,7 @@ vector<string> GetSeqsCommand::setParameters(){
  string GetSeqsCommand::getHelpString(){        
         try {
                 string helpString = "";
-               helpString += "The get.seqs command reads an .accnos file and any of the following file types: fasta, name, group, list, taxonomy, quality or alignreport file.\n";
+               helpString += "The get.seqs command reads an .accnos file and any of the following file types: fasta, name, group, count, list, taxonomy, quality or alignreport file.\n";
                 helpString += "It outputs a file containing only the sequences in the .accnos file.\n";
                 helpString += "The get.seqs command parameters are accnos, fasta, name, group, list, taxonomy, qfile, alignreport and dups.  You must provide accnos unless you have a valid current accnos file, and at least one of the other parameters.\n";
                 helpString += "The dups parameter allows you to add the entire line from a name file if you add any name from the line. default=false. \n";
@@ -68,6 +70,7 @@ GetSeqsCommand::GetSeqsCommand(){
                 outputTypes["alignreport"] = tempOutNames;
                 outputTypes["list"] = tempOutNames;
                 outputTypes["qfile"] = tempOutNames;
+        outputTypes["count"] = tempOutNames;
                 outputTypes["accnosreport"] = tempOutNames;
         }
         catch(exception& e) {
@@ -88,6 +91,7 @@ string GetSeqsCommand::getOutputFileNameTag(string type, string inputName=""){
              if (type == "fasta")            {   outputFileName =  "pick" + m->getExtension(inputName);   }
              else if (type == "taxonomy")    {   outputFileName =  "pick" + m->getExtension(inputName);   }
              else if (type == "name")        {   outputFileName =  "pick" + m->getExtension(inputName);   }
+            else if (type == "count")       {   outputFileName =  "pick.count_table";   }
              else if (type == "group")       {   outputFileName =  "pick" + m->getExtension(inputName);   }
              else if (type == "list")        {   outputFileName =  "pick" + m->getExtension(inputName);   }
              else if (type == "qfile")       {   outputFileName =  "pick" + m->getExtension(inputName);   }
@@ -135,6 +139,7 @@ GetSeqsCommand::GetSeqsCommand(string option)  {
                         outputTypes["list"] = tempOutNames;
                         outputTypes["qfile"] = tempOutNames;
                         outputTypes["accnosreport"] = tempOutNames;
+            outputTypes["count"] = tempOutNames;
                         
                         //if the user changes the output directory command factory will send this info to us in the output parameter 
                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
@@ -215,6 +220,14 @@ GetSeqsCommand::GetSeqsCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a count file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
  
                         
@@ -270,17 +283,32 @@ GetSeqsCommand::GetSeqsCommand(string option)  {
                         if (accnosfile2 == "not open") { abort = true; }
                         else if (accnosfile2 == "not found") {  accnosfile2 = "";  }
                         
+            countfile = validParameter.validFile(parameters, "count", true);
+            if (countfile == "not open") { countfile = ""; abort = true; }
+            else if (countfile == "not found") { countfile = "";  }    
+            else { m->setCountTableFile(countfile); }
+            
+            if ((namefile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
+            }
+            
+            if ((groupfile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
+            }
+
                         
                         string usedDups = "true";
                         string temp = validParameter.validFile(parameters, "dups", false);      if (temp == "not found") { temp = "true"; usedDups = ""; }
                         dups = m->isTrue(temp);
                         
-                       if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (qualfile == "") && (accnosfile2 == ""))  { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy, quality or listfile."); m->mothurOutEndLine(); abort = true; }
-               
-                       if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){
-                               vector<string> files; files.push_back(fastafile); files.push_back(taxfile);
-                               parser.getNameFile(files);
-                       }
+                       if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (qualfile == "") && (accnosfile2 == "") && (countfile == ""))  { m->mothurOut("You must provide one of the following: fasta, name, group, count, alignreport, taxonomy, quality or listfile."); m->mothurOutEndLine(); abort = true; }
+            
+            if (countfile == "") {
+                if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){
+                    vector<string> files; files.push_back(fastafile); files.push_back(taxfile);
+                    parser.getNameFile(files);
+                }
+            }
                 }
  
         }
@@ -300,11 +328,18 @@ int GetSeqsCommand::execute(){
                 names = m->readAccnos(accnosfile);
                 
                 if (m->control_pressed) { return 0; }
+        
+        if (countfile != "") {
+            if ((fastafile != "") || (listfile != "") || (taxfile != "")) { 
+                m->mothurOut("\n[NOTE]: The count file should contain only unique names, so mothur assumes your fasta, list and taxonomy files also contain only uniques.\n\n");
+            }
+        }
                 
                 //read through the correct file and output lines you want to keep
                 if (namefile != "")                     {               readName();                     }
                 if (fastafile != "")            {               readFasta();            }
                 if (groupfile != "")            {               readGroup();            }
+        if (countfile != "")           {               readCount();            }
                 if (alignfile != "")            {               readAlign();            }
                 if (listfile != "")                     {               readList();                     }
                 if (taxfile != "")                      {               readTax();                      }
@@ -354,6 +389,10 @@ int GetSeqsCommand::execute(){
                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
                         }
                         
+            itTypes = outputTypes.find("count");
+                       if (itTypes != outputTypes.end()) {
+                               if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
+                       }
                 }
                 
                 return 0;               
@@ -493,6 +532,64 @@ int GetSeqsCommand::readQual(){
                 exit(1);
         }
  }
+//**********************************************************************************************************************
+// Filters the count file down to the rows whose first column (sequence name) is
+// in the member set `names`, and writes them to a new "pick.count_table" file.
+//
+// - The first line of the count file is treated as a header and copied verbatim.
+// - Every following row is read as: name, integer total, then the remainder of the line.
+// - The output file is registered in outputTypes["count"] and outputNames.
+//
+// Returns 0 in every non-exceptional case, including when the run is interrupted
+// via m->control_pressed (the partial output file is removed first). A caught
+// std::exception is reported through m->errorOut and the process exits with status 1.
+int GetSeqsCommand::readCount(){
+       try {
+               // with no outputdir set, prefix the output name with whatever m->hasPath(countfile) returns
+               string thisOutputDir = outputDir;
+               if (outputDir == "") {  thisOutputDir += m->hasPath(countfile);  }
+               string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile);
+               
+               ofstream out;
+               m->openOutputFile(outputFileName, out);
+               
+               ifstream in;
+               m->openInputFile(countfile, in);
+               
+               bool wroteSomething = false;
+               int selectedCount = 0;  //sum of the "total" column over the kept rows, not the number of rows
+               
+        //copy the header line through unchanged
+        string headers = m->getline(in); m->gobble(in);
+        out << headers << endl;
+        
+        string name, rest; int thisTotal;
+        while (!in.eof()) {
+            
+            //on interrupt, close both streams and delete the incomplete output
+            if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
+            
+            //NOTE(review): if m->gobble also skips newlines, a row holding only name and total
+            //would make the getline below consume the next row as `rest` -- confirm that rows
+            //always carry at least one column after the total, or that gobble stops at end of line.
+            in >> name; m->gobble(in); 
+            in >> thisTotal; m->gobble(in);
+            rest = m->getline(in); m->gobble(in);
+            if (m->debug) { m->mothurOut("[DEBUG]: " + name + '\t' + rest + "\n"); }
+            
+            //keep only the rows whose name is in `names`
+            if (names.count(name) != 0) {
+                out << name << '\t' << thisTotal << '\t' << rest << endl;
+                wroteSomething = true;
+                selectedCount+= thisTotal;
+            }
+        }
+        in.close();
+               out.close();
+        
+        //check for groups that have been eliminated
+        //when testGroups reports true, the file just written is read back and re-printed in place
+        CountTable ct;
+        if (ct.testGroups(outputFileName)) {
+            ct.readTable(outputFileName);
+            ct.printTable(outputFileName);
+        }
+               
+               //the output file is still registered below even when no rows matched
+               if (wroteSomething == false) {  m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine();  }
+               outputTypes["count"].push_back(outputFileName); outputNames.push_back(outputFileName);
+               
+               m->mothurOut("Selected " + toString(selectedCount) + " sequences from your count file."); m->mothurOutEndLine();
+        
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "GetSeqsCommand", "readCount");
+               exit(1);
+       }
+}
+
  //**********************************************************************************************************************
  int GetSeqsCommand::readList(){
         try {
@@ -528,19 +625,16 @@ int GetSeqsCommand::readList(){
                         
                                 //parse out names that are in accnos file
                                 string binnames = list.get(i);
+                vector<string> bnames;
+                m->splitAtComma(binnames, bnames);
                                 
                                 string newNames = "";
-                               while (binnames.find_first_of(',') != -1) { 
-                                       string name = binnames.substr(0,binnames.find_first_of(','));
-                                       binnames = binnames.substr(binnames.find_first_of(',')+1, binnames.length());
-                                       
+                for (int i = 0; i < bnames.size(); i++) {
+                                       string name = bnames[i];
                                         //if that name is in the .accnos file, add it
                                         if (names.count(name) != 0) {  newNames += name + ",";  selectedCount++; if (m->debug) { sanity["list"].insert(name); } }
                                 }
                         
-                               //get last name
-                               if (names.count(binnames) != 0) {  newNames += binnames + ",";  selectedCount++;  if (m->debug) { sanity["list"].insert(binnames); } }
-
                                 //if there are names in this bin add to new list
                                 if (newNames != "") { 
                                         newNames = newNames.substr(0, newNames.length()-1); //rip off extra comma
diff --git a/getseqscommand.h b/getseqscommand.h

index c71b5f2804a71190c6919f0dc80bf80d32f15508..60e471eebf7865d5924406ef7c720c3bdcdf4648 100644 (file)
--- a/getseqscommand.h
+++ b/getseqscommand.h
@@ -35,7 +35,7 @@ class GetSeqsCommand : public Command {
         private:
                 set<string> names;
                 vector<string> outputNames;
-               string accnosfile, accnosfile2, fastafile, namefile, groupfile, alignfile, listfile, taxfile, qualfile, outputDir;
+               string accnosfile, accnosfile2, fastafile, namefile, countfile, groupfile, alignfile, listfile, taxfile, qualfile, outputDir;
                 bool abort, dups;
      
          //for debug
@@ -44,6 +44,7 @@ class GetSeqsCommand : public Command {
                 int readFasta();
                 int readName();
                 int readGroup();
+        int readCount();
                 int readAlign();
                 int readList();
                 int readTax();
diff --git a/groupmap.cpp b/groupmap.cpp

index 612b2364d617432d64819413b8d8eb03d9865a64..fb2495c13fe8c09095221fbff3fb79c29c9d129a 100644 (file)
--- a/groupmap.cpp
+++ b/groupmap.cpp
@@ -44,6 +44,7 @@ int GroupMap::readMap() {
                  if (pairDone) { 
                      setNamesOfGroups(seqGroup);
                      
+                    if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); }
                      it = groupmap.find(seqName);
                      
                      if (it != groupmap.end()) { error = 1; m->mothurOut("Your groupfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine();  }
@@ -57,6 +58,30 @@ int GroupMap::readMap() {
          }
                 fileHandle.close();
          
+        if (rest != "") {
+            vector<string> pieces = m->splitWhiteSpace(rest);
+            
+            for (int i = 0; i < pieces.size(); i++) {
+                if (columnOne) {  seqName = pieces[i]; columnOne=false; }
+                else  { seqGroup = pieces[i]; pairDone = true; columnOne=true; }
+                
+                if (pairDone) { 
+                    setNamesOfGroups(seqGroup);
+                    
+                    if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); }
+                    
+                    it = groupmap.find(seqName);
+                    
+                    if (it != groupmap.end()) { error = 1; m->mothurOut("Your groupfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine();  }
+                    else {
+                        groupmap[seqName] = seqGroup;  //store data in map
+                        seqsPerGroup[seqGroup]++;  //increment number of seqs in that group
+                    }
+                    pairDone = false; 
+                } 
+            }
+        }
+        
                 m->setAllGroups(namesOfGroups);
                 return error;
      }
@@ -88,6 +113,8 @@ int GroupMap::readDesignMap() {
                  if (pairDone) { 
                      setNamesOfGroups(seqGroup);
                      
+                    if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); }
+                    
                      it = groupmap.find(seqName);
                      
                      if (it != groupmap.end()) { error = 1; m->mothurOut("Your designfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine();  }
@@ -101,6 +128,31 @@ int GroupMap::readDesignMap() {
          }
                 fileHandle.close();
          
+        if (rest != "") {
+            vector<string> pieces = m->splitWhiteSpace(rest);
+            
+            for (int i = 0; i < pieces.size(); i++) {
+                if (columnOne) {  seqName = pieces[i]; columnOne=false; }
+                else  { seqGroup = pieces[i]; pairDone = true; columnOne=true; }
+                
+                if (pairDone) { 
+                    setNamesOfGroups(seqGroup);
+                    
+                    if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); }
+                    
+                    it = groupmap.find(seqName);
+                    
+                    if (it != groupmap.end()) { error = 1; m->mothurOut("Your designfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine();  }
+                    else {
+                        groupmap[seqName] = seqGroup;  //store data in map
+                        seqsPerGroup[seqGroup]++;  //increment number of seqs in that group
+                    }
+                    pairDone = false; 
+                } 
+            }
+
+        }
+        
                 m->setAllGroups(namesOfGroups);
                 return error;
      }
@@ -110,6 +162,79 @@ int GroupMap::readDesignMap() {
         }
  }
  /************************************************************/
+// Reads name/group pairs from `filename` into groupmap and seqsPerGroup.
+// Tokens alternate: the first token of each pair is stored as the sequence name, the second as its group.
+// Returns 1 if m->control_pressed is seen while reading, or if any sequence name occurs more than once; otherwise 0.
+// Any std::exception is reported through m->errorOut and the process exits.
+int GroupMap::readMap(string filename) {
+    try {
+        groupFileName = filename;
+        m->openInputFile(filename, fileHandle);
+        index = 0;
+        string seqName, seqGroup;
+               int error = 0;
+        // handed to splitWhiteSpace on every read; presumably carries a partial trailing token
+        // from one buffer to the next -- TODO confirm against splitWhiteSpace
+        string rest = "";
+        char buffer[4096];
+        // pairing state is declared outside the read loop, so a name/group pair may straddle two buffers
+        bool pairDone = false;
+        bool columnOne = true;
+        
+        while (!fileHandle.eof()) {
+            if (m->control_pressed) { fileHandle.close();  return 1; }
+            
+            // read up to 4096 bytes; gcount() passes along how many were actually read
+            fileHandle.read(buffer, 4096);
+            vector<string> pieces = m->splitWhiteSpace(rest, buffer, fileHandle.gcount());
+            
+            for (int i = 0; i < pieces.size(); i++) {
+                if (columnOne) {  seqName = pieces[i]; columnOne=false; }
+                else  { seqGroup = pieces[i]; pairDone = true; columnOne=true; }
+                
+                if (pairDone) { 
+                    // called before the duplicate check, so it also runs for rejected duplicates
+                    setNamesOfGroups(seqGroup);
+                    
+                    if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); }
+                    
+                    it = groupmap.find(seqName);
+                    
+                    // a repeated name keeps its first group; the error flag is raised but reading continues
+                    if (it != groupmap.end()) { error = 1; m->mothurOut("Your group file contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine();  }
+                    else {
+                        groupmap[seqName] = seqGroup;  //store data in map
+                        seqsPerGroup[seqGroup]++;  //increment number of seqs in that group
+                    }
+                    pairDone = false; 
+                } 
+            }
+        }
+               fileHandle.close();
+        
+        // whatever is still held in `rest` after the last read goes through the same pairing logic
+        if (rest != "") {
+            vector<string> pieces = m->splitWhiteSpace(rest);
+            
+            for (int i = 0; i < pieces.size(); i++) {
+                if (columnOne) {  seqName = pieces[i]; columnOne=false; }
+                else  { seqGroup = pieces[i]; pairDone = true; columnOne=true; }
+                
+                if (pairDone) { 
+                    setNamesOfGroups(seqGroup);
+                    
+                    if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); }
+                    
+                    it = groupmap.find(seqName);
+                    
+                    if (it != groupmap.end()) { error = 1; m->mothurOut("Your group file contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine();  }
+                    else {
+                        groupmap[seqName] = seqGroup;  //store data in map
+                        seqsPerGroup[seqGroup]++;  //increment number of seqs in that group
+                    }
+                    pairDone = false; 
+                } 
+            }
+        }
+        
+        // NOTE(review): a trailing name with no group token (columnOne == false here) is dropped without a message
+               m->setAllGroups(namesOfGroups);
+               return error;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "GroupMap", "readMap");
+               exit(1);
+       }
+}
+/************************************************************/
  int GroupMap::readDesignMap(string filename) {
      try {
          groupFileName = filename;
@@ -135,6 +260,8 @@ int GroupMap::readDesignMap(string filename) {
                  if (pairDone) { 
                      setNamesOfGroups(seqGroup);
                      
+                    if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); }
+                    
                      it = groupmap.find(seqName);
                      
                      if (it != groupmap.end()) { error = 1; m->mothurOut("Your designfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine();  }
@@ -148,6 +275,30 @@ int GroupMap::readDesignMap(string filename) {
          }
                 fileHandle.close();
          
+        if (rest != "") {
+            vector<string> pieces = m->splitWhiteSpace(rest);
+            
+            for (int i = 0; i < pieces.size(); i++) {
+                if (columnOne) {  seqName = pieces[i]; columnOne=false; }
+                else  { seqGroup = pieces[i]; pairDone = true; columnOne=true; }
+                
+                if (pairDone) { 
+                    setNamesOfGroups(seqGroup);
+                    
+                    if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); }
+                    
+                    it = groupmap.find(seqName);
+                    
+                    if (it != groupmap.end()) { error = 1; m->mothurOut("Your designfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine();  }
+                    else {
+                        groupmap[seqName] = seqGroup;  //store data in map
+                        seqsPerGroup[seqGroup]++;  //increment number of seqs in that group
+                    }
+                    pairDone = false; 
+                } 
+            }
+        }
+        
                 m->setAllGroups(namesOfGroups);
                 return error;
      }
diff --git a/groupmap.h b/groupmap.h

index 567165de820fd480ed2a4ecf8ff785bc9ec1c92c..d6984952188de368c765bf2997198f1fefb8bce5 100644 (file)
--- a/groupmap.h
+++ b/groupmap.h
@@ -21,6 +21,7 @@ public:
         GroupMap(string);
         ~GroupMap();
         int readMap();
+    int readMap(string);
         int readDesignMap();
      int readDesignMap(string);
         int getNumGroups();
diff --git a/hcluster.cpp b/hcluster.cpp

index 6cd45314194fccce192c7654e2d8994aff6766c5..f8f48095b2ec55a8dce91d55881271f2f1837218 100644 (file)
--- a/hcluster.cpp
+++ b/hcluster.cpp
@@ -10,7 +10,6 @@
  #include "hcluster.h"
  #include "rabundvector.hpp"
  #include "listvector.hpp"
-#include "sparsematrix.hpp"
  
  /***********************************************************************/
  HCluster::HCluster(RAbundVector* rav, ListVector* lv, string ms, string d, NameAssignment* n, float c) :  rabund(rav), list(lv), method(ms), distfile(d), nameMap(n), cutoff(c) {
diff --git a/heatmapsimcommand.cpp b/heatmapsimcommand.cpp

index 3de10e6a8cd9411d6829ac699ffd11991af81007..8a4a12bc70f2d707fb923e74036043f12c1dcd3f 100644 (file)
--- a/heatmapsimcommand.cpp
+++ b/heatmapsimcommand.cpp
@@ -25,7 +25,8 @@ vector<string> HeatMapSimCommand::setParameters(){
         try {
                 CommandParameter pshared("shared", "InputTypes", "", "", "PhylipColumnShared", "PhylipColumnShared", "none",false,false); parameters.push_back(pshared);        
                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnShared", "PhylipColumnShared", "none",false,false); parameters.push_back(pphylip);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname);
+               CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount);
                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnShared", "PhylipColumnShared", "ColumnName",false,false); parameters.push_back(pcolumn);          
                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
                 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
@@ -48,9 +49,8 @@ string HeatMapSimCommand::getHelpString(){
         try {
                 string helpString = "";
                 ValidCalculators validCalculator;
-               helpString += "The heatmap.sim command parameters are shared, phylip, column, name, groups, calc, fontsize and label.  shared or phylip or column and name are required unless valid current files exist.\n";
-               helpString += "There are two ways to use the heatmap.sim command. The first is with the read.otu command. \n";
-               helpString += "With the read.otu command you may use the groups, label and calc parameters. \n";
+               helpString += "The heatmap.sim command parameters are shared, phylip, column, name, count, groups, calc, fontsize and label.  shared or phylip or column and name are required unless valid current files exist.\n";
+               helpString += "There are two ways to use the heatmap.sim command. The first is with a shared file, and you may use the groups, label and calc parameter. \n";
                 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like included in your heatmap.\n";
                 helpString += "The group names are separated by dashes. The label parameter allows you to select what distance levels you would like a heatmap created for, and is also separated by dashes.\n";
                 helpString += "The fontsize parameter allows you to adjust the font size of the picture created, default=24.\n";
@@ -174,6 +174,14 @@ HeatMapSimCommand::HeatMapSimCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["shared"] = inputDir + it->second;           }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
  
                         //required parameters
@@ -197,6 +205,12 @@ HeatMapSimCommand::HeatMapSimCommand(string option)  {
                         else if (namefile == "not found") { namefile = ""; }
                         else { m->setNameFile(namefile); }
                         
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not open") { abort = true; countfile = ""; }  
+                       else if (countfile == "not found") { countfile = ""; }
+                       else { m->setCountTableFile(countfile); }
+                       
+            if ((countfile != "") && (namefile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
                         
                         //error checking on files                       
                         if ((sharedfile == "") && ((phylipfile == "") && (columnfile == "")))   { 
@@ -224,8 +238,12 @@ HeatMapSimCommand::HeatMapSimCommand(string option)  {
                                         namefile = m->getNameFile(); 
                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
                                         else { 
-                                               m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); 
-                                               abort = true; 
+                        countfile = m->getCountTableFile(); 
+                                               if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
+                        else { 
+                            m->mothurOut("You need to provide a name or count file if you are going to use the column format."); m->mothurOutEndLine(); 
+                            abort = true; 
+                        }      
                                         }       
                                 }
                         }
@@ -520,20 +538,28 @@ int HeatMapSimCommand::runCommandDist() {
                         in.close();
                 }else {
                         //read names file
-                       NameAssignment* nameMap = new NameAssignment(namefile);
-                       nameMap->readMap();
-                       
-                       //put names in order in vector
-                       for (int i = 0; i < nameMap->size(); i++) {
-                               names.push_back(nameMap->get(i));
-                       }
-                       
-                       //resize matrix
-                       matrix.resize(nameMap->size());
-                       for (int i = 0; i < nameMap->size(); i++) {
-                               matrix[i].resize(nameMap->size(), 0.0);
-                       }
-                       
+                       NameAssignment* nameMap;
+            CountTable ct; 
+            if (namefile != "") { 
+                nameMap = new NameAssignment(namefile);
+                nameMap->readMap();
+                
+                //put names in order in vector
+                for (int i = 0; i < nameMap->size(); i++) {
+                    names.push_back(nameMap->get(i));
+                }
+             }else if (countfile != "") {
+                nameMap = NULL;
+                ct.readTable(countfile);
+                names = ct.getNamesOfSeqs();
+            }
+                       
+            //resize matrix
+            matrix.resize(names.size());
+            for (int i = 0; i < names.size(); i++) {
+                matrix[i].resize(names.size(), 0.0);
+            }
+                                               
                         //read column file
                         string first, second;
                         double dist;
@@ -544,19 +570,26 @@ int HeatMapSimCommand::runCommandDist() {
                                 
                                 if (m->control_pressed) { return 0; }
                                 
-                               map<string, int>::iterator itA = nameMap->find(first);
-                               map<string, int>::iterator itB = nameMap->find(second);
-                               
-                               if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + first + "' was not found in the names file, please correct\n"); exit(1);  }
-                               if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + second + "' was not found in the names file, please correct\n"); exit(1);  }
-                               
-                               //save distance
-                               matrix[itA->second][itB->second] = dist;
-                               matrix[itB->second][itA->second] = dist;
+                if (namefile != "") {
+                    map<string, int>::iterator itA = nameMap->find(first);
+                    map<string, int>::iterator itB = nameMap->find(second);
+                    
+                    if(itA == nameMap->end()){  m->mothurOut("AAError: Sequence '" + first + "' was not found in the names file, please correct\n"); exit(1);  }
+                    if(itB == nameMap->end()){  m->mothurOut("ABError: Sequence '" + second + "' was not found in the names file, please correct\n"); exit(1);  }
+                    
+                    //save distance
+                    matrix[itA->second][itB->second] = dist;
+                    matrix[itB->second][itA->second] = dist;
+                }else if (countfile != "") {
+                    int itA = ct.get(first);
+                    int itB = ct.get(second);
+                    matrix[itA][itB] = dist;
+                    matrix[itB][itA] = dist;
+                }
                         }
                         in.close();
                         
-                       delete nameMap;
+                       if (namefile != "") { delete nameMap; }
                 }
                 
                 
diff --git a/heatmapsimcommand.h b/heatmapsimcommand.h

index 7b74880955594da0bb863c4b1755cf32c7b429e7..2c3a470761d719af0bff13bc4d194e5164bd42e0 100644 (file)
--- a/heatmapsimcommand.h
+++ b/heatmapsimcommand.h
@@ -43,7 +43,7 @@ private:
         OptionParser* parser;
         bool abort, allLines;
         set<string> labels; //holds labels to be used
-       string format, groups, label, calc, sharedfile, phylipfile, columnfile, namefile, outputDir, inputfile;
+       string format, groups, label, calc, sharedfile, phylipfile, columnfile, countfile, namefile, outputDir, inputfile;
         vector<string> Estimators, Groups, outputNames;
         int fontsize;
         
diff --git a/indicatorcommand.cpp b/indicatorcommand.cpp

index f98620b2ee0bd4cc70bc10d1db908d8de40de995..dc9f121a0e83758d4ecf4977294e8221b3da0712 100644 (file)
--- a/indicatorcommand.cpp
+++ b/indicatorcommand.cpp
@@ -287,17 +287,22 @@ int IndicatorCommand::execute(){
                         string groupfile = ""; 
                         m->setTreeFile(treefile);
                         Tree* tree = new Tree(treefile); delete tree;  //extracts names from tree to make faked out groupmap
-                       treeMap = new TreeMap();
+                       ct = new CountTable();
                         bool mismatch = false;
-                               
-                       for (int i = 0; i < m->Treenames.size(); i++) { 
-                               //sanity check - is this a group that is not in the sharedfile?
+            
+            set<string> nameMap;
+            map<string, string> groupMap;
+            set<string> gps;
+            for (int i = 0; i < m->Treenames.size(); i++) { 
+                nameMap.insert(m->Treenames[i]); 
+                //sanity check - is this a group that is not in the sharedfile?
                                 if (designfile == "") {
+                    if (i == 0) { gps.insert("Group1"); }
                                         if (!(m->inUsersGroups(m->Treenames[i], m->getAllGroups()))) {
                                                 m->mothurOut("[ERROR]: " + m->Treenames[i] + " is not a group in your shared or relabund file."); m->mothurOutEndLine();
                                                 mismatch = true;
                                         }
-                                       treeMap->addSeq(m->Treenames[i], "Group1"); 
+                                       groupMap[m->Treenames[i]] = "Group1"; 
                                 }else{
                                         vector<string> myGroups; myGroups.push_back(m->Treenames[i]);
                                         vector<string> myNames = designMap->getNamesSeqs(myGroups);
@@ -308,9 +313,10 @@ int IndicatorCommand::execute(){
                                                         mismatch = true;
                                                 }
                                         }
-                                       treeMap->addSeq(m->Treenames[i], "Group1");
+                                       groupMap[m->Treenames[i]] = "Group1";
                                 }
-                       }
+            }
+            ct->createTable(nameMap, groupMap, gps);
                         
                         if ((designfile != "") && (m->Treenames.size() != Groups.size())) { cout << Groups.size() << '\t' << m->Treenames.size() << endl; m->mothurOut("[ERROR]: You design file does not match your tree, aborting."); m->mothurOutEndLine(); mismatch = true; }
                                         
@@ -318,14 +324,14 @@ int IndicatorCommand::execute(){
                                 if (designfile != "") { delete designMap; }
                                 if (sharedfile != "") {  for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  } }
                                 else { for (int i = 0; i < lookupFloat.size(); i++) {  delete lookupFloat[i];  } }
-                               delete treeMap;
+                               delete ct;
                                 return 0;
                         }
                  
                         read = new ReadNewickTree(treefile);
-                       int readOk = read->read(treeMap); 
+                       int readOk = read->read(ct); 
                         
-                       if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete treeMap; delete read; return 0; }
+                       if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete ct; delete read; return 0; }
                         
                         vector<Tree*> T = read->getTrees();
                         
@@ -335,19 +341,18 @@ int IndicatorCommand::execute(){
                                 if (designfile != "") { delete designMap; }
                                 if (sharedfile != "") {  for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  } }
                                 else { for (int i = 0; i < lookupFloat.size(); i++) {  delete lookupFloat[i];  } }
-                               for (int i = 0; i < T.size(); i++) {  delete T[i];  }  delete treeMap; return 0; 
+                               for (int i = 0; i < T.size(); i++) {  delete T[i];  }  delete ct; return 0; 
                         }
              
-                       map<string, string> nameMap;    
-                       T[0]->assembleTree(nameMap);
+                       T[0]->assembleTree();
                                         
                         /***************************************************/
                         //    create ouptut tree - respecting pickedGroups //
                         /***************************************************/
-                       Tree* outputTree = new Tree(m->getNumGroups(), treeMap); 
+                       Tree* outputTree = new Tree(m->getNumGroups(), ct); 
                         
                         outputTree->getSubTree(T[0], m->getGroups());
-                       outputTree->assembleTree(nameMap);
+                       outputTree->assembleTree();
                                 
                         //no longer need original tree, we have output tree to use and label
                         for (int i = 0; i < T.size(); i++) {  delete T[i];  } 
@@ -356,14 +361,14 @@ int IndicatorCommand::execute(){
                                 if (designfile != "") { delete designMap; }
                                 if (sharedfile != "") {  for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  } }
                                 else { for (int i = 0; i < lookupFloat.size(); i++) {  delete lookupFloat[i];  } }
-                               delete outputTree; delete treeMap;  return 0; 
+                               delete outputTree; delete ct;  return 0; 
                         }
                         
                         /***************************************************/
                         //              get indicator species values                       //
                         /***************************************************/
                         GetIndicatorSpecies(outputTree);
-                       delete outputTree; delete treeMap;
+                       delete outputTree; delete ct;
                         
                 }else { //run with design file only
                         //get indicator species
diff --git a/indicatorcommand.h b/indicatorcommand.h

index 2c36c35f6f3025bb3f07760055512b2e0e257914..3c24dfb843e770567a1486436536b97fabfe12b5 100644 (file)
--- a/indicatorcommand.h
+++ b/indicatorcommand.h
@@ -12,7 +12,7 @@
  
  #include "command.hpp"
  #include "readtree.h"
-#include "treemap.h"
+#include "counttable.h"
  #include "sharedrabundvector.h"
  #include "sharedrabundfloatvector.h"
  #include "inputdata.h"
@@ -36,7 +36,7 @@ public:
         
  private:
         ReadTree* read;
-       TreeMap* treeMap;
+       CountTable* ct;
         GroupMap* designMap;
         string treefile, sharedfile, relabundfile, groups, label, inputFileName, outputDir, designfile;
         bool abort;
diff --git a/kmernode.cpp b/kmernode.cpp

new file mode 100755 (executable)

index 0000000..c087cac
--- /dev/null
+++ b/kmernode.cpp
@@ -0,0 +1,209 @@
+/*
+ *  kmerNode.cpp
+ *  bayesian
+ *
+ *  Created by Pat Schloss on 10/11/11.
+ *  Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "kmernode.h"
+
+
+/**********************************************************************************************************************/
+
+KmerNode::KmerNode(string s, int l, int n) : TaxonomyNode(s, l), kmerSize(n) {
+       try {
+        int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
+        
+        numPossibleKmers = power4s[kmerSize];
+        numUniqueKmers = 0;
+        
+        kmerVector.assign(numPossibleKmers, 0);
+    }
+       catch(exception& e) {
+               m->errorOut(e, "KmerNode", "KmerNode");
+               exit(1);
+       }
+}
+
+/**********************************************************************************************************************/
+
+void KmerNode::loadSequence(vector<int>& kmerProfile){
+       try {
+        for(int i=0;i<numPossibleKmers;i++){
+            if (m->control_pressed) { break; }
+            if(kmerVector[i] == 0 && kmerProfile[i] != 0)      {       numUniqueKmers++;       }
+            
+            kmerVector[i] += kmerProfile[i];
+        }
+        
+        numSeqs++;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "KmerNode", "loadSequence");
+               exit(1);
+       }
+}      
+
+/**********************************************************************************************************************/
+
+string KmerNode::getKmerBases(int kmerNumber){
+       try {
+        //     Here we convert the kmer number into the kmer in terms of bases.
+        //
+        //     Example:        Score = 915 (for a 6-mer)
+        //                             Base6 = (915 / 4^0) % 4 = 915 % 4 = 3 => T      [T]
+        //                             Base5 = (915 / 4^1) % 4 = 228 % 4 = 0 => A      [AT]
+        //                             Base4 = (915 / 4^2) % 4 = 57 % 4 = 1 => C       [CAT]
+        //                             Base3 = (915 / 4^3) % 4 = 14 % 4 = 2 => G       [GCAT]
+        //                             Base2 = (915 / 4^4) % 4 = 3 % 4 = 3 => T        [TGCAT]
+        //                             Base1 = (915 / 4^5) % 4 = 0 % 4 = 0 => A        [ATGCAT] -> this checks out with the previous method
+        
+        int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
+        
+        string kmer = "";
+        
+        if(kmerNumber == power4s[kmerSize]){//pow(4.,7)){      //      if the kmer number is the same as the maxKmer then it must
+            for(int i=0;i<kmerSize;i++){                                       //      have had an N in it and so we'll just call it N x kmerSize
+                kmer += 'N';
+            }
+        }
+        else{
+            for(int i=0;i<kmerSize;i++){
+                if (m->control_pressed) { return kmer; }
+                int nt = (int)(kmerNumber / (float)power4s[i]) % 4;            //      the '%' operator returns the remainder 
+                if(nt == 0)            {       kmer = 'A' + kmer;      }                               //      from int-based division ]
+                else if(nt == 1){      kmer = 'C' + kmer;      }
+                else if(nt == 2){      kmer = 'G' + kmer;      }
+                else if(nt == 3){      kmer = 'T' + kmer;      }
+            }
+        }
+        return kmer;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "KmerNode", "getKmerBases");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+void KmerNode::addThetas(vector<int> newTheta, int newNumSeqs){
+       try {
+        for(int i=0;i<numPossibleKmers;i++){
+            if (m->control_pressed) { break; }
+            kmerVector[i] += newTheta[i];              
+        }
+        
+        //     if(alignLength == 0){
+        //             alignLength = (int)newTheta.size();
+        //             theta.resize(alignLength);
+        //             columnCounts.resize(alignLength);
+        //     }
+        //     
+        //     for(int i=0;i<alignLength;i++){ 
+        //             theta[i].A += newTheta[i].A;            columnCounts[i] += newTheta[i].A;
+        //             theta[i].T += newTheta[i].T;            columnCounts[i] += newTheta[i].T;
+        //             theta[i].G += newTheta[i].G;            columnCounts[i] += newTheta[i].G;
+        //             theta[i].C += newTheta[i].C;            columnCounts[i] += newTheta[i].C;
+        //             theta[i].gap += newTheta[i].gap;        columnCounts[i] += newTheta[i].gap;
+        //     }
+        
+        numSeqs += newNumSeqs;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "KmerNode", "addThetas");
+               exit(1);
+       }
+}
+
+/**********************************************************************************************************************/
+
+int KmerNode::getNumUniqueKmers(){
+    try {
+        if(numUniqueKmers == 0){
+            
+            for(int i=0;i<numPossibleKmers;i++){
+                if (m->control_pressed) { return numUniqueKmers; }
+                if(kmerVector[i] != 0){
+                    numUniqueKmers++;
+                }
+                
+            }
+            
+        }
+        
+        return numUniqueKmers; 
+        
+    }
+       catch(exception& e) {
+               m->errorOut(e, "KmerNode", "getNumUniqueKmers");
+               exit(1);
+       }
+}
+
+/**********************************************************************************************************************/
+
+void KmerNode::printTheta(){
+       try {
+        m->mothurOut(name + "\n");
+        for(int i=0;i<numPossibleKmers;i++){
+            if(kmerVector[i] != 0){
+                m->mothurOut(getKmerBases(i) + '\t' + toString(kmerVector[i]) + "\n");
+            }
+        }
+        m->mothurOutEndLine(); 
+    }
+       catch(exception& e) {
+               m->errorOut(e, "KmerNode", "printTheta");
+               exit(1);
+       }
+       
+}
+/**************************************************************************************************/
+
+double KmerNode::getSimToConsensus(vector<int>& queryKmerProfile){
+       try {
+        double present = 0;
+        
+        for(int i=0;i<numPossibleKmers;i++){
+            if (m->control_pressed) { return present; }
+            if(queryKmerProfile[i] != 0 && kmerVector[i] != 0){
+                present++;
+            }
+        }      
+        
+        return present / double(queryKmerProfile.size() - kmerSize + 1);
+    }
+       catch(exception& e) {
+               m->errorOut(e, "KmerNode", "getSimToConsensus");
+               exit(1);
+       }
+}
+
+/**********************************************************************************************************************/
+
+double KmerNode::getPxGivenkj_D_j(vector<int>& queryKmerProfile)       {       
+       try {
+        double sumLogProb = 0.0000;
+        double alpha = 1.0 / (double)totalSeqs;        //flat prior
+        //     double alpha = pow((1.0 / (double)numUniqueKmers), numSeqs)+0.0001;     //non-flat prior
+        
+        for(int i=0;i<numPossibleKmers;i++){
+            if (m->control_pressed) { return sumLogProb; }
+            if(queryKmerProfile[i] != 0){              //numUniqueKmers needs to be the value from Root;
+                sumLogProb += log((kmerVector[i] + alpha) / (numSeqs + numUniqueKmers * alpha));
+            }
+            
+        }
+        return sumLogProb;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "KmerNode", "getPxGivenkj_D_j");
+               exit(1);
+       }
+
+}
+
+/**********************************************************************************************************************/
diff --git a/kmernode.h b/kmernode.h

new file mode 100755 (executable)

index 0000000..e15fb1d
--- /dev/null
+++ b/kmernode.h
@@ -0,0 +1,45 @@
+#ifndef KMERNODE
+#define KMERNODE
+
+/*
+ *  kmerNode.h
+ *  bayesian
+ *
+ *  Created by Pat Schloss on 10/11/11.
+ *  Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+
+#include "taxonomynode.h"
+
+/**********************************************************************************************************************/
+
+// Taxonomy tree node that carries k-mer counts pooled over the reference
+// sequences assigned to it.  KmerTree creates one node per taxon and uses
+// the likelihood/similarity methods below to score query k-mer profiles.
+class KmerNode : public TaxonomyNode {
+       
+public:
+       // KmerTree constructs nodes as (taxon name, level, k-mer size).
+       KmerNode(string, int, int);
+       // KmerTree passes the k-mer profile of one reference sequence here.
+       void loadSequence(vector<int>&);
+       void printTheta();
+       // Sum of log-probabilities over the k-mers present in the query profile.
+       double getPxGivenkj_D_j(vector<int>&);
+       // Count of k-mers shared by the query profile and this node, normalized.
+       double getSimToConsensus(vector<int>&);
+       // Intentionally empty for this node type.
+       void checkTheta(){};
+       void setNumUniqueKmers(int num) {       numUniqueKmers = num;   }
+       int getNumUniqueKmers();
+       // KmerTree::aggregateThetas calls this on a parent with a child's
+       // getTheta() and getNumSeqs().  The vector is taken by value.
+       void addThetas(vector<int>, int);
+       // Returns a copy of the k-mer count vector.
+       vector<int> getTheta()  {       return kmerVector;      }
+
+
+private:
+       string getKmerBases(int);
+       int kmerSize;                                                           //      value of k
+       int numPossibleKmers;                                           //      4^kmerSize
+       int numUniqueKmers;                                                     //      number of unique kmers seen in a group ~ O_kj
+       int numKmers;                                                           //      number of kmers in a sequence
+       vector<int> kmerVector;                                         //      counts of kmers across all sequences in a node
+};
+
+/**********************************************************************************************************************/
+
+#endif
+
diff --git a/kmertree.cpp b/kmertree.cpp

new file mode 100755 (executable)

index 0000000..fbf2bfb
--- /dev/null
+++ b/kmertree.cpp
@@ -0,0 +1,386 @@
+//
+//  kmerTree.cpp
+//  pdsBayesian
+//
+//  Created by Patrick Schloss on 4/3/12.
+//  Copyright (c) 2012 University of Michigan. All rights reserved.
+//
+
+#include "kmernode.h"
+#include "kmertree.h"
+
+/**************************************************************************************************/
+
+// Build the k-mer classification tree.
+//   referenceFileName - sequence file read one Sequence at a time
+//   taxonomyFileName  - handed to readTaxonomy(); each reference name is then
+//                       looked up in the `taxonomy` map (presumably filled by
+//                       readTaxonomy - confirm in Classify)
+//   k                 - k-mer size, stored in kmerSize
+//   cutoff            - stored in confidenceThreshold
+// A reference sequence missing from the taxonomy map is reported and, once
+// the file has been read, m->control_pressed is set.
+KmerTree::KmerTree(string referenceFileName, string taxonomyFileName, int k, int cutoff) : Classify(), confidenceThreshold(cutoff), kmerSize(k){
+       try {
+        KmerNode* newNode = new KmerNode("Root", 0, kmerSize);
+        tree.push_back(newNode);                       //      the tree is stored as a vector of KmerNode pointers; the root is element 0
+        
+        // NOTE(review): the table has 14 entries, so kmerSize must lie in 0..13;
+        // nothing here validates that before indexing.
+        int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
+        numPossibleKmers = power4s[kmerSize];
+        
+        string refTaxonomy;
+        
+        readTaxonomy(taxonomyFileName);
+        
+        ifstream referenceFile;
+        m->openInputFile(referenceFileName, referenceFile);
+        bool error = false;
+        while(!referenceFile.eof()){
+            
+            if (m->control_pressed) { break; }
+            
+            Sequence seq(referenceFile);  m->gobble(referenceFile);
+            
+            if (seq.getName() != "") {
+                map<string, string>::iterator it = taxonomy.find(seq.getName());
+                
+                if (it != taxonomy.end()) {
+                    refTaxonomy = it->second;          //      lookup the taxonomy string for the current reference sequence
+                    vector<int> kmerProfile = ripKmerProfile(seq.getUnaligned());      //convert to kmer vector
+                    addTaxonomyToTree(seq.getName(), refTaxonomy, kmerProfile);
+                }else {
+                    m->mothurOut(seq.getName() + " is in your reference file, but not in your taxonomy file, please correct.\n"); error = true;
+                }
+            }
+        }
+        referenceFile.close();
+        
+        // a missing taxonomy entry is reported for every offender first, then flagged once here
+        if (error) { m->control_pressed = true; }
+        
+        // numLevels = deepest node level + 1 (the root sits at level 0)
+        numTaxa = (int)tree.size();
+        numLevels = 0;
+        for(int i=0;i<numTaxa;i++){
+            int level = tree[i]->getLevel();
+            if(level > numLevels){     numLevels = level;      }
+        }
+        numLevels++;
+        
+        // push child k-mer counts up into their parents, deepest level first
+        aggregateThetas();
+        
+        int dbSize = tree[0]->getNumSeqs();
+        
+        // every node gets the root's unique k-mer count and sequence total
+        for(int i=0;i<numTaxa;i++){
+            tree[i]->checkTheta();
+            tree[i]->setNumUniqueKmers(tree[0]->getNumUniqueKmers());
+            tree[i]->setTotalSeqs(dbSize);
+        }
+    }
+       catch(exception& e) {
+               m->errorOut(e, "KmerTree", "KmerTree");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+// Free every node that was allocated with new and stored in `tree`.
+KmerTree::~KmerTree(){
+       
+       for(vector<KmerNode*>::iterator node = tree.begin(); node != tree.end(); ++node){
+               delete *node;
+       }
+       
+}
+/**********************************************************************************************************************/
+
+// Convert an unaligned sequence into a presence/absence k-mer profile.
+// The returned vector has numPossibleKmers + 1 entries.  Entry x is 1 when
+// the k-mer whose base-4 encoding (A=0, C=1, G=2, T/U=3, first base most
+// significant) equals x occurs in the sequence.  The final entry
+// (index numPossibleKmers) is set when any k-mer window contains a character
+// other than A/C/G/T/U.  Sequences shorter than kmerSize give an all-zero
+// profile.  If m->control_pressed is set the scan stops and the partial
+// profile is returned.
+vector<int> KmerTree::ripKmerProfile(string sequence){
+    try {
+        //     assume all input sequences are unaligned
+        
+        int nKmers = (int)sequence.length() - kmerSize + 1;
+        
+        vector<int> kmerProfile(numPossibleKmers + 1, 0);
+        
+        for(int i=0;i<nKmers;i++){
+            
+            if (m->control_pressed) { break; }
+            
+            int kmer = 0;
+            for(int j=0;j<kmerSize;j++){
+                //     <cctype> functions are only defined for unsigned char values (or EOF),
+                //     so convert before calling toupper, and classify each base once
+                int base = toupper((unsigned char)sequence[i+j]);
+                int code;
+                
+                switch (base) {
+                    case 'A':  code = 0;   break;
+                    case 'C':  code = 1;   break;
+                    case 'G':  code = 2;   break;
+                    case 'U':
+                    case 'T':  code = 3;   break;
+                    default:   code = -1;  break;
+                }
+                
+                if (code < 0) {        //      ambiguous base: the whole window goes to the extra last slot
+                    kmer = numPossibleKmers;
+                    break;
+                }
+                
+                kmer = kmer * 4 + code;        //      same value as summing code * 4^(kmerSize-j-1)
+            }
+            kmerProfile[kmer] = 1;
+        }
+        
+        return kmerProfile;    
+    }
+       catch(exception& e) {
+               m->errorOut(e, "KmerTree", "ripKmerProfile");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+// Walk a semicolon-delimited taxonomy string from the root, creating any
+// taxon node that does not exist yet, then load the sequence's k-mer profile
+// into the node reached at the end of the walk.
+//   seqName  - used only in the error message for an empty taxon name
+//   taxonomy - lineage string; a taxon is committed each time ';' is seen
+//   sequence - k-mer profile passed to loadSequence() on the final node
+// Always returns 0.
+// NOTE(review): when an empty taxon name is detected the error is reported and
+// m->control_pressed is set, but the current iteration still goes on to look
+// up / create a child with the empty name before the loop exits.
+int KmerTree::addTaxonomyToTree(string seqName, string taxonomy, vector<int>& sequence){
+       try {
+        KmerNode* newNode;
+        string taxonName = "";
+        int treePosition = 0;                                                  //      the root is element 0
+        
+        
+        int level = 1;
+        
+        for(int i=0;i<taxonomy.length();i++){                  //      step through taxonomy string...
+            
+            if (m->control_pressed) { break; }
+            if(taxonomy[i] == ';'){                                            //      looking for semicolons...
+                
+                if (taxonName == "") {  m->mothurOut(seqName + " has an error in the taxonomy.  This may be due to a ;;"); m->mothurOutEndLine(); m->control_pressed = true; }
+                
+                int newIndex = tree[treePosition]->getChildIndex(taxonName);// look to see if your current node already
+                //        has a child with the new taxonName
+                if(newIndex != -1)     {       treePosition = newIndex;        }               //      if you've seen it before, jump to that
+                else {                                                                                                         //         position in the tree
+                    int newChildIndex = (int)tree.size();                                      //      otherwise, we'll have to create one...
+                    tree[treePosition]->makeChild(taxonName, newChildIndex);
+                    
+                    newNode = new KmerNode(taxonName, level, kmerSize);
+                    
+                    newNode->setParent(treePosition);
+                    
+                    tree.push_back(newNode);
+                    treePosition = newChildIndex;
+                }
+                
+                //     this level is resolved: reset the name buffer and move one level down
+                taxonName = "";                                                                //      clear out the taxon name that we will build as we look
+                level++;                                                                       //      for the next semicolon
+                
+            }
+            else{
+                taxonName += taxonomy[i];                                      //      keep adding letters until we reach a semicolon
+            }
+        }
+        
+        tree[treePosition]->loadSequence(sequence);    //      now that we've gotten to the correct node, add the sequence data to that node to update that node's theta
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "KmerTree", "addTaxonomyToTree");
+               exit(1);
+       }
+       
+}
+
+/**************************************************************************************************/
+
+// Roll k-mer counts up the tree.  Nodes are first bucketed by level, then
+// visited from the deepest level up to level 1; each node hands its theta
+// vector and sequence count to its parent through addThetas().
+// Always returns 0 (also when interrupted by m->control_pressed).
+int KmerTree::aggregateThetas(){
+       try {
+        vector<vector<int> > nodesAtLevel(numLevels+1);
+        
+        const int nodeCount = (int)tree.size();
+        for(int node=0; node<nodeCount; ++node){
+            if (m->control_pressed) { return 0; }
+            nodesAtLevel[tree[node]->getLevel()].push_back(node);
+        }
+        
+        int level = numLevels-1;
+        while(level > 0){                              //      level 0 (the root) has no parent to feed
+            if (m->control_pressed) { return 0; }
+            
+            vector<int>& members = nodesAtLevel[level];
+            const int memberCount = (int)members.size();
+            
+            for(int k=0; k<memberCount; ++k){
+                KmerNode* child = tree[members[k]];
+                KmerNode* parent = tree[child->getParent()];
+                
+                parent->addThetas(child->getTheta(), child->getNumSeqs());
+            }
+            
+            --level;
+        }
+        
+        return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "KmerTree", "aggregateThetas");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+// Pick the candidate with the smallest "risk" among the taxa of one level.
+//   sequence      - query k-mer profile, forwarded to getSimToConsensus()
+//   taxaIndices   - tree index of each candidate
+//   probabilities - one weight per candidate; its size sets the candidate count
+// risk[i] is the sum over all j != i of probabilities[j] * G[j], where G[j] is
+// the candidate's similarity to the query.  Returns the index of the first
+// candidate whose risk is below every earlier one (0 if none is below 1e6),
+// or 0 when m->control_pressed is set.
+int KmerTree::getMinRiskIndexKmer(vector<int>& sequence, vector<int>& taxaIndices, vector<double>& probabilities){
+       try {
+        int numProbs = (int)probabilities.size();
+        
+        vector<double> G(numProbs, 0.2);       //a random sequence will, on average, be 20% similar to any other sequence; not sure that this holds up for kmers; whatever.
+        vector<double> risk(numProbs, 0);
+        
+        // candidate 0 is skipped and keeps the default 0.2; getTaxonomy stores
+        // the outlier placeholder (tree index -1) in slot 0 of each level
+        for(int i=1;i<numProbs;i++){ //use if you want the outlier group
+            if (m->control_pressed) { return 0; }
+            G[i] = tree[taxaIndices[i]]->getSimToConsensus(sequence);
+        }
+        
+        double minRisk = 1e6;
+        int minRiskIndex = 0;
+        
+        for(int i=0;i<numProbs;i++){
+            if (m->control_pressed) { return 0; }
+            for(int j=0;j<numProbs;j++){
+                if(i != j){
+                    risk[i] += probabilities[j] * G[j];
+                }                      
+            }
+            
+            // strict '<' keeps the earliest candidate on ties
+            if(risk[i] < minRisk){
+                minRisk = risk[i];
+                minRiskIndex = i;
+            }
+        }
+        
+        return minRiskIndex;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "KmerTree", "getMinRiskIndexKmer");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+// Find the deepest level down to which the per-level winners form one lineage.
+//   indices    - for each level, the tree index of every candidate; slot 0 of
+//                each level holds -1, the outlier placeholder from getTaxonomy
+//   maxIndices - for each level, the slot chosen within indices[level]
+// Starting at level 1, the chosen node's parent must be the node chosen one
+// level up.  Returns the last level before the first mismatch, or the deepest
+// level when every link agrees.  Returns 0 if m->control_pressed is set.
+// A level whose winner is the outlier placeholder has no tree node, so there
+// is no parent to verify and the level is accepted; any real node chosen at
+// the next level will then fail the comparison against -1 and end the walk.
+int KmerTree::sanityCheck(vector<vector<int> >& indices, vector<int>& maxIndices){
+       try {
+        int numPositions = (int)indices.size();
+        int finalLevel = numPositions-1;
+        
+        for(int position=1;position<numPositions;position++){
+            if (m->control_pressed) { return 0; }
+            
+            int chosenNode = indices[position][maxIndices[position]];
+            
+            //     -1 is not a valid subscript for tree; never dereference it
+            if(chosenNode == -1){      continue;       }
+            
+            int predictedParent = tree[chosenNode]->getParent();
+            int actualParent = indices[position-1][maxIndices[position-1]];
+            
+            if(predictedParent != actualParent){
+                finalLevel = position - 1;
+                return finalLevel;
+            }
+        }
+        return finalLevel;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "KmerTree", "sanityCheck");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+// Classify one query sequence against the tree.
+// Steps: rip the query's k-mer profile, score it against every node, turn the
+// scores of each level into posteriors (with an extra "outlier" candidate in
+// slot 0 of every level), keep the best candidate per level, and cut the
+// lineage where the per-level winners stop being parent/child.
+// Returns a string of "name(confidence);" entries, padded with
+// "unclassified(0);" for the remaining levels; the member simpleTax receives
+// the same lineage without the confidence values.  An empty or partial string
+// is returned if m->control_pressed is set along the way.
+string KmerTree::getTaxonomy(Sequence* thisSeq){
+       try {
+        string querySequence = thisSeq->getAligned(); string taxonProbabilityString = "";
+        string unalignedSeq = thisSeq->getUnaligned();
+        
+        //     log-likelihood given to the outlier candidate of every level
+        //     NOTE(review): length() is unsigned, so a sequence shorter than kmerSize-1 wraps to a
+        //     huge count here; the count also comes from the aligned string while the profile
+        //     below is ripped from the unaligned one - confirm both are intended.
+        double logPOutlier = (querySequence.length() - kmerSize + 1) * log(1.0/(double)tree[0]->getNumUniqueKmers());
+        
+        vector<int> queryProfile = ripKmerProfile(unalignedSeq);       //convert to kmer vector
+        
+        //     slot 0 of every level is the outlier: likelihood logPOutlier, tree index -1
+        vector<vector<double> > pXgivenKj_D_j(numLevels);
+        vector<vector<int> > indices(numLevels);
+        for(int i=0;i<numLevels;i++){
+            if (m->control_pressed) { return taxonProbabilityString; }
+            pXgivenKj_D_j[i].push_back(logPOutlier);
+            indices[i].push_back(-1);
+        }
+        
+        //     then every node is appended to the candidate list of its own level
+        for(int i=0;i<numTaxa;i++){
+            if (m->control_pressed) { return taxonProbabilityString; }
+            pXgivenKj_D_j[tree[i]->getLevel()].push_back(tree[i]->getPxGivenkj_D_j(queryProfile));
+            indices[tree[i]->getLevel()].push_back(i);
+        }
+        
+        vector<double> sumLikelihood(numLevels, 0);
+        vector<double> bestPosterior(numLevels, 0);
+        vector<int> maxIndex(numLevels, 0);
+        int maxPosteriorIndex = 0;             //      initialized: it is handed to getLogExpSum before being assigned
+        
+        //let's find the best level and taxa within that level
+        for(int i=0;i<numLevels;i++){ //go across all j's - from the root to genus
+            if (m->control_pressed) { return taxonProbabilityString; }
+            
+            int numTaxaInLevel = (int)indices[i].size();
+            
+            vector<double> posteriors(numTaxaInLevel, 0);              
+            sumLikelihood[i] = getLogExpSum(pXgivenKj_D_j[i], maxPosteriorIndex);
+            
+            maxPosteriorIndex = 0;
+            for(int j=0;j<numTaxaInLevel;j++){
+                posteriors[j] = exp(pXgivenKj_D_j[i][j] - sumLikelihood[i]);
+                if(posteriors[j] > posteriors[maxPosteriorIndex]){     
+                    maxPosteriorIndex = j;
+                }
+                
+            }
+            
+            //     NOTE(review): the minimum-risk pick is overwritten on the next line, so the
+            //     maximum-posterior candidate is what is actually used - confirm which is wanted.
+            maxIndex[i] = getMinRiskIndexKmer(queryProfile, indices[i], posteriors);
+            
+            maxIndex[i] = maxPosteriorIndex;
+            bestPosterior[i] = posteriors[maxIndex[i]];        
+        }
+        
+        int saneDepth = sanityCheck(indices, maxIndex);
+        
+        simpleTax = "";
+        //     NOTE(review): savedspot starts at 1, so when level 1 itself is not reported the
+        //     padding loop below begins at level 2 - confirm that level 1 may be left out.
+        int savedspot = 1;
+        taxonProbabilityString = "";
+        for(int i=1;i<=saneDepth;i++){
+            if (m->control_pressed) { return taxonProbabilityString; }
+            int confidenceScore = (int) (bestPosterior[i] * 100);
+            if (confidenceScore >= confidenceThreshold) {
+                if(indices[i][maxIndex[i]] != -1){
+                    taxonProbabilityString += tree[indices[i][maxIndex[i]]]->getName() + "(" + toString(confidenceScore) + ");";
+                    simpleTax += tree[indices[i][maxIndex[i]]]->getName() + ";";
+                }
+                else{
+                    //     the '(' must live inside the literal: adding a char to a string
+                    //     literal offsets the pointer instead of concatenating
+                    taxonProbabilityString += "unclassified(" + toString(confidenceScore) + ");";
+                    simpleTax += "unclassified;";
+                }
+            }else { break; }
+            savedspot = i;
+        }
+        
+        //     pad the levels that were not reported above
+        for(int i=savedspot+1;i<numLevels;i++){
+            if (m->control_pressed) { return taxonProbabilityString; }
+            taxonProbabilityString += "unclassified(0);";
+            simpleTax += "unclassified;";
+        }
+        
+        return taxonProbabilityString;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "KmerTree", "getTaxonomy");
+               exit(1);
+       }
+}
+
+
+/**************************************************************************************************/
+
diff --git a/kmertree.h b/kmertree.h

new file mode 100755 (executable)

index 0000000..f7c10ef
--- /dev/null
+++ b/kmertree.h
@@ -0,0 +1,37 @@
+//
+//  kmerTree.h
+//  pdsBayesian
+//
+//  Created by Patrick Schloss on 4/3/12.
+//  Copyright (c) 2012 University of Michigan. All rights reserved.
+//
+
+#ifndef pdsBayesian_kmerTree_h
+#define pdsBayesian_kmerTree_h
+
+#include "classify.h"
+
+class KmerNode;
+
+// k-mer based classifier: keeps the reference taxonomy as a vector of
+// KmerNode pointers (element 0 is the root) and classifies query sequences
+// by scoring their k-mer profiles against every node.
+class KmerTree : public Classify {
+       
+public:
+       // (reference file name, taxonomy file name, k-mer size, confidence cutoff)
+       KmerTree(string, string, int, int);
+       // deletes every node stored in tree
+       ~KmerTree();
+       
+    // returns the "name(confidence);" lineage string for one query sequence
+    string getTaxonomy(Sequence*);
+
+private:
+    // (sequence name, taxonomy string, k-mer profile): creates missing taxon nodes and loads the profile
+    int addTaxonomyToTree(string, string, vector<int>&);
+       // presence/absence k-mer profile of an unaligned sequence
+       vector<int> ripKmerProfile(string);
+       // (query profile, candidate tree indices, candidate posteriors)
+       int getMinRiskIndexKmer(vector<int>&, vector<int>&, vector<double>&);
+       // adds each node's k-mer counts into its parent, deepest level first
+       int aggregateThetas();
+       // (per-level candidate indices, per-level chosen slot): deepest level with a consistent lineage
+       int sanityCheck(vector<vector<int> >&, vector<int>&);
+
+       // NOTE(review): the constructor's initializer list names confidenceThreshold
+       // before kmerSize, while members are initialized in the order declared here.
+       int kmerSize;
+       int numPossibleKmers, confidenceThreshold;
+       vector<KmerNode*> tree;
+
+};
+
+#endif
diff --git a/knn.cpp b/knn.cpp

index 837fa6d18d4f1aed6cf34e5861942b4230c30dd1..81b21b265785c2f8a83392ee52e7aeffbc9d4370 100644 (file)
--- a/knn.cpp
+++ b/knn.cpp
@@ -14,6 +14,7 @@ Knn::Knn(string tfile, string tempFile, string method, int kmerSize, float gapOp
  : Classify(), num(n), search(method) {
         try {
                 threadID = tid;
+        shortcuts = true;
                 
                 //create search database and names vector
                 generateDatabaseAndNames(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch);
diff --git a/listseqscommand.cpp b/listseqscommand.cpp

index bfbb0788c5092382f22bea643c5405674361dd67..7c3f07f96e092d81c307673192dad29a31fee091 100644 (file)
--- a/listseqscommand.cpp
+++ b/listseqscommand.cpp
@@ -10,6 +10,7 @@
  #include "listseqscommand.h"
  #include "sequence.hpp"
  #include "listvector.hpp"
+#include "counttable.h"
  
  
  //**********************************************************************************************************************
@@ -17,6 +18,7 @@ vector<string> ListSeqsCommand::setParameters(){
         try {
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "FNGLT", "FNGLT", "none",false,false); parameters.push_back(pfasta);
                 CommandParameter pname("name", "InputTypes", "", "", "FNGLT", "FNGLT", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "FNGLT", "FNGLT", "none",false,false); parameters.push_back(pcount);
                 CommandParameter pgroup("group", "InputTypes", "", "", "FNGLT", "FNGLT", "none",false,false); parameters.push_back(pgroup);
                 CommandParameter plist("list", "InputTypes", "", "", "FNGLT", "FNGLT", "none",false,false); parameters.push_back(plist);
                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "FNGLT", "FNGLT", "none",false,false); parameters.push_back(ptaxonomy);
@@ -37,8 +39,8 @@ vector<string> ListSeqsCommand::setParameters(){
  string ListSeqsCommand::getHelpString(){       
         try {
                 string helpString = "";
-               helpString += "The list.seqs command reads a fasta, name, group, list, taxonomy or alignreport file and outputs a .accnos file containing sequence names.\n";
-               helpString += "The list.seqs command parameters are fasta, name, group, list, taxonomy and alignreport.  You must provide one of these parameters.\n";
+               helpString += "The list.seqs command reads a fasta, name, group, count, list, taxonomy or alignreport file and outputs a .accnos file containing sequence names.\n";
+               helpString += "The list.seqs command parameters are fasta, name, group, count, list, taxonomy and alignreport.  You must provide one of these parameters.\n";
                 helpString += "The list.seqs command should be in the following format: list.seqs(fasta=yourFasta).\n";
                 helpString += "Example list.seqs(fasta=amazon.fasta).\n";
                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
@@ -164,6 +166,14 @@ ListSeqsCommand::ListSeqsCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
  
                         //check for required parameters
@@ -195,8 +205,13 @@ ListSeqsCommand::ListSeqsCommand(string option)  {
                         if (taxfile == "not open") { abort = true; }
                         else if (taxfile == "not found") {  taxfile = "";  }
                         else { m->setTaxonomyFile(taxfile); }
+            
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not open") { abort = true; }
+                       else if (countfile == "not found") {  countfile = "";  }
+                       else { m->setCountTableFile(countfile); }
                         
-                       if ((fastafile == "") && (namefile == "") && (listfile == "") && (groupfile == "") && (alignfile == "") && (taxfile == ""))  { m->mothurOut("You must provide a file."); m->mothurOutEndLine(); abort = true; }
+                       if ((countfile == "") && (fastafile == "") && (namefile == "") && (listfile == "") && (groupfile == "") && (alignfile == "") && (taxfile == ""))  { m->mothurOut("You must provide a file."); m->mothurOutEndLine(); abort = true; }
                         
                         int okay = 1;
                         if (outputDir != "") { okay++; }
@@ -225,6 +240,7 @@ int ListSeqsCommand::execute(){
                 else if (alignfile != "")       {       inputFileName = alignfile;      readAlign();    }
                 else if (listfile != "")        {       inputFileName = listfile;       readList();             }
                 else if (taxfile != "")         {       inputFileName = taxfile;        readTax();              }
+        else if (countfile != "")      {       inputFileName = countfile;      readCount();    }
                 
                 if (m->control_pressed) { outputTypes.clear();  return 0; }
                 
@@ -293,12 +309,6 @@ int ListSeqsCommand::readFasta(){
                         
                         Sequence currSeq(in);
                         name = currSeq.getName();
-                       //if (lastName == "") { lastName = name; }
-                       //if (name != lastName) { count = 1; }
-               //      lastName = name;
-                       
-                       //Sequence newSeq(name+"_"+toString(count), currSeq.getAligned());
-                       //newSeq.printSequence(out);
                         
                         if (name != "") {  names.push_back(name);  }
                         
@@ -404,7 +414,24 @@ int ListSeqsCommand::readGroup(){
                 exit(1);
         }
  }
-
+//**********************************************************************************************************************
+// Load the count table named by countfile and copy its sequence names into
+// `names`.  The names are left untouched if m->control_pressed is set once
+// the table has been read.  Always returns 0.
+int ListSeqsCommand::readCount(){
+       try {
+               CountTable countTable;
+               countTable.readTable(countfile);
+               
+               if (!m->control_pressed) {
+                       names = countTable.getNamesOfSeqs();
+               }
+               
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ListSeqsCommand", "readCount");
+               exit(1);
+       }
+}
  //**********************************************************************************************************************
  //alignreport file has a column header line then all other lines contain 16 columns.  we just want the first column since that contains the name
  int ListSeqsCommand::readAlign(){
diff --git a/listseqscommand.h b/listseqscommand.h

index 1a31a6dd432e837159be8d916f7ce2a25e4d4dec..8e4cce3c932100ff1ab88d49b346ff87e7791a06 100644 (file)
--- a/listseqscommand.h
+++ b/listseqscommand.h
@@ -34,7 +34,7 @@ class ListSeqsCommand : public Command {
         
         private:
                 vector<string> names, outputNames;
-               string fastafile, namefile, groupfile, alignfile, inputFileName, outputDir, listfile, taxfile;
+               string fastafile, namefile, groupfile, countfile, alignfile, inputFileName, outputDir, listfile, taxfile;
                 bool abort;
                 
                 int readFasta();
@@ -43,6 +43,7 @@ class ListSeqsCommand : public Command {
                 int readAlign();
                 int readList();
                 int readTax();
+        int readCount();
                 
  };
  
diff --git a/macros.h b/macros.h

new file mode 100755 (executable)

index 0000000..f95acbe
--- /dev/null
+++ b/macros.h
@@ -0,0 +1,32 @@
+//
+//  macros.h
+//  rrf-fs-prototype
+//
+//  Created by Abu Zaher Faridee on 5/28/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#ifndef rrf_fs_prototype_macros_h
+#define rrf_fs_prototype_macros_h
+
+#include "mothurout.h" 
+
+/***********************************************************************/
+// Chooses how many features go into a feature subset: either
+// ceil(log2(numFeatures)) for "log2" (the default) or
+// ceil(sqrt(numFeatures)) for "squareRoot".
+class OptimumFeatureSubsetSelector{
+public:
+  OptimumFeatureSubsetSelector(string selectionType = "log2"): selectionType(selectionType){
+  }
+  
+  // Returns the subset size for numFeatures features.
+  // Returns -1 when selectionType is neither "log2" nor "squareRoot",
+  // and 0 when numFeatures is less than 1.
+  int getOptimumFeatureSubsetSize(int numFeatures){
+
+    const bool useLog2 = (selectionType == "log2");
+    if (!useLog2 && selectionType != "squareRoot") { return -1; }
+    
+    // log2(0) is -infinity and sqrt of a negative value is NaN; converting
+    // either of those to int is undefined, so answer directly
+    if (numFeatures < 1) { return 0; }
+    
+    // an explicit double selects one overload; an int argument is ambiguous
+    // on compilers without the C++11 integer overloads
+    const double featureCount = (double)numFeatures;
+    return (int)ceil(useLog2 ? log2(featureCount) : sqrt(featureCount));
+  }
+private:
+  string selectionType;
+};
+
+/***********************************************************************/
+  
+#endif
diff --git a/makebiomcommand.cpp b/makebiomcommand.cpp

index 9e8d3e39cb58e31e9f3dca2eeff35257eadd537a..68e70ee3e48e58a89f75ef7547d7409620947cb2 100644 (file)
--- a/makebiomcommand.cpp
+++ b/makebiomcommand.cpp
@@ -549,15 +549,16 @@ vector<string> MakeBiomCommand::getMetaData(vector<SharedRAbundVector*>& lookup)
                  if (m->control_pressed) { return metadata; }
                  
                  //if there is a bin label use it otherwise make one
-                string binLabel = binTag;
-                string sbinNumber = otuLabels[i];
-                if (sbinNumber.length() < snumBins.length()) { 
-                    int diff = snumBins.length() - sbinNumber.length();
-                    for (int h = 0; h < diff; h++) { binLabel += "0"; }
-                }
-                binLabel += sbinNumber;
-                
-                labelTaxMap[binLabel] = taxs[i];
+                if (m->isContainingOnlyDigits(otuLabels[i])) {
+                    string binLabel = binTag;
+                    string sbinNumber = otuLabels[i];
+                    if (sbinNumber.length() < snumBins.length()) { 
+                        int diff = snumBins.length() - sbinNumber.length();
+                        for (int h = 0; h < diff; h++) { binLabel += "0"; }
+                    }
+                    binLabel += sbinNumber;
+                    labelTaxMap[binLabel] = taxs[i];
+                }else {  labelTaxMap[otuLabels[i]] = taxs[i]; }
              }
              
              
diff --git a/makecontigscommand.cpp b/makecontigscommand.cpp

index 691d706ab00efa8823412753b72f08e5198873b1..4ae25ce0e2623ec9db0dc6568a37df7e64382140 100644 (file)
--- a/makecontigscommand.cpp
+++ b/makecontigscommand.cpp
@@ -13,7 +13,15 @@ vector<string> MakeContigsCommand::setParameters(){
         try {
                 CommandParameter pfasta("ffastq", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
          CommandParameter prfasta("rfastq", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(prfasta);
-               CommandParameter palign("align", "Multiple", "needleman-gotoh", "needleman", "", "", "",false,false); parameters.push_back(palign);
+        CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
+               CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
+               CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
+        CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs);
+               CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs);
+        CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
+
+        CommandParameter palign("align", "Multiple", "needleman-gotoh", "needleman", "", "", "",false,false); parameters.push_back(palign);
+        CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles);
                 CommandParameter pmatch("match", "Number", "", "1.0", "", "", "",false,false); parameters.push_back(pmatch);
                 CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pmismatch);
                 CommandParameter pgapopen("gapopen", "Number", "", "-2.0", "", "", "",false,false); parameters.push_back(pgapopen);
@@ -37,15 +45,22 @@ string MakeContigsCommand::getHelpString(){
         try {
                 string helpString = "";
                 helpString += "The make.contigs command reads a forward fastq file and a reverse fastq file and outputs new fasta and quality files.\n";
-               helpString += "The make.contigs command parameters are ffastq, rfastq, align, match, mismatch, gapopen, gapextend and processors.\n";
+        helpString += "If an oligos file is provided barcodes and primers will be trimmed, and a group file will be created.\n";
+               helpString += "The make.contigs command parameters are ffastq, rfastq, oligos, tdiffs, bdiffs, ldiffs, sdiffs, pdiffs, align, match, mismatch, gapopen, gapextend, allfiles and processors.\n";
                 helpString += "The ffastq and rfastq parameters are required.\n";
                 helpString += "The align parameter allows you to specify the alignment method to use.  Your options are: gotoh and needleman. The default is needleman.\n";
+        helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n";
+               helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
+               helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
+        helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n";
+               helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n";
                 helpString += "The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n";
                 helpString += "The mistmatch parameter allows you to specify the penalty for having different bases.  The default is -1.0.\n";
                 helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n";
                 helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment.  The default is -1.0.\n";
          helpString += "The threshold parameter allows you to set a quality scores threshold. In the case where we are trying to decide whether to keep a base or remove it because the base is compared to a gap in the other fragment, if the base has a quality score below the threshold we eliminate it. Default=40.\n";
          helpString += "The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n";
+        helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
          helpString += "The make.contigs command should be in the following format: \n";
                 helpString += "make.contigs(ffastq=yourForwardFastqFile, rfastq=yourReverseFastqFile, align=yourAlignmentMethod) \n";
                 helpString += "Note: No spaces between parameter labels (i.e. ffastq), '=' and parameters (i.e.yourForwardFastqFile).\n";
@@ -68,6 +83,7 @@ string MakeContigsCommand::getOutputFileNameTag(string type, string inputName=""
          else {
              if (type == "fasta")             {   outputFileName =  "contigs.fasta";         }
              else if (type == "qfile")        {   outputFileName =  "contigs.qual";          }
+            else if (type == "group")            {   outputFileName =  "groups";   }
              else if (type == "mismatch")     {   outputFileName =  "contigs.mismatch";      }
              else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
          }
@@ -86,6 +102,7 @@ MakeContigsCommand::MakeContigsCommand(){
                 vector<string> tempOutNames;
                 outputTypes["fasta"] = tempOutNames;
                 outputTypes["qfile"] = tempOutNames;
+        outputTypes["group"] = tempOutNames;
          outputTypes["mismatch"] = tempOutNames;
         }
         catch(exception& e) {
@@ -121,6 +138,7 @@ MakeContigsCommand::MakeContigsCommand(string option)  {
                         outputTypes["fasta"] = tempOutNames;
                         outputTypes["qfile"] = tempOutNames;
              outputTypes["mismatch"] = tempOutNames;
+            outputTypes["group"] = tempOutNames;
                         
              
                         //if the user changes the input directory command factory will send this info to us in the output parameter 
@@ -143,6 +161,14 @@ MakeContigsCommand::MakeContigsCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["rfastq"] = inputDir + it->second;           }
                                 }
+                
+                it = parameters.find("oligos");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
+                               }
              }
              
              ffastqfile = validParameter.validFile(parameters, "ffastq", true);
@@ -153,6 +179,11 @@ MakeContigsCommand::MakeContigsCommand(string option)  {
                         if (rfastqfile == "not open") { rfastqfile = ""; abort = true; }        
                         else if (rfastqfile == "not found") { rfastqfile = ""; abort=true;  m->mothurOut("The rfastq parameter is required.\n"); }
              
+            oligosfile = validParameter.validFile(parameters, "oligos", true);
+                       if (oligosfile == "not found")      {   oligosfile = "";        }
+                       else if(oligosfile == "not open")   {   abort = true;       } 
+                       else {   m->setOligosFile(oligosfile);          }
+            
              //if the user changes the output directory command factory will send this info to us in the output parameter 
                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(ffastqfile);             }
                         
@@ -182,6 +213,26 @@ MakeContigsCommand::MakeContigsCommand(string option)  {
                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
                         m->setProcessors(temp);
                         m->mothurConvert(temp, processors);
+            
+            temp = validParameter.validFile(parameters, "bdiffs", false);              if (temp == "not found") { temp = "0"; }
+                       m->mothurConvert(temp, bdiffs);
+                       
+                       temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found") { temp = "0"; }
+                       m->mothurConvert(temp, pdiffs);
+            
+            temp = validParameter.validFile(parameters, "ldiffs", false);              if (temp == "not found") { temp = "0"; }
+                       m->mothurConvert(temp, ldiffs);
+            
+            temp = validParameter.validFile(parameters, "sdiffs", false);              if (temp == "not found") { temp = "0"; }
+                       m->mothurConvert(temp, sdiffs);
+                       
+                       temp = validParameter.validFile(parameters, "tdiffs", false);           if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs;  temp = toString(tempTotal); }
+                       m->mothurConvert(temp, tdiffs);
+                       
+                       if(tdiffs == 0){        tdiffs = bdiffs + pdiffs + ldiffs + sdiffs;     }
+
+            temp = validParameter.validFile(parameters, "allfiles", false);            if (temp == "not found") { temp = "F"; }
+                       allFiles = m->isTrue(temp);
                         
                         align = validParameter.validFile(parameters, "align", false);           if (align == "not found"){      align = "needleman";    }
                         if ((align != "needleman") && (align != "gotoh")) { m->mothurOut(align + " is not a valid alignment method. Options are needleman or gotoh. I will use needleman."); m->mothurOutEndLine(); align = "needleman"; }
@@ -239,6 +290,12 @@ int MakeContigsCommand::execute(){
                 if (itTypes != outputTypes.end()) {
                         if ((itTypes->second).size() != 0) { currentQual = (itTypes->second)[0]; m->setQualFile(currentQual); }
                 }
+        
+        string currentGroup = "";
+               itTypes = outputTypes.find("group");
+               if (itTypes != outputTypes.end()) {
+                       if ((itTypes->second).size() != 0) { currentGroup = (itTypes->second)[0]; m->setGroupFile(currentGroup); }
+               }
                 
          //output files created by command
                 m->mothurOutEndLine();
@@ -246,7 +303,6 @@ int MakeContigsCommand::execute(){
                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
                 m->mothurOutEndLine();
  
-        
          return 0;
      }
         catch(exception& e) {
@@ -700,6 +756,262 @@ bool MakeContigsCommand::checkReads(fastqRead& forward, fastqRead& reverse){
          exit(1);
      }
  }
+//***************************************************************************************************************
+//illumina data requires paired forward and reverse data, so BARCODE and PRIMER lines carry two oligos:
+//BARCODE   atgcatgc   atgcatgc    groupName 
+//PRIMER   atgcatgc   atgcatgc    groupName  
+//PRIMER   atgcatgc   atgcatgc  
+//LINKER and SPACER lines carry one oligo. Lines whose first token starts with '#' are skipped.
+//The reverse oligo of every pair is stored reverse-complemented (see reverseOligo).
+//Fills barcodes, primers, linker, spacer and the two name vectors from oligosfile, then sizes both
+//out-parameters as [barcode index][primer index]. When allFiles is set, each cell gets the fasta / qual
+//file name of that barcode/primer combination and the file is opened for output and closed again.
+//Returns false when no barcode or primer has a group name, true otherwise.
+bool MakeContigsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<vector<string> >& qualFileNames){
+    try {
+        ifstream in;
+        m->openInputFile(oligosfile, in);
+
+        string type, foligo, roligo, group;
+
+        int indexPrimer = 0;
+        int indexBarcode = 0;
+        set<string> uniquePrimers;
+        set<string> uniqueBarcodes;
+
+        //each pass reads one entry: the type token first, then its oligo(s) and optional name
+        while(!in.eof()){
+
+            in >> type;
+
+            if (m->debug) { m->mothurOut("[DEBUG]: reading type - " + type + ".\n"); }
+
+            if(type[0] == '#'){
+                while (!in.eof())   {   char c = in.get();  if (c == 10 || c == 13){    break;  }   } // get rest of line if there's any crap there
+                m->gobble(in);
+            }
+            else{
+                m->gobble(in);
+                //make type case insensitive
+                for(int i=0;i<type.length();i++){   type[i] = toupper(type[i]);  }
+
+                in >> foligo;
+
+                if (m->debug) { m->mothurOut("[DEBUG]: reading - " + foligo + ".\n"); }
+
+                //upper-case the oligo and store RNA 'U' as 'T'
+                for(int i=0;i<foligo.length();i++){
+                    foligo[i] = toupper(foligo[i]);
+                    if(foligo[i] == 'U')    {   foligo[i] = 'T';    }
+                }
+
+                if(type == "PRIMER"){
+                    m->gobble(in);
+
+                    in >> roligo;
+
+                    for(int i=0;i<roligo.length();i++){
+                        roligo[i] = toupper(roligo[i]);
+                        if(roligo[i] == 'U')    {   roligo[i] = 'T';    }
+                    }
+                    roligo = reverseOligo(roligo); //kept as the reverse complement of what the file lists
+
+                    group = "";
+
+                    // get rest of line in case there is a primer name
+                    while (!in.eof())   {
+                        char c = in.get();
+                        //get() only flags eof once nothing was left to read; stop before that marker joins the name
+                        if (in.eof() || c == 10 || c == 13){    break;  }
+                        else if (c == 32 || c == 9){;} //space or tab
+                        else {  group += c;  }
+                    }
+
+                    oligosPair newPrimer(foligo, roligo);
+
+                    //check for repeat primers (a repeat is reported, the pair is still stored below)
+                    string tempPair = foligo+roligo;
+                    if (uniquePrimers.count(tempPair) != 0) { m->mothurOut("primer pair " + newPrimer.forward + " " + newPrimer.reverse + " is in your oligos file already."); m->mothurOutEndLine();  }
+                    else { uniquePrimers.insert(tempPair); }
+
+                    if (m->debug) {  if (group != "") { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); }else{ m->mothurOut("[DEBUG]: no group for primer pair " + newPrimer.forward + " " + newPrimer.reverse + ".\n"); }  }
+
+                    primers[indexPrimer]=newPrimer; indexPrimer++;
+                    primerNameVector.push_back(group);
+                }else if(type == "BARCODE"){
+                    m->gobble(in);
+
+                    in >> roligo;
+
+                    for(int i=0;i<roligo.length();i++){
+                        roligo[i] = toupper(roligo[i]);
+                        if(roligo[i] == 'U')    {   roligo[i] = 'T';    }
+                    }
+                    roligo = reverseOligo(roligo);
+
+                    oligosPair newPair(foligo, roligo);
+
+                    // get rest of line, which holds the group name
+                    group = "";
+                    while (!in.eof())   {
+                        char c = in.get();
+                        if (in.eof() || c == 10 || c == 13){    break;  } //same eof guard as in the primer branch
+                        else if (c == 32 || c == 9){;} //space or tab
+                        else {  group += c;  }
+                    }
+
+                    if (m->debug) { m->mothurOut("[DEBUG]: barcode pair " + newPair.forward + " " + newPair.reverse + ", and group = " + group + ".\n"); }
+
+                    //check for repeat barcodes
+                    //NOTE(review): the message says "disregarding" but the pair is still stored below - confirm which is intended
+                    string tempPair = foligo+roligo;
+                    if (uniqueBarcodes.count(tempPair) != 0) { m->mothurOut("barcode pair " + newPair.forward + " " + newPair.reverse +  " is in your oligos file already, disregarding."); m->mothurOutEndLine();  }
+                    else { uniqueBarcodes.insert(tempPair); }
+
+                    barcodes[indexBarcode]=newPair; indexBarcode++;
+                    barcodeNameVector.push_back(group);
+                }else if(type == "LINKER"){
+                    linker.push_back(foligo);
+                }else if(type == "SPACER"){
+                    spacer.push_back(foligo);
+                }
+                else{   m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are primer, barcode, linker and spacer. Ignoring " + foligo + "."); m->mothurOutEndLine(); }
+            }
+            m->gobble(in);
+        }
+        in.close();
+
+        //without barcodes and without a named first primer there is nothing to split the output by.
+        //size() is tested first: primerNameVector is empty when the file has no PRIMER lines.
+        if(barcodeNameVector.size() == 0 && (primerNameVector.size() == 0 || primerNameVector[0] == "")){ allFiles = false;   }
+
+        //add in potential combos
+        if(barcodeNameVector.size() == 0){
+            oligosPair temp("", "");
+            barcodes[0] = temp;
+            barcodeNameVector.push_back("");
+        }
+
+        if(primerNameVector.size() == 0){
+            oligosPair temp("", "");
+            primers[0] = temp;
+            primerNameVector.push_back("");
+        }
+
+        //one cell per barcode/primer combination; the names stay blank unless allFiles is set
+        fastaFileNames.resize(barcodeNameVector.size());
+        for(int i=0;i<fastaFileNames.size();i++){
+            fastaFileNames[i].assign(primerNameVector.size(), "");
+        }
+        qualFileNames = fastaFileNames;
+
+        if(allFiles){
+            set<string> uniqueNames; //used to cleanup outputFileNames
+            for(map<int, oligosPair>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
+                for(map<int, oligosPair>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
+
+                    string primerName = primerNameVector[itPrimer->first];
+                    string barcodeName = barcodeNameVector[itBar->first];
+
+                    //group name is "barcode.primer", or whichever of the two names is not blank
+                    string comboGroupName = "";
+                    if(primerName == "")        {   comboGroupName = barcodeName;                       }
+                    else if(barcodeName == "")  {   comboGroupName = primerName;                        }
+                    else                        {   comboGroupName = barcodeName + "." + primerName;    }
+
+                    ofstream temp;
+                    string fastaFileName = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + comboGroupName + ".fasta";
+                    if (uniqueNames.count(fastaFileName) == 0) {
+                        outputNames.push_back(fastaFileName);
+                        outputTypes["fasta"].push_back(fastaFileName);
+                        uniqueNames.insert(fastaFileName);
+                    }
+
+                    fastaFileNames[itBar->first][itPrimer->first] = fastaFileName;
+                    m->openOutputFile(fastaFileName, temp);     temp.close();
+
+                    string qualFileName = outputDir + m->getRootName(m->getSimpleName(ffastqfile)) + comboGroupName + ".qual";
+                    if (uniqueNames.count(qualFileName) == 0) {
+                        outputNames.push_back(qualFileName);
+                        outputTypes["qfile"].push_back(qualFileName);
+                        uniqueNames.insert(qualFileName); //remember it, so combos sharing a group name list the file once
+                    }
+
+                    qualFileNames[itBar->first][itPrimer->first] = qualFileName;
+                    m->openOutputFile(qualFileName, temp);      temp.close();
+                }
+            }
+        }
+
+        //a group file needs at least one named barcode or primer
+        bool allBlank = true;
+        for (int i = 0; i < barcodeNameVector.size(); i++) {
+            if (barcodeNameVector[i] != "") {
+                allBlank = false;
+                break;
+            }
+        }
+        for (int i = 0; i < primerNameVector.size(); i++) {
+            if (primerNameVector[i] != "") {
+                allBlank = false;
+                break;
+            }
+        }
+
+        if (allBlank) {
+            m->mothurOut("[WARNING]: your oligos file does not contain any group names.  mothur will not create a groupfile."); m->mothurOutEndLine();
+            allFiles = false;
+            return false;
+        }
+
+        return true;
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "MakeContigsCommand", "getOligos");
+        exit(1);
+    }
+}
+//********************************************************************/
+string MakeContigsCommand::reverseOligo(string oligo){
+    //Returns the reverse complement of oligo: the input is walked back to front and each base is
+    //replaced by its IUPAC complement. 'U' is complemented like 'T'; any character without a case
+    //below (lower-case letters included) comes back as 'N'.
+    try {
+        string complemented = "";
+
+        //the index stays a signed int so the loop can stop once it drops below zero
+        for (int pos = oligo.length() - 1; pos >= 0; pos--) {
+            char base;
+            switch (oligo[pos]) {
+                case 'A':   base = 'T';   break;
+                case 'T':   base = 'A';   break;
+                case 'U':   base = 'A';   break;
+                case 'G':   base = 'C';   break;
+                case 'C':   base = 'G';   break;
+                case 'R':   base = 'Y';   break;   //two-base ambiguity codes swap in pairs
+                case 'Y':   base = 'R';   break;
+                case 'M':   base = 'K';   break;
+                case 'K':   base = 'M';   break;
+                case 'W':   base = 'W';   break;   //W and S map to themselves
+                case 'S':   base = 'S';   break;
+                case 'B':   base = 'V';   break;   //three-base ambiguity codes swap in pairs
+                case 'V':   base = 'B';   break;
+                case 'D':   base = 'H';   break;
+                case 'H':   base = 'D';   break;
+                default:    base = 'N';   break;
+            }
+            complemented += base;
+        }
+
+        return complemented;
+    }
+    catch(exception& e) {
+        //report through m->errorOut, then end the process
+        m->errorOut(e, "MakeContigsCommand", "reverseOligo");
+        exit(1);
+    }
+}
  //**********************************************************************************************************************
  
  
diff --git a/makecontigscommand.h b/makecontigscommand.h

index 2308b657acf7951be6373a527f44546b555a7d2f..84e43c01c9a0b8c7775c6a46494a6fc0fcb642a6 100644 (file)
--- a/makecontigscommand.h
+++ b/makecontigscommand.h
@@ -17,7 +17,7 @@
  #include "needlemanoverlap.hpp"
  #include "blastalign.hpp"
  #include "noalign.hpp"
-
+#include "trimoligos.h"
  
  struct fastqRead {
         vector<int> scores;
@@ -50,17 +50,31 @@ public:
      void help() { m->mothurOut(getHelpString()); }     
      
  private:
-    bool abort;
-    string outputDir, ffastqfile, rfastqfile, align;
+    bool abort, allFiles;
+    string outputDir, ffastqfile, rfastqfile, align, oligosfile;
         float match, misMatch, gapOpen, gapExtend;
-       int processors, longestBase, threshold;
+       int processors, longestBase, threshold, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
      vector<string> outputNames;
      
+    map<int, oligosPair> barcodes;
+       map<int, oligosPair> primers;
+    vector<string>  linker;
+    vector<string>  spacer;
+       vector<string> primerNameVector;        
+       vector<string> barcodeNameVector;       
+    
+       map<string, int> groupCounts;  
+    //map<string, int> combos;
+       //map<string, int> groupToIndex;
+    //vector<string> groupVector;
+    
      fastqRead readFastq(ifstream&);
      vector< vector<string> > readFastqFiles(int&);
      bool checkReads(fastqRead&, fastqRead&);
      int createProcesses(vector< vector<string> >, string, string, string);
      int driver(vector<string>, string, string, string);
+    bool getOligos(vector<vector<string> >&, vector<vector<string> >&);
+    string reverseOligo(string);
  };
  
  /**************************************************************************************************/
diff --git a/makefile b/makefile

index 32ede6e5fa70a2ab42f4aa6ebcc0382b32dc8d76..bc5a569e31cd2e12526bb20244adb75e58333ece 100644 (file)
--- a/makefile
+++ b/makefile
@@ -17,7 +17,7 @@ USECOMPRESSION ?= no
  MOTHUR_FILES="\"Enter_your_default_path_here\""
  RELEASE_DATE = "\"7/9/2012\""
  VERSION = "\"1.26.0\""
-FORTAN_COMPILER = gfortran
+FORTAN_COMPILER = /usr/local/gfortran/bin/gfortran
  FORTRAN_FLAGS = 
  
  # Optimize to level 3:
diff --git a/mgclustercommand.cpp b/mgclustercommand.cpp

index 477450475d479eb4308d9c2503086b9d7c03b051..1861aa5b5d8c19f547ed90e3ec57866a3b53ecd2 100644 (file)
--- a/mgclustercommand.cpp
+++ b/mgclustercommand.cpp
@@ -13,8 +13,8 @@
  vector<string> MGClusterCommand::setParameters(){      
         try {
                 CommandParameter pblast("blast", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pblast);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
-        CommandParameter pcount("count", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName",false,false); parameters.push_back(pname);
+               CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pcount);
                 CommandParameter plength("length", "Number", "", "5", "", "", "",false,false); parameters.push_back(plength);
                 CommandParameter ppenalty("penalty", "Number", "", "0.10", "", "", "",false,false); parameters.push_back(ppenalty);
                 CommandParameter pcutoff("cutoff", "Number", "", "0.70", "", "", "",false,false); parameters.push_back(pcutoff);
@@ -147,6 +147,14 @@ MGClusterCommand::MGClusterCommand(string option) {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
  
                         
diff --git a/mgclustercommand.h b/mgclustercommand.h

index c9c23c80f88d0ab0c98ae058748c544a561b76d3..b5b295f1edb4495d321a7e614db1e8309d8bc481 100644 (file)
--- a/mgclustercommand.h
+++ b/mgclustercommand.h
@@ -12,7 +12,6 @@
  
  #include "command.hpp"
  #include "readblast.h"
-#include "sparsematrix.hpp"
  #include "nameassignment.hpp"
  #include "cluster.hpp"
  #include "hcluster.h"
diff --git a/mothur.h b/mothur.h

index 25b803fa393e27d0c2811e62efdeb042ce390dec..cd14056b21ba82f9d6c07e650fcf210616e7a882 100644 (file)
--- a/mothur.h
+++ b/mothur.h
@@ -177,7 +177,13 @@ inline bool compareSpearman(spearmanRank left, spearmanRank right){
  //********************************************************************************************************************
  //sorts highest to lowest
  inline bool compareSeqPriorityNodes(seqPriorityNode left, seqPriorityNode right){
-       return (left.numIdentical > right.numIdentical);        
+       if (left.numIdentical > right.numIdentical) {
+        return true;
+    }else if (left.numIdentical == right.numIdentical) {
+        if (left.seq > right.seq) { return true; }
+        else { return false; }
+    }
+    return false;      
  } 
  //********************************************************************************************************************
  //sorts lowest to highest
diff --git a/mothurout.cpp b/mothurout.cpp

index 9704464bf80326c87d4afa556e3353b95fcbf37e..7d40e80ead727205f4bcc00269f78bd7f81057ca 100644 (file)
--- a/mothurout.cpp
+++ b/mothurout.cpp
@@ -939,7 +939,7 @@ string MothurOut::getFullPathName(string fileName){
                                 }
                         
                                 for (int i = index; i >= 0; i--) {
-                                       newFileName = dirs[i] +  "\\" + newFileName;            
+                                       newFileName = dirs[i] +  "\\\\" + newFileName;          
                                 }
                                 
                                 return newFileName;
@@ -1544,7 +1544,6 @@ vector<string> MothurOut::splitWhiteSpace(string input){
  //**********************************************************************************************************************
  int MothurOut::readTax(string namefile, map<string, string>& taxMap) {
         try {
-               
          //open input file
                 ifstream in;
                 openInputFile(namefile, in);
@@ -1575,6 +1574,23 @@ int MothurOut::readTax(string namefile, map<string, string>& taxMap) {
              }
                 }
                 in.close();
+        
+        if (rest != "") {
+            vector<string> pieces = splitWhiteSpace(rest);
+            
+            for (int i = 0; i < pieces.size(); i++) {
+                if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
+                else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
+                
+                if (pairDone) { 
+                    //are there confidence scores, if so remove them
+                    if (secondCol.find_first_of('(') != -1) {  removeConfidences(secondCol);   }
+                    taxMap[firstCol] = secondCol;
+                    if (debug) {  mothurOut("[DEBUG]: name = '" + firstCol + "' tax = '" + secondCol + "'\n");  }
+                    pairDone = false; 
+                }
+            } 
+        }
                 
                 return taxMap.size();
  
@@ -1587,7 +1603,6 @@ int MothurOut::readTax(string namefile, map<string, string>& taxMap) {
  /**********************************************************************************************************************/
  int MothurOut::readNames(string namefile, map<string, string>& nameMap, bool redund) { 
         try {
-               
                 //open input file
                 ifstream in;
                 openInputFile(namefile, in);
@@ -1618,6 +1633,23 @@ int MothurOut::readNames(string namefile, map<string, string>& nameMap, bool red
              }
                 }
                 in.close();
+        
+        if (rest != "") {
+            vector<string> pieces = splitWhiteSpace(rest);
+            
+            for (int i = 0; i < pieces.size(); i++) {
+                if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
+                else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
+                
+                if (pairDone) { 
+                    //parse names into vector
+                    vector<string> theseNames;
+                    splitAtComma(secondCol, theseNames);
+                    for (int i = 0; i < theseNames.size(); i++) {  nameMap[theseNames[i]] = firstCol;  }
+                    pairDone = false; 
+                }
+            }  
+        }
                 
                 return nameMap.size();
                 
@@ -1630,7 +1662,6 @@ int MothurOut::readNames(string namefile, map<string, string>& nameMap, bool red
  /**********************************************************************************************************************/
  int MothurOut::readNames(string namefile, map<string, string>& nameMap, int flip) { 
         try {
-               
                 //open input file
                 ifstream in;
                 openInputFile(namefile, in);
@@ -1658,6 +1689,20 @@ int MothurOut::readNames(string namefile, map<string, string>& nameMap, int flip
              }
                 }
                 in.close();
+        
+        if (rest != "") {
+            vector<string> pieces = splitWhiteSpace(rest);
+            
+            for (int i = 0; i < pieces.size(); i++) {
+                if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
+                else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
+                
+                if (pairDone) { 
+                    nameMap[secondCol] = firstCol;
+                    pairDone = false; 
+                }
+            } 
+        }
                 
                 return nameMap.size();
                 
@@ -1670,7 +1715,7 @@ int MothurOut::readNames(string namefile, map<string, string>& nameMap, int flip
  /**********************************************************************************************************************/
  int MothurOut::readNames(string namefile, map<string, string>& nameMap, map<string, int>& nameCount) { 
         try {
-               nameMap.clear(); nameCount.clear();
+               nameMap.clear(); nameCount.clear();
                 //open input file
                 ifstream in;
                 openInputFile(namefile, in);
@@ -1703,6 +1748,24 @@ int MothurOut::readNames(string namefile, map<string, string>& nameMap, map<stri
                 }
                 in.close();
                 
+        if (rest != "") {
+            vector<string> pieces = splitWhiteSpace(rest);
+            
+            for (int i = 0; i < pieces.size(); i++) {
+                if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
+                else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
+                
+                if (pairDone) { 
+                    //parse names into vector
+                    vector<string> theseNames;
+                    splitAtComma(secondCol, theseNames);
+                    for (int i = 0; i < theseNames.size(); i++) {  nameMap[theseNames[i]] = firstCol;  }
+                    nameCount[firstCol] = theseNames.size();
+                    pairDone = false; 
+                }
+            }
+
+        }
                 return nameMap.size();
                 
         }
@@ -1714,7 +1777,6 @@ int MothurOut::readNames(string namefile, map<string, string>& nameMap, map<stri
  /**********************************************************************************************************************/
  int MothurOut::readNames(string namefile, map<string, string>& nameMap) { 
         try {
-               
                 //open input file
                 ifstream in;
                 openInputFile(namefile, in);
@@ -1739,6 +1801,17 @@ int MothurOut::readNames(string namefile, map<string, string>& nameMap) {
              }
                 }
                 in.close();
+        
+        if (rest != "") {
+            vector<string> pieces = splitWhiteSpace(rest);
+            
+            for (int i = 0; i < pieces.size(); i++) {
+                if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
+                else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
+                
+                if (pairDone) { nameMap[firstCol] = secondCol; pairDone = false; }
+            }
+        }
                 
                 return nameMap.size();
                 
@@ -1750,8 +1823,7 @@ int MothurOut::readNames(string namefile, map<string, string>& nameMap) {
  }
  /**********************************************************************************************************************/
  int MothurOut::readNames(string namefile, map<string, vector<string> >& nameMap) { 
-       try {
-               
+       try {        
                 //open input file
                 ifstream in;
                 openInputFile(namefile, in);
@@ -1782,6 +1854,22 @@ int MothurOut::readNames(string namefile, map<string, vector<string> >& nameMap)
                 }
                 in.close();
          
+        if (rest != "") {
+            vector<string> pieces = splitWhiteSpace(rest);
+            
+            for (int i = 0; i < pieces.size(); i++) {
+                if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
+                else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
+                
+                if (pairDone) { 
+                    vector<string> temp;
+                    splitAtComma(secondCol, temp);
+                    nameMap[firstCol] = temp;
+                    pairDone = false;  
+                } 
+            }
+        }
+        
                 return nameMap.size();
         }
         catch(exception& e) {
@@ -1792,7 +1880,6 @@ int MothurOut::readNames(string namefile, map<string, vector<string> >& nameMap)
  /**********************************************************************************************************************/
  map<string, int> MothurOut::readNames(string namefile) { 
         try {
-               
                 map<string, int> nameMap;
                 
                 //open input file
@@ -1823,6 +1910,20 @@ map<string, int> MothurOut::readNames(string namefile) {
              }
                 }
          in.close();
+        
+        if (rest != "") {
+            vector<string> pieces = splitWhiteSpace(rest);
+            for (int i = 0; i < pieces.size(); i++) {
+                if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
+                else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
+                
+                if (pairDone) { 
+                    int num = getNumNames(secondCol);
+                    nameMap[firstCol] = num;
+                    pairDone = false;  
+                } 
+            }
+        }
                 
                 return nameMap;
                 
@@ -1875,6 +1976,29 @@ int MothurOut::readNames(string namefile, vector<seqPriorityNode>& nameVector, m
                 }
          in.close();
          
+        if (rest != "") {
+            vector<string> pieces = splitWhiteSpace(rest);
+            
+            for (int i = 0; i < pieces.size(); i++) {
+                if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
+                else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
+                
+                if (pairDone) { 
+                    int num = getNumNames(secondCol);
+                    
+                    map<string, string>::iterator it = fastamap.find(firstCol);
+                    if (it == fastamap.end()) {
+                        error = 1;
+                        mothurOut("[ERROR]: " + firstCol + " is not in your fastafile, but is in your namesfile, please correct."); mothurOutEndLine();
+                    }else {
+                        seqPriorityNode temp(num, it->second, firstCol);
+                        nameVector.push_back(temp);
+                    }
+                    
+                    pairDone = false;  
+                } 
+            }
+        }
                 return error;
         }
         catch(exception& e) {
@@ -1885,7 +2009,7 @@ int MothurOut::readNames(string namefile, vector<seqPriorityNode>& nameVector, m
  //**********************************************************************************************************************
  set<string> MothurOut::readAccnos(string accnosfile){
         try {
-               set<string> names;
+               set<string> names;
                 ifstream in;
                 openInputFile(accnosfile, in);
                 string name;
@@ -1903,6 +2027,10 @@ set<string> MothurOut::readAccnos(string accnosfile){
          }
                 in.close();     
                 
+        if (rest != "") {
+            vector<string> pieces = splitWhiteSpace(rest);
+            for (int i = 0; i < pieces.size(); i++) {  names.insert(pieces[i]);  } 
+        }
                 return names;
         }
         catch(exception& e) {
@@ -1930,6 +2058,11 @@ int MothurOut::readAccnos(string accnosfile, vector<string>& names){
              for (int i = 0; i < pieces.size(); i++) {  names.push_back(pieces[i]);  }
          }
                 in.close();     
+        
+        if (rest != "") {
+            vector<string> pieces = splitWhiteSpace(rest);
+            for (int i = 0; i < pieces.size(); i++) {  names.push_back(pieces[i]);  }
+        }
                 
                 return 0;
         }
@@ -1981,6 +2114,32 @@ int MothurOut::getNumChar(string line, char c){
                 exit(1);
         }
  }
+//**********************************************************************************************************************
+//returns true when every string in subset also occurs somewhere in bigset (linear scan of bigset per entry)
+bool MothurOut::isSubset(vector<string> bigset, vector<string> subset) {
+       try {
+               //a candidate holding more entries than bigset is rejected outright
+               if (bigset.size() < subset.size()) { return false;  }
+               
+               //look up each entry of subset in bigset
+               for (int s = 0; s < subset.size(); s++) {
+                       //walk bigset until the current entry is found or bigset is exhausted
+                       int b = 0;
+                       while ((b < bigset.size()) && (bigset[b] != subset[s])) { b++; }
+                       
+                       //ran off the end of bigset, so this entry of subset has no match
+                       if (b == bigset.size()) { return false; }
+               }
+               
+               //every entry of subset was located in bigset
+               return true;
+        
+       }
+       catch(exception& e) {
+               errorOut(e, "MothurOut", "isSubset");
+               exit(1);
+       }
+}
  /***********************************************************************/
  int MothurOut::mothurRemove(string filename){
         try {
@@ -2298,30 +2457,29 @@ void MothurOut::splitAtDash(string& estim, vector<string>& container) {
         try {
                 string individual = "";
                 int estimLength = estim.size();
+               bool prevEscape = false;
                 for(int i=0;i<estimLength;i++){
-                       if(estim[i] == '-'){
-                               container.push_back(individual);
-                               individual = "";                                
+                       if(prevEscape){
+                               individual += estim[i];
+                               prevEscape = false;
                         }
                         else{
-                               individual += estim[i];
+                               if(estim[i] == '\\'){
+                                       prevEscape = true;
+                               }
+                               else if(estim[i] == '-'){
+                                       container.push_back(individual);
+                                       individual = "";
+                                       prevEscape = false;                             
+                               }
+                               else{
+                                       individual += estim[i];
+                                       prevEscape = false;
+                               }
                         }
                 }
                 container.push_back(individual);
-
-       
-       /*      string individual;
-               
-               while (estim.find_first_of('-') != -1) {
-                       individual = estim.substr(0,estim.find_first_of('-'));
-                       if ((estim.find_first_of('-')+1) <= estim.length()) { //checks to make sure you don't have dash at end of string
-                               estim = estim.substr(estim.find_first_of('-')+1, estim.length());
-                               container.push_back(individual);
-                       }
-               }
-               //get last one
-               container.push_back(estim); */
-       }
+       }
         catch(exception& e) {
                 errorOut(e, "MothurOut", "splitAtDash");
                 exit(1);
@@ -2334,29 +2492,29 @@ void MothurOut::splitAtDash(string& estim, set<string>& container) {
         try {
                 string individual = "";
                 int estimLength = estim.size();
+               bool prevEscape = false;
                 for(int i=0;i<estimLength;i++){
-                       if(estim[i] == '-'){
-                               container.insert(individual);
-                               individual = "";                                
+                       if(prevEscape){
+                               individual += estim[i];
+                               prevEscape = false;
                         }
                         else{
-                               individual += estim[i];
+                               if(estim[i] == '\\'){
+                                       prevEscape = true;
+                               }
+                               else if(estim[i] == '-'){
+                                       container.insert(individual);
+                                       individual = "";
+                                       prevEscape = false;                             
+                               }
+                               else{
+                                       individual += estim[i];
+                                       prevEscape = false;
+                               }
                         }
                 }
                 container.insert(individual);
-
-       //      string individual;
-               
-       //      while (estim.find_first_of('-') != -1) {
-       //              individual = estim.substr(0,estim.find_first_of('-'));
-       //              if ((estim.find_first_of('-')+1) <= estim.length()) { //checks to make sure you don't have dash at end of string
-       //                      estim = estim.substr(estim.find_first_of('-')+1, estim.length());
-       //                      container.insert(individual);
-       //              }
-       //      }
-               //get last one
-       //      container.insert(estim);
-       
+        
         }
         catch(exception& e) {
                 errorOut(e, "MothurOut", "splitAtDash");
@@ -2367,19 +2525,32 @@ void MothurOut::splitAtDash(string& estim, set<string>& container) {
  //This function parses the line options and puts them in a set
  void MothurOut::splitAtDash(string& estim, set<int>& container) {
         try {
-               string individual;
+               string individual = "";
                 int lineNum;
-               
-               while (estim.find_first_of('-') != -1) {
-                       individual = estim.substr(0,estim.find_first_of('-'));
-                       if ((estim.find_first_of('-')+1) <= estim.length()) { //checks to make sure you don't have dash at end of string
-                               estim = estim.substr(estim.find_first_of('-')+1, estim.length());
-                               convert(individual, lineNum); //convert the string to int
-                               container.insert(lineNum);
+               int estimLength = estim.size();
+               bool prevEscape = false;
+               for(int i=0;i<estimLength;i++){
+                       if(prevEscape){
+                               individual += estim[i];
+                               prevEscape = false;
+                       }
+                       else{
+                               if(estim[i] == '\\'){
+                                       prevEscape = true;
+                               }
+                               else if(estim[i] == '-'){
+                                       convert(individual, lineNum); //convert the string to int
+                                       container.insert(lineNum);
+                                       individual = "";
+                                       prevEscape = false;                             
+                               }
+                               else{
+                                       individual += estim[i];
+                                       prevEscape = false;
+                               }
                         }
                 }
-               //get last one
-               convert(estim, lineNum); //convert the string to int
+               convert(individual, lineNum); //convert the string to int
                 container.insert(lineNum);
         }
         catch(exception& e) {
@@ -2719,8 +2890,53 @@ int MothurOut::removeConfidences(string& tax) {
         }
  }
  /**************************************************************************************************/
-
-
+//returns a copy of tax with every single quote (') and double quote (") character removed
+string MothurOut::removeQuotes(string tax) {
+       try {
+               string newTax = "";
+               
+               for (int i = 0; i < tax.length(); i++) {
+                       //when control_pressed is set, stop early and hand back what has been copied so far
+                       if (control_pressed) { return newTax; }
+                       
+                       //copy everything except the two quote characters
+                       if ((tax[i] != '\'') && (tax[i] != '\"')) { newTax += tax[i]; }
+                       
+               }
+               
+               return newTax;
+       }
+       catch(exception& e) {
+               errorOut(e, "MothurOut", "removeQuotes");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+// computes the population standard deviation (divides by N, not N-1) of featureVector; returns 0 for an empty vector
+double MothurOut::getStandardDeviation(vector<int>& featureVector){
+    try {
+        //an empty vector would turn both divisions below into 0/0, so report no spread instead
+        if (featureVector.size() == 0) { return 0.0; }
+        
+        //find the mean
+        double average = 0; 
+        for (int i = 0; i < featureVector.size(); i++) { average += featureVector[i]; }
+        average /= (double) featureVector.size();
+        
+        //sum the squared difference of each value from the mean
+        double stdDev = 0;
+        for (int i = 0; i < featureVector.size(); i++) {
+            stdDev += ((featureVector[i] - average) * (featureVector[i] - average));
+        }
+        //the mean squared difference is the variance; its square root is the standard deviation
+        return sqrt(stdDev / (double) featureVector.size());
+    }
+       catch(exception& e) {
+               errorOut(e, "MothurOut", "getStandardDeviation");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
  
  
  
diff --git a/mothurout.h b/mothurout.h

index 77c5a804070eaa2dd3229375df999a5b9521dc92..53d4250c771cdbdac15c715c44d97fabf0a92c1a 100644 (file)
--- a/mothurout.h
+++ b/mothurout.h
@@ -140,7 +140,9 @@ class MothurOut {
                 void splitAtChar(string&, vector<string>&, char);
          void splitAtChar(string&, string&, char);
                 int removeConfidences(string&);
+        string removeQuotes(string);
          string makeList(vector<string>&);
+        bool isSubset(vector<string>, vector<string>); //bigSet, subset
                 
                 //math operation
                 int factorial(int num);
@@ -149,6 +151,7 @@ class MothurOut {
                 float roundDist(float, int);
                 unsigned int fromBase36(string);
                 int getRandomIndex(int); //highest
+        double getStandardDeviation(vector<int>&);
  
                 int control_pressed;
                 bool executing, runParse, jumble, gui, mothurCalling, debug;
diff --git a/parsefastaqcommand.cpp b/parsefastaqcommand.cpp

index 1331b7f47f4e131cb5a76913889d275b7ec93121..816bdb5d38c53afb849be7d5e65b458ac8582148 100644 (file)
--- a/parsefastaqcommand.cpp
+++ b/parsefastaqcommand.cpp
@@ -16,7 +16,8 @@ vector<string> ParseFastaQCommand::setParameters(){
                 CommandParameter pfastq("fastq", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfastq);
                 CommandParameter pfasta("fasta", "Bool", "", "T", "", "", "",false,false); parameters.push_back(pfasta);
                 CommandParameter pqual("qfile", "Bool", "", "T", "", "", "",false,false); parameters.push_back(pqual);
-               CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
+               CommandParameter pformat("format", "Multiple", "sanger-illumina-solexa", "sanger", "", "", "",false,false); parameters.push_back(pformat);
+        CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
                 
                 vector<string> myArray;
@@ -33,8 +34,9 @@ string ParseFastaQCommand::getHelpString(){
         try {
                 string helpString = "";
                 helpString += "The fastq.info command reads a fastq file and creates a fasta and quality file.\n";
-               helpString += "The fastq.info command parameters are fastq, fasta and qfile; fastq is required.\n";
-               helpString += "The fastq.info command should be in the following format: fastq.info(fastaq=yourFastaQFile).\n";
+               helpString += "The fastq.info command parameters are fastq, fasta, qfile and format; fastq is required.\n";
+        helpString += "The fastq.info command should be in the following format: fastq.info(fastaq=yourFastaQFile).\n";
+               helpString += "The format parameter is used to indicate whether your sequences are sanger, solexa or illumina, default=sanger.\n";
          helpString += "The fasta parameter allows you to indicate whether you want a fasta file generated. Default=T.\n";
          helpString += "The qfile parameter allows you to indicate whether you want a quality file generated. Default=T.\n";
                 helpString += "Example fastq.info(fastaq=test.fastaq).\n";
@@ -138,6 +140,13 @@ ParseFastaQCommand::ParseFastaQCommand(string option){
                         temp = validParameter.validFile(parameters, "qfile", false);    if(temp == "not found"){        temp = "T";     }
                         qual = m->isTrue(temp); 
                         
+            format = validParameter.validFile(parameters, "format", false);            if (format == "not found"){     format = "sanger";      }
+            
+            if ((format != "sanger") && (format != "illumina") && (format != "solexa"))  { 
+                               m->mothurOut(format + " is not a valid format. Your format choices are sanger, solexa and illumina, aborting." ); m->mothurOutEndLine();
+                               abort=true;
+                       }
+
                         if ((!fasta) && (!qual)) { m->mothurOut("[ERROR]: no outputs selected. Aborting."); m->mothurOutEndLine(); abort=true; }
  
                 }               
@@ -163,6 +172,12 @@ int ParseFastaQCommand::execute(){
                 
                 ifstream in;
                 m->openInputFile(fastaQFile, in);
+        
+        //fill convert table - goes from solexa to sanger. Used fq_all2std.pl as a reference.
+        for (int i = -64; i < 65; i++) { 
+            char temp = (char) ((int)(33 + 10*log(1+pow(10,(i/10.0)))/log(10)+0.499));
+            convertTable.push_back(temp);
+        }
                 
                 while (!in.eof()) {
                         
@@ -238,12 +253,18 @@ vector<int> ParseFastaQCommand::convertQual(string qual) {
         try {
                 vector<int> qualScores;
                 
-               int controlChar = int('@');
-               
                 for (int i = 0; i < qual.length(); i++) { 
-                       int temp = int(qual[i]);
-                       temp -= controlChar;
-                       
+            
+            int temp = 0;
+            temp = int(qual[i]);
+            if (format == "illumina") {
+                temp -= 64; //char '@'
+            }else if (format == "solexa") {
+                temp = int(convertTable[temp]); //convert to sanger
+                temp -= 33; //char '!'
+            }else {
+                temp -= 33; //char '!'
+            }
                         qualScores.push_back(temp);
                 }
                 
diff --git a/parsefastaqcommand.h b/parsefastaqcommand.h

index 4481b98bb23bdbcd47a6059b2df0904c176ae9bf..96fcb7d6799358ff4e10f930d1ad4165763b46a0 100644 (file)
--- a/parsefastaqcommand.h
+++ b/parsefastaqcommand.h
@@ -34,10 +34,11 @@ public:
  private:
  
         vector<string> outputNames;     
-       string outputDir, fastaQFile;
+       string outputDir, fastaQFile, format;
         bool abort, fasta, qual;
         
         vector<int> convertQual(string);
+    vector<char> convertTable;
  };
  
  #endif
diff --git a/parsimony.cpp b/parsimony.cpp

index 3b0f31759c0563031874aa16cb0d3d515a65653b..6a0485c133b2868670e49e5873186e5d10df474d 100644 (file)
--- a/parsimony.cpp
+++ b/parsimony.cpp
@@ -15,7 +15,7 @@ EstOutput Parsimony::getValues(Tree* t, int p, string o) {
         try {
                 processors = p;
                 outputDir = o;
-        TreeMap* tmap = t->getTreeMap();
+        CountTable* ct = t->getCountTable();
                 
                 //if the users enters no groups then give them the score of all groups
                 vector<string> mGroups = m->getGroups();
@@ -38,7 +38,7 @@ EstOutput Parsimony::getValues(Tree* t, int p, string o) {
                         vector<string> groups;
                         if (numGroups == 0) {
                                 //get score for all users groups
-                               vector<string> tGroups = tmap->getNamesOfGroups();
+                               vector<string> tGroups = ct->getNamesOfGroups();
                                 for (int i = 0; i < tGroups.size(); i++) {
                                         if (tGroups[i] != "xxx") {
                                                 groups.push_back(tGroups[i]);
@@ -57,7 +57,7 @@ EstOutput Parsimony::getValues(Tree* t, int p, string o) {
                 
         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
                 if(processors == 1){
-                       data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap);
+                       data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), ct);
                 }else{
                         lines.clear();
                         int numPairs = namesOfGroupCombos.size();
@@ -74,10 +74,10 @@ EstOutput Parsimony::getValues(Tree* t, int p, string o) {
                                 lines.push_back(linePair(startPos, numPairsPerProcessor));
                         }
                         
-                       data = createProcesses(t, namesOfGroupCombos, tmap);
+                       data = createProcesses(t, namesOfGroupCombos, ct);
                 }
         #else
-               data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap);
+               data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), ct);
         #endif
                 
                 return data;
@@ -90,7 +90,7 @@ EstOutput Parsimony::getValues(Tree* t, int p, string o) {
  }
  /**************************************************************************************************/
  
-EstOutput Parsimony::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos, TreeMap* tmap) {
+EstOutput Parsimony::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos, CountTable* ct) {
         try {
  #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
                 int process = 1;
@@ -107,7 +107,7 @@ EstOutput Parsimony::createProcesses(Tree* t, vector< vector<string> > namesOfGr
                                 process++;
                         }else if (pid == 0){
                                 EstOutput myresults;
-                               myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, tmap);
+                               myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, ct);
                                 
                                 if (m->control_pressed) { exit(0); }
                                 
@@ -127,7 +127,7 @@ EstOutput Parsimony::createProcesses(Tree* t, vector< vector<string> > namesOfGr
                         }
                 }
                 
-               results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, tmap);
+               results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, ct);
                 
                 //force parent to wait until all the processes are done
                 for (int i=0;i<processIDS.size();i++) { 
@@ -170,12 +170,12 @@ EstOutput Parsimony::createProcesses(Tree* t, vector< vector<string> > namesOfGr
         }
  }
  /**************************************************************************************************/
-EstOutput Parsimony::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num, TreeMap* tmap) { 
+EstOutput Parsimony::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num, CountTable* ct) { 
         try {
                 
                 EstOutput results; results.resize(num);
                 
-               Tree* copyTree = new Tree(tmap);
+               Tree* copyTree = new Tree(ct);
                 int count = 0;
                 
                 for (int h = start; h < (start+num); h++) {
diff --git a/parsimony.h b/parsimony.h

index 7316d508dd52729c8ddec87c34a08c76586cd4d1..bf0e0d4f90198ef353c2c1899414f699454c8a7e 100644 (file)
--- a/parsimony.h
+++ b/parsimony.h
@@ -12,7 +12,7 @@
   */
  
  #include "treecalculator.h"
-#include "treemap.h"
+#include "counttable.h"
  
  /***********************************************************************/
  
@@ -35,8 +35,8 @@ class Parsimony : public TreeCalculator  {
                 int processors;
                 string outputDir;
         
-               EstOutput driver(Tree*, vector< vector<string> >, int, int, TreeMap*); 
-               EstOutput createProcesses(Tree*, vector< vector<string> >, TreeMap*);
+               EstOutput driver(Tree*, vector< vector<string> >, int, int, CountTable*); 
+               EstOutput createProcesses(Tree*, vector< vector<string> >, CountTable*);
  };
  
  /***********************************************************************/
diff --git a/parsimonycommand.cpp b/parsimonycommand.cpp

index f124b6002ee2c247db14716b4be81cb9ec10d8f2..eabbb5934445f92345eccd30f9bfb2139e652543 100644 (file)
--- a/parsimonycommand.cpp
+++ b/parsimonycommand.cpp
@@ -14,8 +14,9 @@
  vector<string> ParsimonyCommand::setParameters(){      
         try {
                 CommandParameter ptree("tree", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptree);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
                 CommandParameter prandom("random", "String", "", "", "", "", "",false,false); parameters.push_back(prandom);
                 CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
@@ -36,7 +37,7 @@ vector<string> ParsimonyCommand::setParameters(){
  string ParsimonyCommand::getHelpString(){      
         try {
                 string helpString = "";
-               helpString += "The parsimony command parameters are tree, group, name, random, groups, processors and iters.  tree parameter is required unless you have valid current tree file or are using random.\n";
+               helpString += "The parsimony command parameters are tree, group, name, count, random, groups, processors and iters.  tree parameter is required unless you have valid current tree file or are using random.\n";
                 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed.  You must enter at least 1 valid group.\n";
                 helpString += "The group names are separated by dashes.  The iters parameter allows you to specify how many random trees you would like compared to your tree.\n";
                 helpString += "The parsimony command should be in the following format: parsimony(random=yourOutputFilename, groups=yourGroups, iters=yourIters).\n";
@@ -145,6 +146,14 @@ ParsimonyCommand::ParsimonyCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
                         
                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
@@ -172,6 +181,20 @@ ParsimonyCommand::ParsimonyCommand(string option)  {
                                 if (namefile == "not open") { namefile = ""; abort = true; }
                                 else if (namefile == "not found") { namefile = ""; }
                                 else { m->setNameFile(namefile); }
+                
+                countfile = validParameter.validFile(parameters, "count", true);
+                if (countfile == "not open") { countfile = ""; abort = true; }
+                else if (countfile == "not found") { countfile = "";  }        
+                else { m->setCountTableFile(countfile); }
+                
+                if ((namefile != "") && (countfile != "")) {
+                    m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
+                }
+                
+                if ((groupfile != "") && (countfile != "")) {
+                    m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
+                }
+
                         }
                         
                         //if the user changes the output directory command factory will send this info to us in the output parameter 
@@ -193,10 +216,12 @@ ParsimonyCommand::ParsimonyCommand(string option)  {
                         m->setProcessors(temp);
                         m->mothurConvert(temp, processors);
                         
-                       if (namefile == "") {
-                               vector<string> files; files.push_back(treefile);
-                               parser.getNameFile(files);
-                       }
+                       if (countfile=="") {
+                if (namefile == "") {
+                    vector<string> files; files.push_back(treefile);
+                    parser.getNameFile(files);
+                } 
+            }
                         
                 }
  
@@ -219,9 +244,11 @@ int ParsimonyCommand::execute() {
                         
                         m->setTreeFile(treefile);
                         
-            TreeReader* reader = new TreeReader(treefile, groupfile, namefile);
+            TreeReader* reader;
+            if (countfile == "") { reader = new TreeReader(treefile, groupfile, namefile); }
+            else { reader = new TreeReader(treefile, countfile); }
              T = reader->getTrees();
-            tmap = T[0]->getTreeMap();
+            ct = T[0]->getCountTable();
              delete reader;
         
                         if(outputDir == "") { outputDir += m->hasPath(treefile); }
@@ -245,7 +272,7 @@ int ParsimonyCommand::execute() {
                 //set users groups to analyze
                 SharedUtil util;
                 vector<string> mGroups = m->getGroups();
-               vector<string> tGroups = tmap->getNamesOfGroups();
+               vector<string> tGroups = ct->getNamesOfGroups();
                 util.setGroups(mGroups, tGroups, allGroups, numGroups, "parsimony");    //sets the groups the user wants to analyze
                 util.getCombos(groupComb, mGroups, numComp);
                 m->setGroups(mGroups);
@@ -260,7 +287,7 @@ int ParsimonyCommand::execute() {
                 
                 if (m->control_pressed) { 
                         delete reading; delete output;
-                       delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
+                       delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; }
                         if (randomtree == "") {  outSum.close();  }
                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear();
                         m->clearGroups();
@@ -285,7 +312,7 @@ int ParsimonyCommand::execute() {
                                 
                                 if (m->control_pressed) { 
                                         delete reading; delete output;
-                                       delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
+                                       delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; }
                                         if (randomtree == "") {  outSum.close();  }
                                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear();
                                         m->clearGroups();
@@ -314,7 +341,7 @@ int ParsimonyCommand::execute() {
                         for (int j = 0; j < iters; j++) {
                                                                 
                                 //create new tree with same num nodes and leaves as users
-                               randT = new Tree(tmap);
+                               randT = new Tree(ct);
  
                                 //create random relationships between nodes
                                 randT->assembleRandomTree();
@@ -326,7 +353,7 @@ int ParsimonyCommand::execute() {
                                         delete reading;  delete output; delete randT;
                                         if (randomtree == "") {  outSum.close();  }
                                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear();
-                                       delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
+                                       delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; }
                                         m->clearGroups();
                                         return 0;
                                 }
@@ -355,13 +382,13 @@ int ParsimonyCommand::execute() {
                         for (int j = 0; j < iters; j++) {
                                                                 
                                 //create new tree with same num nodes and leaves as users
-                               randT = new Tree(tmap);
+                               randT = new Tree(ct);
                                 //create random relationships between nodes
  
                                 randT->assembleRandomTree();
                                 
                                 if (m->control_pressed) { 
-                                       delete reading; delete output; delete randT; delete tmap; 
+                                       delete reading; delete output; delete randT; delete ct; 
                                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear(); return 0;
                                 }
  
@@ -370,7 +397,7 @@ int ParsimonyCommand::execute() {
                                 randomData = pars.getValues(randT, processors, outputDir);
                                 
                                 if (m->control_pressed) { 
-                                       delete reading; delete output; delete randT; delete tmap; 
+                                       delete reading; delete output; delete randT; delete ct; 
                                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear(); return 0;
                                 }
                         
@@ -424,7 +451,7 @@ int ParsimonyCommand::execute() {
                 
                 if (m->control_pressed) { 
                                 delete reading; delete output;
-                               delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
+                               delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; }
                                 if (randomtree == "") {  outSum.close();  }
                                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear();
                                 return 0;
@@ -437,7 +464,7 @@ int ParsimonyCommand::execute() {
                 printParsimonyFile();
                 if (randomtree == "") { printUSummaryFile(); }
                                 
-        delete output; delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
+        delete output; delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; }
                 
                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); } outputTypes.clear(); return 0;}
                 
@@ -529,7 +556,7 @@ void ParsimonyCommand::getUserInput() {
         try {
         
                 //create treemap
-               tmap = new TreeMap();
+               ct = new CountTable();
  
                 m->mothurOut("Please enter the number of groups you would like to analyze: ");
                 cin >> numGroups;
@@ -539,30 +566,31 @@ void ParsimonyCommand::getUserInput() {
                 count = 1;
                 numEachGroup.resize(numGroups, 0);  
                 
-               
+        set<string> nameMap;
+        map<string, string> groupMap;
+        set<string> gps;
+                
                 for (int i = 1; i <= numGroups; i++) {
                         m->mothurOut("Please enter the number of sequences in group " + toString(i) +  ": ");
                         cin >> num;
                         m->mothurOutJustToLog(toString(num)); m->mothurOutEndLine();
-                               
-                       //set tmaps seqsPerGroup
-                       tmap->seqsPerGroup[toString(i)] = num;
-                       tmap->addGroup(toString(i));
                         
+            gps.insert(toString(i));
+            
                         //set tmaps namesOfSeqs
                         for (int j = 0; j < num; j++) {
-                               tmap->namesOfSeqs.push_back(toString(count));
-                               tmap->treemap[toString(count)].groupname = toString(i);
+                               groupMap[toString(count)] = i;
+                               nameMap.insert(toString(count));
                                 count++;
                         }
                 }
-               
+               ct->createTable(nameMap, groupMap, gps);
+        
                 //clears buffer so next command doesn't have error
                 string s;       
                 getline(cin, s);
                 
-               m->Treenames = tmap->namesOfSeqs; 
-               
+               m->Treenames = ct->getNamesOfSeqs(); 
         }
         catch(exception& e) {
                 m->errorOut(e, "ParsimonyCommand", "getUserInput");
diff --git a/parsimonycommand.h b/parsimonycommand.h

index 79613f560746e443697e4e92bfed755605a6fe4a..38a7505125e52b39ad37f36df592dfc9cac3b54c 100644 (file)
--- a/parsimonycommand.h
+++ b/parsimonycommand.h
@@ -11,7 +11,7 @@
  
  #include "command.hpp"
  #include "parsimony.h"
-#include "treemap.h"
+#include "counttable.h"
  #include "progress.hpp"
  #include "sharedutilities.h"
  #include "fileoutput.h"
@@ -41,10 +41,10 @@ private:
         vector<Tree*> T;           //user trees
         Tree* randT;  //random tree
         Tree* copyUserTree; 
-       TreeMap* tmap; 
-       TreeMap* savetmap;
+       CountTable* ct; 
+       CountTable* savect;
         vector<string> groupComb; // AB. AC, BC...
-       string sumFile, randomtree, allGroups, outputDir, treefile, groupfile, namefile;
+       string sumFile, randomtree, allGroups, outputDir, treefile, groupfile, namefile, countfile;
         int iters, numGroups, numComp, counter, processors, numUniquesInName;
         vector<int> numEachGroup; //vector containing the number of sequences in each group the users wants for random distrib.
         vector< vector<float> > userTreeScores; //scores for users trees for each comb.
diff --git a/pcrseqscommand.h b/pcrseqscommand.h

index baeca4eedc66ac9aa38acaa4b7d65bfebca56410..d35850ce74296fa4f4471927817becd21145dbee 100644 (file)
--- a/pcrseqscommand.h
+++ b/pcrseqscommand.h
@@ -15,6 +15,7 @@
  #include "trimoligos.h"
  #include "alignment.hpp"
  #include "needlemanoverlap.hpp"
+#include "counttable.h"
  
  class PcrSeqsCommand : public Command {
  public:
@@ -45,7 +46,7 @@ private:
      vector<linePair> lines;
         bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
      bool abort, keepprimer, keepdots;
-       string fastafile, oligosfile, taxfile, groupfile, namefile, ecolifile, outputDir, nomatch;
+       string fastafile, oligosfile, taxfile, groupfile, namefile, countfile, ecolifile, outputDir, nomatch;
         int start, end, processors, length;
         
      vector<string> revPrimer, outputNames;
@@ -55,6 +56,7 @@ private:
      int readName(set<string>&);
      int readGroup(set<string>);
      int readTax(set<string>);
+    int readCount(set<string>);
      bool readOligos();
      bool readEcoli();
         int driverPcr(string, string, string, set<string>&, linePair);  
diff --git a/phylodiversitycommand.cpp b/phylodiversitycommand.cpp

index ddd2b316d507b8477d84d529ae9fc4d88c69e518..b0c11f68ff26dad59634375c80c44ecf0dd0382b 100644 (file)
--- a/phylodiversitycommand.cpp
+++ b/phylodiversitycommand.cpp
@@ -15,8 +15,9 @@ vector<string> PhyloDiversityCommand::setParameters(){
         try {
  
                 CommandParameter ptree("tree", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptree);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pgroup);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
                 CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
                 CommandParameter pfreq("freq", "Number", "", "100", "", "", "",false,false); parameters.push_back(pfreq);
@@ -41,7 +42,7 @@ vector<string> PhyloDiversityCommand::setParameters(){
  string PhyloDiversityCommand::getHelpString(){ 
         try {
                 string helpString = "";
-               helpString += "The phylo.diversity command parameters are tree, group, name, groups, iters, freq, processors, scale, rarefy, collect and summary.  tree and group are required, unless you have valid current files.\n";
+               helpString += "The phylo.diversity command parameters are tree, group, name, count, groups, iters, freq, processors, scale, rarefy, collect and summary.  tree and group are required, unless you have valid current files.\n";
                 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. The group names are separated by dashes. By default all groups are used.\n";
                 helpString += "The iters parameter allows you to specify the number of randomizations to preform, by default iters=1000, if you set rarefy to true.\n";
                 helpString += "The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n";
@@ -156,6 +157,14 @@ PhyloDiversityCommand::PhyloDiversityCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
                         
                         //check for required parameters
@@ -179,6 +188,19 @@ PhyloDiversityCommand::PhyloDiversityCommand(string option)  {
                         else if (namefile == "not found") { namefile = ""; }
                         else { m->setNameFile(namefile); }
                         
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not open") { countfile = ""; abort = true; }
+                       else if (countfile == "not found") { countfile = "";  } 
+                       else { m->setCountTableFile(countfile); }
+            
+            if ((namefile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
+            }
+                       
+            if ((groupfile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
+            }
+
                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(treefile);       }
                         
                         string temp;
@@ -214,10 +236,12 @@ PhyloDiversityCommand::PhyloDiversityCommand(string option)  {
                         
                         if ((!collect) && (!rarefy) && (!summary)) { m->mothurOut("No outputs selected. You must set either collect, rarefy or summary to true, summary=T by default."); m->mothurOutEndLine(); abort=true; }
                         
-                       if (namefile == "") {
-                               vector<string> files; files.push_back(treefile);
-                               parser.getNameFile(files);
-                       }
+                       if (countfile=="") {
+                if (namefile == "") {
+                    vector<string> files; files.push_back(treefile);
+                    parser.getNameFile(files);
+                } 
+            }
                 }
                 
         }
@@ -236,14 +260,16 @@ int PhyloDiversityCommand::execute(){
          int start = time(NULL);
          
                 m->setTreeFile(treefile);
-        TreeReader* reader = new TreeReader(treefile, groupfile, namefile);
+        TreeReader* reader;
+        if (countfile == "") { reader = new TreeReader(treefile, groupfile, namefile); }
+        else { reader = new TreeReader(treefile, countfile); }
          vector<Tree*> trees = reader->getTrees();
-        tmap = trees[0]->getTreeMap();
+        ct = trees[0]->getCountTable();
          delete reader;
  
                 SharedUtil util;
                 vector<string> mGroups = m->getGroups();
-               vector<string> tGroups = tmap->getNamesOfGroups();
+               vector<string> tGroups = ct->getNamesOfGroups();
                 util.setGroups(mGroups, tGroups, "phylo.diversity");    //sets the groups the user wants to analyze
                 
                 //incase the user had some mismatches between the tree and group files we don't want group xxx to be analyzed
@@ -255,7 +281,7 @@ int PhyloDiversityCommand::execute(){
                 //for each of the users trees
                 for(int i = 0; i < trees.size(); i++) {
                 
-                       if (m->control_pressed) { delete tmap; for (int j = 0; j < trees.size(); j++) { delete trees[j]; } for (int j = 0; j < outputNames.size(); j++) {       m->mothurRemove(outputNames[j]);        } return 0; }
+                       if (m->control_pressed) { delete ct; for (int j = 0; j < trees.size(); j++) { delete trees[j]; } for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]);        } return 0; }
                         
                         ofstream outSum, outRare, outCollect;
                         string outSumFile = outputDir + m->getRootName(m->getSimpleName(treefile))  + toString(i+1) + "." + getOutputFileNameTag("summary");
@@ -286,15 +312,16 @@ int PhyloDiversityCommand::execute(){
                         
                         //find largest group total 
                         int largestGroup = 0;
-                       for (int j = 0; j < mGroups.size(); j++) {  
-                               if (tmap->seqsPerGroup[mGroups[j]] > largestGroup) { largestGroup = tmap->seqsPerGroup[mGroups[j]]; }
+                       for (int j = 0; j < mGroups.size(); j++) { 
+                int numSeqsThisGroup = ct->getGroupCount(mGroups[j]);
+                               if (numSeqsThisGroup > largestGroup) { largestGroup = numSeqsThisGroup; }
                                 
                                 //initialize diversity
-                               diversity[mGroups[j]].resize(tmap->seqsPerGroup[mGroups[j]]+1, 0.0);            //numSampled
+                               diversity[mGroups[j]].resize(numSeqsThisGroup+1, 0.0);          //numSampled
                                                                                                                                                                                                                         //groupA                0.0                     0.0
                                                                                                                                                                                                                         
                                 //initialize sumDiversity
-                               sumDiversity[mGroups[j]].resize(tmap->seqsPerGroup[mGroups[j]]+1, 0.0);
+                               sumDiversity[mGroups[j]].resize(numSeqsThisGroup+1, 0.0);
                         }       
  
                         //convert freq percentage to number
@@ -649,7 +676,7 @@ map<string, int> PhyloDiversityCommand::getRootForGroups(Tree* t){
                 map<string, bool> done;
         
                 //initialize root for all groups to -1
-               for (int k = 0; k < (t->getTreeMap())->getNamesOfGroups().size(); k++) { done[(t->getTreeMap())->getNamesOfGroups()[k]] = false; }
+               for (int k = 0; k < (t->getCountTable())->getNamesOfGroups().size(); k++) { done[(t->getCountTable())->getNamesOfGroups()[k]] = false; }
          
          for (int i = 0; i < t->getNumLeaves(); i++) {
              
diff --git a/phylodiversitycommand.h b/phylodiversitycommand.h

index 95276921149b5237a923928bc9f23a5e8e7aed1b..ee76f05697cfce07213857432c9d786b150fc033 100644 (file)
--- a/phylodiversitycommand.h
+++ b/phylodiversitycommand.h
@@ -11,7 +11,7 @@
   */
  
  #include "command.hpp"
-#include "treemap.h"
+#include "counttable.h"
  #include "sharedutilities.h"
  #include "tree.h"
  
@@ -33,11 +33,11 @@ class PhyloDiversityCommand : public Command {
                 int execute();
                 void help() { m->mothurOut(getHelpString()); }
  private:
-               TreeMap* tmap;
+               CountTable* ct;
                 float freq;
                 int iters, processors, numUniquesInName;  
                 bool abort, rarefy, summary, collect, scale;
-               string groups, outputDir, treefile, groupfile, namefile;
+               string groups, outputDir, treefile, groupfile, namefile, countfile;
                 vector<string> Groups, outputNames; //holds groups to be used, and outputFile names
                 
          map<string, int> getRootForGroups(Tree* t);
diff --git a/phylosummary.cpp b/phylosummary.cpp

index 5f7bbc3c73a2161417a1841f567719a4bd1f8c4a..ab6bb831dfae73bb56c75eb8b144963701dc02be 100644 (file)
--- a/phylosummary.cpp
+++ b/phylosummary.cpp
@@ -8,21 +8,68 @@
   */
  
  #include "phylosummary.h"
-
  /**************************************************************************************************/
  
-PhyloSummary::PhyloSummary(string refTfile, string groupFile){
+PhyloSummary::PhyloSummary(string refTfile, CountTable* c){
         try {
                 m = MothurOut::getInstance();
                 maxLevel = 0;
                 ignore = false;
+        numSeqs = 0;
+               
+               ct = c;
+        groupmap = NULL;
+        
+               //check for necessary files
+               string taxFileNameTest = m->getFullPathName((refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum"));
+               ifstream FileTest(taxFileNameTest.c_str());
                 
-               if (groupFile != "") {
-                       groupmap = new GroupMap(groupFile);
-                       groupmap->readMap();
+               if (!FileTest) { 
+                       m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1);
                 }else{
-                       groupmap = NULL;
+                       readTreeStruct(FileTest);
                 }
+               
+               tree[0].rank = "0";
+               assignRank(0);
+        
+       }
+       catch(exception& e) {
+               m->errorOut(e, "PhyloSummary", "PhyloSummary");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+PhyloSummary::PhyloSummary(CountTable* c){ //count-table variant that reads no reference taxonomy file; the tree starts as a lone "Root" node
+       try {
+               m = MothurOut::getInstance();
+               maxLevel = 0;
+               ignore = true; //NOTE(review): the refTfile constructors set this to false; the flag's meaning is not visible in this hunk - confirm
+        numSeqs = 0;
+               //keep the caller's count table pointer and leave groupmap NULL
+               ct = c;
+        groupmap = NULL;
+               //seed the tree with a single node labelled "Root" whose rank is "0"
+               tree.push_back(rawTaxNode("Root"));
+               tree[0].rank = "0";
+       }
+       catch(exception& e) {
+               m->errorOut(e, "PhyloSummary", "PhyloSummary"); //report through MothurOut, then terminate the process
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+PhyloSummary::PhyloSummary(string refTfile, GroupMap* g){
+       try {
+               m = MothurOut::getInstance();
+               maxLevel = 0;
+               ignore = false;
+        numSeqs = 0;
+               
+               groupmap = g;
+        ct = NULL;
                                 
                 //check for necessary files
                 string taxFileNameTest = m->getFullPathName((refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum"));
@@ -46,23 +93,18 @@ PhyloSummary::PhyloSummary(string refTfile, string groupFile){
  
  /**************************************************************************************************/
  
-PhyloSummary::PhyloSummary(string groupFile){
+PhyloSummary::PhyloSummary(GroupMap* g){
         try {
                 m = MothurOut::getInstance();
                 maxLevel = 0;
                 ignore = true;
+        numSeqs = 0;
                 
-               if (groupFile != "") {
-                       groupmap = new GroupMap(groupFile);
-                       groupmap->readMap();
-               }else{
-                       groupmap = NULL;
-               }
+               groupmap = g;
+        ct = NULL;
                 
                 tree.push_back(rawTaxNode("Root"));
                 tree[0].rank = "0";
-               
-               
         }
         catch(exception& e) {
                 m->errorOut(e, "PhyloSummary", "PhyloSummary");
@@ -78,7 +120,6 @@ int PhyloSummary::summarize(string userTfile){
          
          for (map<string, string>::iterator itTemp = temp.begin(); itTemp != temp.end();) {
              addSeqToTree(itTemp->first, itTemp->second);
-                       numSeqs++;
              temp.erase(itTemp++);
          }
          
@@ -137,7 +178,9 @@ int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){
                         childPointer = tree[currentNode].children.find(taxon);
                         
                         if(childPointer != tree[currentNode].children.end()){   //if the node already exists, update count and move on
-                               if (groupmap != NULL) {
+                               int thisCount = 1;
+                
+                if (groupmap != NULL) {
                                         //find out the sequences group
                                         string group = groupmap->getGroup(seqName);
                                         
@@ -150,9 +193,27 @@ int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){
                                         if (itGroup != tree[childPointer->second].groupCount.end()) {
                                                 tree[childPointer->second].groupCount[group]++;
                                         }
-                               }
+                               }else if (ct != NULL) {
+                    if (ct->hasGroupInfo()) {
+                        vector<int> groupCounts = ct->getGroupCounts(seqName);
+                        vector<string> groups = ct->getNamesOfGroups();
+                        for (int i = 0; i < groups.size(); i++) {
+                            
+                            if (groupCounts[i] != 0) {
+                                //do you have a count for this group?
+                                map<string, int>::iterator itGroup = tree[childPointer->second].groupCount.find(groups[i]);
+                                
+                                //if yes, increment it - there should not be a case where we can't find it since we load group in read
+                                if (itGroup != tree[childPointer->second].groupCount.end()) {
+                                    tree[childPointer->second].groupCount[groups[i]] += groupCounts[i];
+                                }
+                            }
+                        }
+                    }
+                    thisCount = ct->getNumSeqs(seqName);
+                }
                                 
-                               tree[childPointer->second].total++;
+                               tree[childPointer->second].total += thisCount;
  
                                 currentNode = childPointer->second;
                         }else{  
@@ -163,8 +224,8 @@ int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){
                                 
                                         tree[index].parent = currentNode;
                                         tree[index].level = (level+1);
-                                       tree[index].total = 1;
                                         tree[currentNode].children[taxon] = index;
+                    int thisCount = 1;
                                         
                                         //initialize groupcounts
                                         if (groupmap != NULL) {
@@ -184,9 +245,33 @@ int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){
                                                 //if yes, increment it - there should not be a case where we can't find it since we load group in read
                                                 if (itGroup != tree[index].groupCount.end()) {
                                                         tree[index].groupCount[group]++;
-                                               }                                               
-                                       }
+                                               }
+                                       }else if (ct != NULL) {
+                        if (ct->hasGroupInfo()) {
+                            vector<string> mGroups = ct->getNamesOfGroups();
+                            for (int j = 0; j < mGroups.size(); j++) {
+                                tree[index].groupCount[mGroups[j]] = 0;
+                            }
+                            vector<int> groupCounts = ct->getGroupCounts(seqName);
+                            vector<string> groups = ct->getNamesOfGroups();
+                        
+                            for (int i = 0; i < groups.size(); i++) {
+                                if (groupCounts[i] != 0) {
+                                   
+                                    //do you have a count for this group?
+                                    map<string, int>::iterator itGroup = tree[index].groupCount.find(groups[i]);
+                                     
+                                    //if yes, increment it - there should not be a case where we can't find it since we load group in read
+                                    if (itGroup != tree[index].groupCount.end()) {
+                                        tree[index].groupCount[groups[i]]+=groupCounts[i];
+                                    }
+                                }
+                            }
+                        }
+                        thisCount = ct->getNumSeqs(seqName);
+                    }
                                         
+                    tree[index].total = thisCount;
                                         currentNode = index;
                                         
                                 }else{ //otherwise, error
@@ -210,7 +295,7 @@ int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){
  }
  /**************************************************************************************************/
  
-int PhyloSummary::addSeqToTree(string seqTaxonomy, vector<string> names){
+int PhyloSummary::addSeqToTree(string seqTaxonomy, map<string, bool> containsGroup){
         try {
                 numSeqs++;
                 
@@ -235,32 +320,12 @@ int PhyloSummary::addSeqToTree(string seqTaxonomy, vector<string> names){
                         childPointer = tree[currentNode].children.find(taxon);
                         
                         if(childPointer != tree[currentNode].children.end()){   //if the node already exists, update count and move on
-                               if (groupmap != NULL) {
-                                       
-                                       map<string, bool> containsGroup; 
-                                       vector<string> mGroups = groupmap->getNamesOfGroups();
-                                       for (int j = 0; j < mGroups.size(); j++) {
-                                               containsGroup[mGroups[j]] = false;
-                                       }
-                                       
-                                       for (int k = 0; k < names.size(); k++) {
-                                               //find out the sequences group
-                                               string group = groupmap->getGroup(names[k]);
-                                       
-                                               if (group == "not found") {  m->mothurOut("[WARNING]: " + names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
-                                               else {
-                                                       containsGroup[group] = true;
-                                               }
-                                       }
+                for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
+                    if (itGroup->second == true) {
+                        tree[childPointer->second].groupCount[itGroup->first]++;
+                    }
+                }
                                         
-                                       for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
-                                               if (itGroup->second == true) {
-                                                       tree[childPointer->second].groupCount[itGroup->first]++;
-                                               }
-                                       }
-                                       
-                               }
-                               
                                 tree[childPointer->second].total++;
                                 
                                 currentNode = childPointer->second;
@@ -274,33 +339,12 @@ int PhyloSummary::addSeqToTree(string seqTaxonomy, vector<string> names){
                                         tree[index].level = (level+1);
                                         tree[index].total = 1;
                                         tree[currentNode].children[taxon] = index;
-                                       
-                                       //initialize groupcounts
-                                       if (groupmap != NULL) {
-                                               map<string, bool> containsGroup; 
-                                               vector<string> mGroups = groupmap->getNamesOfGroups();
-                                               for (int j = 0; j < mGroups.size(); j++) {
-                                                       tree[index].groupCount[mGroups[j]] = 0;
-                                                       containsGroup[mGroups[j]] = false;
-                                               }
-                                               
                                                 
-                                               for (int k = 0; k < names.size(); k++) {
-                                                       //find out the sequences group
-                                                       string group = groupmap->getGroup(names[k]);
-                                                       
-                                                       if (group == "not found") {  m->mothurOut("[WARNING]: " + names[k] + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
-                                                       else {
-                                                               containsGroup[group] = true;
-                                                       }
-                                               }
-                                               
-                                               for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
-                                                       if (itGroup->second == true) {
-                                                               tree[index].groupCount[itGroup->first]++;
-                                                       }
-                                               }
-                                       }
+                    for (map<string, bool>::iterator itGroup = containsGroup.begin(); itGroup != containsGroup.end(); itGroup++) {
+                        if (itGroup->second == true) {
+                            tree[index].groupCount[itGroup->first]++;
+                        }
+                    }
                                         
                                         currentNode = index;
                                         
@@ -349,17 +393,24 @@ void PhyloSummary::print(ofstream& out){
         try {
                 
                 if (ignore) { assignRank(0); }
-       
+        vector<string> mGroups;
                 //print labels
                 out << "taxlevel\t rankID\t taxon\t daughterlevels\t total\t";
                 if (groupmap != NULL) {
                         //so the labels match the counts below, since the map sorts them automatically...
                         //sort(groupmap->namesOfGroups.begin(), groupmap->namesOfGroups.end());
-                       vector<string> mGroups = groupmap->getNamesOfGroups();
+            mGroups = groupmap->getNamesOfGroups();
                         for (int i = 0; i < mGroups.size(); i++) {
                                 out << mGroups[i] << '\t';
                         }
-               }
+               }else if (ct != NULL) {
+            if (ct->hasGroupInfo()) {
+                mGroups = ct->getNamesOfGroups();
+                for (int i = 0; i < mGroups.size(); i++) {
+                    out << mGroups[i] << '\t';
+                }
+            }
+        }
                 
                 out << endl;
                 
@@ -373,9 +424,10 @@ void PhyloSummary::print(ofstream& out){
                                 tree[0].total += tree[it->second].total;
                                 
                                 if (groupmap != NULL) {
-                                       vector<string> mGroups = groupmap->getNamesOfGroups();
                                         for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; } 
-                               }
+                               }else if ( ct != NULL) {
+                    if (ct->hasGroupInfo()) { for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; } }
+                }
                         }
                 }
                 
@@ -384,12 +436,10 @@ void PhyloSummary::print(ofstream& out){
                 
                 
                 if (groupmap != NULL) {
-                       //for (itGroup = tree[0].groupCount.begin(); itGroup != tree[0].groupCount.end(); itGroup++) {
-                       //      out << itGroup->second << '\t';
-                       //}
-                       vector<string> mGroups = groupmap->getNamesOfGroups();
-                       for (int i = 0; i < mGroups.size(); i++) {  out << tree[0].groupCount[mGroups[i]] << '\t'; } 
-               }
+                       for (int i = 0; i < mGroups.size(); i++) {  out << tree[0].groupCount[mGroups[i]] << '\t'; }  
+        }else if ( ct != NULL) {
+            if (ct->hasGroupInfo()) { for (int i = 0; i < mGroups.size(); i++) {  out << tree[0].groupCount[mGroups[i]] << '\t'; } }
+        }
                 out << endl;
                 
                 //print rest
@@ -427,7 +477,12 @@ void PhyloSummary::print(int i, ofstream& out){
                                         //}
                                         vector<string> mGroups = groupmap->getNamesOfGroups();
                                         for (int i = 0; i < mGroups.size(); i++) {  out << tree[it->second].groupCount[mGroups[i]] << '\t'; } 
-                               }
+                               }else if (ct != NULL) {
+                    if (ct->hasGroupInfo()) {
+                        vector<string> mGroups = ct->getNamesOfGroups();
+                        for (int i = 0; i < mGroups.size(); i++) {  out << tree[it->second].groupCount[mGroups[i]] << '\t'; } 
+                    }
+                }
                                 out << endl;
                                 
                         }
@@ -473,7 +528,13 @@ void PhyloSummary::readTreeStruct(ifstream& in){
                                 for (int j = 0; j < (groupmap->getNamesOfGroups()).size(); j++) {
                                         tree[i].groupCount[(groupmap->getNamesOfGroups())[j]] = 0;
                                 }
-                       }
+                       }else if (ct != NULL) {
+                if (ct->hasGroupInfo()) {
+                    for (int j = 0; j < (ct->getNamesOfGroups()).size(); j++) {
+                        tree[i].groupCount[(ct->getNamesOfGroups())[j]] = 0;
+                    }
+                }
+            }
                         
                         tree[i].total = 0;
                         
diff --git a/phylosummary.h b/phylosummary.h

index cdec0d0a4ac7ca83cb0845988adb90152bcae07b..65a467483ddab1ae58757629644162dc83bab254 100644 (file)
--- a/phylosummary.h
+++ b/phylosummary.h
@@ -13,6 +13,7 @@
  #include "mothur.h"
  #include "mothurout.h"
  #include "groupmap.h"
+#include "counttable.h"
  
  /**************************************************************************************************/
  
@@ -32,13 +33,15 @@ struct rawTaxNode {
  class PhyloSummary {
  
  public:
-       PhyloSummary(string);
-       PhyloSummary(string, string);
-       ~PhyloSummary() { if (groupmap != NULL)  {  delete groupmap;  }  }
+       PhyloSummary(GroupMap*);
+       PhyloSummary(string, GroupMap*);
+    PhyloSummary(CountTable*);
+       PhyloSummary(string, CountTable*);
+       ~PhyloSummary() {}
         
         int summarize(string);  //pass it a taxonomy file and a group file and it makes the tree
         int addSeqToTree(string, string);
-       int addSeqToTree(string, vector<string>);
+       int addSeqToTree(string, map<string, bool>);
         void print(ofstream&);
         int getMaxLevel() { return maxLevel; }
         
@@ -49,6 +52,7 @@ private:
         void assignRank(int);
         void readTreeStruct(ifstream&);
         GroupMap* groupmap;
+    CountTable* ct;
         bool ignore;
         
         int numNodes;
diff --git a/phylotree.cpp b/phylotree.cpp

index 3dde18680c625eb816230a8d13774ccfc47032cf..8a7c712b0f255db20da492fd95bc15d3116ce151 100644 (file)
--- a/phylotree.cpp
+++ b/phylotree.cpp
@@ -75,7 +75,7 @@ PhyloTree::PhyloTree(ifstream& in, string filename){
                         for (int i = 0; i < numGenus; i++) {
                                 iss >> gnode >> gsize; m->gobble(iss);
                                 
-                               uniqueTaxonomies[gnode] = gnode;
+                               uniqueTaxonomies.insert(gnode);
                                 totals.push_back(gsize);
                         }
                         
@@ -102,7 +102,7 @@ PhyloTree::PhyloTree(ifstream& in, string filename){
                         for (int i = 0; i < numGenus; i++) {
                                 in >> gnode >> gsize; m->gobble(in);
                                 
-                               uniqueTaxonomies[gnode] = gnode;
+                               uniqueTaxonomies.insert(gnode);
                                 totals.push_back(gsize);
                         }
                         
@@ -260,7 +260,7 @@ int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){
                         //use print to reassign the taxa id
                         taxon = getNextTaxon(seqTaxonomy, seqName);
                         
-                       if (taxon == "") {  m->mothurOut(seqName + " has an error in the taxonomy.  This may be due to a ;;"); m->mothurOutEndLine(); if (currentNode != 0) {  uniqueTaxonomies[currentNode] = currentNode; } break;  }
+                       if (taxon == "") {  m->mothurOut(seqName + " has an error in the taxonomy.  This may be due to a ;;"); m->mothurOutEndLine(); if (currentNode != 0) {  uniqueTaxonomies.insert(currentNode); } break;  }
                         
                         childPointer = tree[currentNode].children.find(taxon);
                         
@@ -280,7 +280,7 @@ int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){
                                 name2Taxonomy[seqName] = currentNode;
                         }
         
-                       if (seqTaxonomy == "") {   uniqueTaxonomies[currentNode] = currentNode; }
+                       if (seqTaxonomy == "") {   uniqueTaxonomies.insert(currentNode);        }
                 }
                 
                 return 0;
@@ -295,9 +295,16 @@ vector<int> PhyloTree::getGenusNodes()     {
         try {
                 genusIndex.clear();
                 //generate genusIndexes
-               map<int, int>::iterator it2;
-               for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  genusIndex.push_back(it2->first);     }
-               
+               set<int>::iterator it2;
+        map<int, int> temp;
+               for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  genusIndex.push_back(*it2);   temp[*it2] = genusIndex.size()-1; }
+               
+        for (map<string, int>::iterator itName = name2Taxonomy.begin(); itName != name2Taxonomy.end(); itName++) {
+            map<int, int>::iterator itTemp = temp.find(itName->second);
+            if (itTemp != temp.end()) { name2GenusNodeIndex[itName->first] = itTemp->second; }
+            else {  m->mothurOut("[ERROR]: trouble making name2GenusNodeIndex, aborting.\n"); m->control_pressed = true; }
+        }
+        
                 return genusIndex;
         }
         catch(exception& e) {
@@ -541,8 +548,8 @@ void PhyloTree::printTreeNodes(string treefilename) {
                         
                         //print genus nodes
                         outTree << endl << uniqueTaxonomies.size() << endl;
-                       map<int, int>::iterator it2;
-                       for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl;  }
+                       set<int>::iterator it2;
+                       for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  outTree << *it2 << '\t' << tree[*it2].accessions.size() << endl;      }
                         outTree << endl;
                         
                         outTree.close();
@@ -594,12 +601,12 @@ string PhyloTree::getName(int i ){
         }
  }
  /**************************************************************************************************/
-int PhyloTree::getIndex(string seqName){
+int PhyloTree::getGenusIndex(string seqName){
         try {
-               map<string, int>::iterator itFind = name2Taxonomy.find(seqName);
+               map<string, int>::iterator itFind = name2GenusNodeIndex.find(seqName);
         
-               if (itFind != name2Taxonomy.end()) {  return name2Taxonomy[seqName];  }
-               else { m->mothurOut("Cannot find " + seqName + ". Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);}
+               if (itFind != name2GenusNodeIndex.end()) {  return itFind->second;  }
+               else { m->mothurOut("Cannot find " + seqName + ". Could be a mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);}
         }
         catch(exception& e) {
                 m->errorOut(e, "PhyloTree", "get");
diff --git a/phylotree.h b/phylotree.h

index 7aae8f1adedcf5f1db6b94d1bbe76a968a8e3100..e0002209e4d83aaaf9468573173858d9b0602b61 100644 (file)
--- a/phylotree.h
+++ b/phylotree.h
@@ -44,7 +44,7 @@ public:
         TaxNode get(int i);                             
         TaxNode get(string seqName);
         string getName(int i);                  
-       int getIndex(string seqName);   
+       int getGenusIndex(string seqName);      
         string getFullTaxonomy(string);  //pass a sequence name return taxonomy
         
         int getMaxLevel()               {       return maxLevel;        }
@@ -63,7 +63,8 @@ private:
         vector<int> genusIndex; //holds the indexes in tree where the genus level taxonomies are stored
         vector<int> totals; //holds the numSeqs at each genus level taxonomy
         map<string, int> name2Taxonomy;  //maps name to index in tree
-       map<int, int> uniqueTaxonomies;  //map of unique taxonomies
+    map<string, int> name2GenusNodeIndex;
+       set<int> uniqueTaxonomies;  //set of unique taxonomies
         map<int, int> leafNodes; //used to create static reference taxonomy file
         //void print(int, ofstream&);
         int numNodes;
diff --git a/prcseqscommand.cpp b/prcseqscommand.cpp

index 6b73d440778dce57e39cfdb1753277d9f512a94b..de2cb2058a064b043f424921f599edd6f7f8bf24 100644 (file)
--- a/prcseqscommand.cpp
+++ b/prcseqscommand.cpp
@@ -13,8 +13,9 @@ vector<string> PcrSeqsCommand::setParameters(){
         try {
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
                 CommandParameter poligos("oligos", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(poligos);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
-        CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
+        CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
          CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(ptax);
          CommandParameter pecoli("ecoli", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(pecoli);
                 CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart);
@@ -40,7 +41,7 @@ string PcrSeqsCommand::getHelpString(){
         try {
                 string helpString = "";
                 helpString += "The pcr.seqs command reads a fasta file.\n";
-        helpString += "The pcr.seqs command parameters are fasta, oligos, name, group, taxonomy, ecoli, start, end, nomatch, processors, keepprimer and keepdots.\n";
+        helpString += "The pcr.seqs command parameters are fasta, oligos, name, group, count, taxonomy, ecoli, start, end, nomatch, processors, keepprimer and keepdots.\n";
                 helpString += "The ecoli parameter is used to provide a fasta file containing a single reference sequence (e.g. for e. coli) this must be aligned. Mothur will trim to the start and end positions of the reference sequence.\n";
          helpString += "The start parameter allows you to provide a starting position to trim to.\n";
          helpString += "The end parameter allows you to provide a ending position to trim from.\n";
@@ -72,6 +73,7 @@ string PcrSeqsCommand::getOutputFileNameTag(string type, string inputName=""){
              else if (type == "taxonomy") {  outputFileName =  "pcr" + m->getExtension(inputName); }
              else if (type == "group") {  outputFileName =  "pcr" + m->getExtension(inputName); }
              else if (type == "name") {  outputFileName =  "pcr" + m->getExtension(inputName); }
+            else if (type == "count") {  outputFileName =  "pcr" + m->getExtension(inputName); }
              else if (type == "accnos") {  outputFileName =  "bad.accnos"; }
              else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
          }
@@ -93,6 +95,7 @@ PcrSeqsCommand::PcrSeqsCommand(){
                 outputTypes["taxonomy"] = tempOutNames;
                 outputTypes["group"] = tempOutNames;
                 outputTypes["name"] = tempOutNames;
+        outputTypes["count"] = tempOutNames;
          outputTypes["accnos"] = tempOutNames;
         }
         catch(exception& e) {
@@ -132,6 +135,7 @@ PcrSeqsCommand::PcrSeqsCommand(string option)  {
                         outputTypes["group"] = tempOutNames;
                         outputTypes["name"] = tempOutNames;
              outputTypes["accnos"] = tempOutNames;
+            outputTypes["count"] = tempOutNames;
                         
                         //if the user changes the input directory command factory will send this info to us in the output parameter 
                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
@@ -185,6 +189,14 @@ PcrSeqsCommand::PcrSeqsCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a count file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                                 
                         }
              
@@ -229,6 +241,19 @@ PcrSeqsCommand::PcrSeqsCommand(string option)  {
                         else if(groupfile == "not open"){       groupfile = ""; abort = true;   } 
              else { m->setGroupFile(groupfile); }
              
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not open") { countfile = ""; abort = true; }
+                       else if (countfile == "not found") { countfile = "";  } 
+                       else { m->setCountTableFile(countfile); }
+            
+            if ((namefile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
+            }
+                       
+            if ((groupfile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
+            }
+            
              taxfile = validParameter.validFile(parameters, "taxonomy", true);
                         if (taxfile == "not found"){    taxfile = "";           }
                         else if(taxfile == "not open"){ taxfile = ""; abort = true;     } 
@@ -265,10 +290,12 @@ PcrSeqsCommand::PcrSeqsCommand(string option)  {
              }
                         
                         //check to make sure you didn't forget the name file by mistake                 
-                       if (namefile == "") {
-                               vector<string> files; files.push_back(fastafile);
-                               parser.getNameFile(files);
-                       }
+                       if (countfile == "") { 
+                if (namefile == "") {
+                    vector<string> files; files.push_back(fastafile);
+                    parser.getNameFile(files);
+                }
+            }
                 }
          
         }
@@ -339,7 +366,9 @@ int PcrSeqsCommand::execute(){
          if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {       m->mothurRemove(outputNames[i]); } return 0; }
                 if (taxfile != "")                      {               readTax(badNames);              }
                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); } return 0; }
-        
+               if (countfile != "")                    {               readCount(badNames);            }
+               if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); } return 0; }
+      
          m->mothurOutEndLine();
                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
                 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
@@ -373,6 +402,11 @@ int PcrSeqsCommand::execute(){
                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
                 }
          
+        itTypes = outputTypes.find("count");
+               if (itTypes != outputTypes.end()) {
+                       if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
+               }
+        
                 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
                 m->mothurOutEndLine();
  
@@ -1087,6 +1121,63 @@ int PcrSeqsCommand::readTax(set<string> names){
                 exit(1);
         }
  }
+//***************************************************************************************************************
+// Writes a filtered copy of countfile that omits every sequence named in badSeqNames.
+// The new file is recorded in outputNames and outputTypes["count"]; the summed totals of
+// the omitted rows are reported via mothurOut. Every non-exception path returns 0.
+int PcrSeqsCommand::readCount(set<string> badSeqNames){
+       try {
+               ifstream in;
+               m->openInputFile(countfile, in);
+               string goodCountFile = outputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile);
+               outputNames.push_back(goodCountFile);  outputTypes["count"].push_back(goodCountFile);
+               ofstream goodCountOut;  m->openOutputFile(goodCountFile, goodCountOut);
+
+               //the first line is copied through unchanged (assumed to be the column headers)
+               string headers = m->getline(in); m->gobble(in);
+               goodCountOut << headers << endl;
+
+               string name, rest; int thisTotal, removedCount; removedCount = 0;
+               bool wroteSomething = false;
+               while (!in.eof()) {
+                       if (m->control_pressed) { goodCountOut.close(); in.close(); m->mothurRemove(goodCountFile); return 0; }
+                       //reset each row so a failed read cannot leave a stale or indeterminate total
+                       thisTotal = 0;
+                       in >> name; m->gobble(in);
+                       in >> thisTotal; m->gobble(in);
+                       rest = m->getline(in); m->gobble(in);
+
+                       if (badSeqNames.count(name) != 0) { removedCount+=thisTotal; }
+                       else{
+                               wroteSomething = true;
+                               goodCountOut << name << '\t' << thisTotal << '\t' << rest << endl;
+                       }
+               }
+               in.close();
+               goodCountOut.close();
+
+               //the output file was just removed, so stop instead of handing it to CountTable
+               if (m->control_pressed) { m->mothurRemove(goodCountFile); return 0; }
+
+               if (wroteSomething == false) {  m->mothurOut("Your count file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
+
+               //check for groups that have been eliminated
+               CountTable ct;
+               if (ct.testGroups(goodCountFile)) {
+                       ct.readTable(goodCountFile);
+                       ct.printTable(goodCountFile);
+               }
+
+               if (m->control_pressed) { m->mothurRemove(goodCountFile);   }
+
+               m->mothurOut("Removed " + toString(removedCount) + " sequences from your count file."); m->mothurOutEndLine();
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "PcrSeqsCommand", "readCount");
+               exit(1);
+       }
+}
  /**************************************************************************************/
  
  
diff --git a/preclustercommand.cpp b/preclustercommand.cpp

index 951b2008b062c26c5fabb54dfd8ef2d2a858da67..dadc9186f243952445db24766cf3a08c9dc4d5d0 100644 (file)
--- a/preclustercommand.cpp
+++ b/preclustercommand.cpp
@@ -14,8 +14,9 @@
  vector<string> PreClusterCommand::setParameters(){     
         try {
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
+               CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
                 CommandParameter pdiffs("diffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pdiffs);
                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
@@ -36,9 +37,10 @@ string PreClusterCommand::getHelpString(){
                 string helpString = "";
                 helpString += "The pre.cluster command groups sequences that are within a given number of base mismatches.\n";
                 helpString += "The pre.cluster command outputs a new fasta and name file.\n";
-               helpString += "The pre.cluster command parameters are fasta, names and diffs. The fasta parameter is required. \n";
-               helpString += "The names parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n";
+               helpString += "The pre.cluster command parameters are fasta, name, group, count, processors and diffs. The fasta parameter is required. \n";
+               helpString += "The name parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n";
                 helpString += "The group parameter allows you to provide a group file so you can cluster by group. \n";
+        helpString += "The count parameter allows you to provide a count file so you can cluster by group. \n";
                 helpString += "The diffs parameter allows you to specify maximum number of mismatched bases allowed between sequences in a grouping. The default is 1.\n";
                 helpString += "The pre.cluster command should be in the following format: \n";
                 helpString += "pre.cluster(fasta=yourFastaFile, names=yourNamesFile, diffs=yourMaxDiffs) \n";
@@ -63,6 +65,7 @@ string PreClusterCommand::getOutputFileNameTag(string type, string inputName="")
          else {
              if (type == "fasta") {  outputFileName =  "precluster" + m->getExtension(inputName); }
              else if (type == "name") {  outputFileName =  "precluster.names"; }
+            else if (type == "count") {  outputFileName =  "precluster.count_table"; }
              else if (type == "map") {  outputFileName =  "precluster.map"; }
              else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
          }
@@ -81,6 +84,7 @@ PreClusterCommand::PreClusterCommand(){
                 vector<string> tempOutNames;
                 outputTypes["fasta"] = tempOutNames;
                 outputTypes["name"] = tempOutNames;
+        outputTypes["count"] = tempOutNames;
                 outputTypes["map"] = tempOutNames;
         }
         catch(exception& e) {
@@ -117,6 +121,7 @@ PreClusterCommand::PreClusterCommand(string option) {
                         outputTypes["fasta"] = tempOutNames;
                         outputTypes["name"] = tempOutNames;
                         outputTypes["map"] = tempOutNames;
+            outputTypes["count"] = tempOutNames;
                 
                         //if the user changes the input directory command factory will send this info to us in the output parameter 
                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
@@ -146,6 +151,14 @@ PreClusterCommand::PreClusterCommand(string option) {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a count file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
  
                         //check for required parameters
@@ -175,6 +188,25 @@ PreClusterCommand::PreClusterCommand(string option) {
                         if (groupfile == "not found") { groupfile =  "";  bygroup = false; }
                         else if (groupfile == "not open") { abort = true; groupfile =  ""; }    
                         else {   m->setGroupFile(groupfile); bygroup = true;  }
+            
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not found") { countfile =  "";   }
+                       else if (countfile == "not open") { abort = true; countfile =  ""; }    
+                       else {   
+                m->setCountTableFile(countfile); 
+                ct.readTable(countfile);
+                if (ct.hasGroupInfo()) { bygroup = true; }
+                else { bygroup = false;  }
+            }
+            
+            if ((namefile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
+            }
+            
+            if ((groupfile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
+            }
+
                         
                         string temp     = validParameter.validFile(parameters, "diffs", false);         if(temp == "not found"){        temp = "1"; }
                         m->mothurConvert(temp, diffs); 
@@ -183,10 +215,12 @@ PreClusterCommand::PreClusterCommand(string option) {
                         m->setProcessors(temp);
                         m->mothurConvert(temp, processors);
                         
-                       if (namefile == "") {
-                               vector<string> files; files.push_back(fastafile);
-                               parser.getNameFile(files);
-                       }
+            if (countfile == "") {
+                if (namefile == "") {
+                    vector<string> files; files.push_back(fastafile);
+                    parser.getNameFile(files);
+                }
+            }
                 }
                                 
         }
@@ -207,10 +241,11 @@ int PreClusterCommand::execute(){
                 string fileroot = outputDir + m->getRootName(m->getSimpleName(fastafile));
                 string newFastaFile = fileroot + getOutputFileNameTag("fasta", fastafile);
                 string newNamesFile = fileroot + getOutputFileNameTag("name");
+        string newCountFile = fileroot + getOutputFileNameTag("count");
                 string newMapFile = fileroot + getOutputFileNameTag("map"); //add group name if by group
                 outputNames.push_back(newFastaFile); outputTypes["fasta"].push_back(newFastaFile);
-               outputNames.push_back(newNamesFile); outputTypes["name"].push_back(newNamesFile);
-               
+               if (countfile == "") { outputNames.push_back(newNamesFile); outputTypes["name"].push_back(newNamesFile); }
+               else { outputNames.push_back(newCountFile); outputTypes["count"].push_back(newCountFile); }
                 
                 if (bygroup) {
                         //clear out old files
@@ -219,39 +254,45 @@ int PreClusterCommand::execute(){
                         newMapFile = fileroot + "precluster.";
                         
                         //parse fasta and name file by group
-                       SequenceParser* parser;
-                       if (namefile != "") { parser = new SequenceParser(groupfile, fastafile, namefile);      }
-                       else                            { parser = new SequenceParser(groupfile, fastafile);                    }
-                       
-                       vector<string> groups = parser->getNamesOfGroups();
-                       
-                       if(processors == 1)     {       driverGroups(parser, newFastaFile, newNamesFile, newMapFile, 0, groups.size(), groups); }
-                       else                            {       createProcessesGroups(parser, newFastaFile, newNamesFile, newMapFile, groups);                  }
-                       
-                       delete parser;
-                       
-                       if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        }        return 0; }
-                       
-                       //run unique.seqs for deconvolute results
-                       string inputString = "fasta=" + newFastaFile;
-                       if (namefile != "") { inputString += ", name=" + newNamesFile; }
-                       m->mothurOutEndLine(); 
-                       m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
-                       m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); 
-                       m->mothurCalling = true;
+            vector<string> groups;
+                       if (countfile != "") {
+                cparser = new SequenceCountParser(countfile, fastafile);
+                groups = cparser->getNamesOfGroups();
+            }else {
+                if (namefile != "") { parser = new SequenceParser(groupfile, fastafile, namefile);     }
+                else                           { parser = new SequenceParser(groupfile, fastafile);                    }
+                groups = parser->getNamesOfGroups();
+                       }
              
-                       Command* uniqueCommand = new DeconvoluteCommand(inputString);
-                       uniqueCommand->execute();
-                       
-                       map<string, vector<string> > filenames = uniqueCommand->getOutputFiles();
-                       
-                       delete uniqueCommand;
-                       m->mothurCalling = false;
-                       m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
-                       
-                       m->renameFile(filenames["fasta"][0], newFastaFile);
-            m->renameFile(filenames["name"][0], newNamesFile);
-                       
+                       if(processors == 1)     {       driverGroups(newFastaFile, newNamesFile, newMapFile, 0, groups.size(), groups); }
+                       else                            {       createProcessesGroups(newFastaFile, newNamesFile, newMapFile, groups);                  }
+                       
+                       if (countfile != "") { 
+                mergeGroupCounts(newCountFile, newNamesFile, newFastaFile);
+                delete cparser; 
+            }else {  
+                delete parser; 
+                //run unique.seqs for deconvolute results
+                string inputString = "fasta=" + newFastaFile;
+                if (namefile != "") { inputString += ", name=" + newNamesFile; }
+                m->mothurOutEndLine(); 
+                m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
+                m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); 
+                m->mothurCalling = true;
+                
+                Command* uniqueCommand = new DeconvoluteCommand(inputString);
+                uniqueCommand->execute();
+                
+                map<string, vector<string> > filenames = uniqueCommand->getOutputFiles();
+                
+                delete uniqueCommand;
+                m->mothurCalling = false;
+                m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
+                
+                m->renameFile(filenames["fasta"][0], newFastaFile);
+                m->renameFile(filenames["name"][0], newNamesFile); 
+                       }
+            if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {   m->mothurRemove(outputNames[i]);        }        return 0; }
                         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run pre.cluster."); m->mothurOutEndLine(); 
                                 
                 }else {
@@ -272,8 +313,9 @@ int PreClusterCommand::execute(){
                         
                         m->mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine();
                         m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); 
-                       printData(newFastaFile, newNamesFile);
-                       
+                       if (countfile != "") { newNamesFile = newCountFile; }
+            printData(newFastaFile, newNamesFile, "");
+                               
                         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); 
                 }
                                 
@@ -295,6 +337,11 @@ int PreClusterCommand::execute(){
                 if (itTypes != outputTypes.end()) {
                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
                 }
+        
+        itTypes = outputTypes.find("count");
+               if (itTypes != outputTypes.end()) {
+                       if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
+               }
                 
                 return 0;
                 
@@ -305,7 +352,7 @@ int PreClusterCommand::execute(){
         }
  }
  /**************************************************************************************************/
-int PreClusterCommand::createProcessesGroups(SequenceParser* parser, string newFName, string newNName, string newMFile, vector<string> groups) {
+int PreClusterCommand::createProcessesGroups(string newFName, string newNName, string newMFile, vector<string> groups) {
         try {
                 
                 vector<int> processIDS;
@@ -336,7 +383,7 @@ int PreClusterCommand::createProcessesGroups(SequenceParser* parser, string newF
                                 process++;
                         }else if (pid == 0){
                  outputNames.clear();
-                               num = driverGroups(parser, newFName + toString(getpid()) + ".temp", newNName + toString(getpid()) + ".temp", newMFile, lines[process].start, lines[process].end, groups);
+                               num = driverGroups(newFName + toString(getpid()) + ".temp", newNName + toString(getpid()) + ".temp", newMFile, lines[process].start, lines[process].end, groups);
                  
                  string tempFile = toString(getpid()) + ".outputNames.temp";
                  ofstream outTemp;
@@ -355,7 +402,7 @@ int PreClusterCommand::createProcessesGroups(SequenceParser* parser, string newF
                 }
                 
                 //do my part
-               num = driverGroups(parser, newFName, newNName, newMFile, lines[0].start, lines[0].end, groups);
+               num = driverGroups(newFName, newNName, newMFile, lines[0].start, lines[0].end, groups);
                 
                 //force parent to wait until all the processes are done
                 for (int i=0;i<processIDS.size();i++) { 
@@ -395,7 +442,7 @@ int PreClusterCommand::createProcessesGroups(SequenceParser* parser, string newF
                         // Allocate memory for thread data.
                         string extension = toString(i) + ".temp";
                         
-                       preClusterData* tempPreCluster = new preClusterData(fastafile, namefile, groupfile, (newFName+extension), (newNName+extension), newMFile, groups, m, lines[i].start, lines[i].end, diffs, i);
+                       preClusterData* tempPreCluster = new preClusterData(fastafile, namefile, groupfile, countfile, (newFName+extension), (newNName+extension), newMFile, groups, m, lines[i].start, lines[i].end, diffs, i);
                         pDataArray.push_back(tempPreCluster);
                         processIDS.push_back(i);
                         
@@ -406,7 +453,7 @@ int PreClusterCommand::createProcessesGroups(SequenceParser* parser, string newF
                 
                                 
                 //using the main process as a worker saves time and memory
-               num = driverGroups(parser, newFName, newNName, newMFile, lines[0].start, lines[0].end, groups);
+               num = driverGroups(newFName, newNName, newMFile, lines[0].start, lines[0].end, groups);
                 
                 //Wait until all threads have terminated.
                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
@@ -443,7 +490,7 @@ int PreClusterCommand::createProcessesGroups(SequenceParser* parser, string newF
         }
  }
  /**************************************************************************************************/
-int PreClusterCommand::driverGroups(SequenceParser* parser, string newFFile, string newNFile, string newMFile, int start, int end, vector<string> groups){
+int PreClusterCommand::driverGroups(string newFFile, string newNFile, string newMFile, int start, int end, vector<string> groups){
         try {
                 
                 int numSeqs = 0;
@@ -458,24 +505,29 @@ int PreClusterCommand::driverGroups(SequenceParser* parser, string newFFile, str
                         m->mothurOutEndLine(); m->mothurOut("Processing group " + groups[i] + ":"); m->mothurOutEndLine();
                         
                         map<string, string> thisNameMap;
-                       if (namefile != "") { thisNameMap = parser->getNameMap(groups[i]); }
-                       vector<Sequence> thisSeqs = parser->getSeqs(groups[i]);
-                       
+            vector<Sequence> thisSeqs;
+                       if (groupfile != "") { 
+                thisSeqs = parser->getSeqs(groups[i]);
+            }else if (countfile != "") {
+                thisSeqs = cparser->getSeqs(groups[i]);
+            }
+                       if (namefile != "") {  thisNameMap = parser->getNameMap(groups[i]); }
+            
                         //fill alignSeqs with this groups info.
-                       numSeqs = loadSeqs(thisNameMap, thisSeqs);
+                       numSeqs = loadSeqs(thisNameMap, thisSeqs, groups[i]);
                         
                         if (m->control_pressed) {   return 0; }
                         
                         if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); m->control_pressed = true; return 0;  }
                         
-                       int count = process(newMFile+groups[i]+".map");
+                       int count= process(newMFile+groups[i]+".map");
                         outputNames.push_back(newMFile+groups[i]+".map"); outputTypes["map"].push_back(newMFile+groups[i]+".map");
                         
                         if (m->control_pressed) {  return 0; }
                         
                         m->mothurOut("Total number of sequences before pre.cluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine();
                         m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); 
-                       printData(newFFile, newNFile);
+                       printData(newFFile, newNFile, groups[i]);
                         
                         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); 
                         
@@ -559,26 +611,13 @@ int PreClusterCommand::readFASTA(){
                 //ifstream inNames;
                 ifstream inFasta;
                 
-               //m->openInputFile(namefile, inNames);
                 m->openInputFile(fastafile, inFasta);
-               
-               //string firstCol, secondCol, nameString;
                 set<int> lengths;
                 
                 while (!inFasta.eof()) {
                         
                         if (m->control_pressed) { inFasta.close(); return 0; }
-                       
-                       //inNames >> firstCol >> secondCol;
-                       //nameString = secondCol;
-                       
-                       //m->gobble(inNames);
-                       //int size = 1;
-                       //while (secondCol.find_first_of(',') != -1) { 
-                       //      size++;
-                       //      secondCol = secondCol.substr(secondCol.find_first_of(',')+1, secondCol.length());
-                       //}
-                       
+                                               
                         Sequence seq(inFasta);  m->gobble(inFasta);
                         
                         if (seq.getName() != "") {  //can get "" if commented line is at end of fasta file
@@ -592,14 +631,15 @@ int PreClusterCommand::readFASTA(){
                                                 lengths.insert(seq.getAligned().length());
                                         }       
                                 }else { //no names file, you are identical to yourself 
-                                       seqPNode tempNode(1, seq, seq.getName());
+                    int numRep = 1;
+                    if (countfile != "") { numRep = ct.getNumSeqs(seq.getName()); }
+                                       seqPNode tempNode(numRep, seq, seq.getName());
                                         alignSeqs.push_back(tempNode);
                                         lengths.insert(seq.getAligned().length());
                                 }
                         }
                 }
                 inFasta.close();
-               //inNames.close();
          
          if (lengths.size() > 1) { m->control_pressed = true; m->mothurOut("[ERROR]: your sequences are not all the same length. pre.cluster requires sequences to be aligned."); m->mothurOutEndLine(); }
          else if (lengths.size() == 1) { length = *(lengths.begin()); }
@@ -613,13 +653,15 @@ int PreClusterCommand::readFASTA(){
         }
  }
  /**************************************************************************************************/
-int PreClusterCommand::loadSeqs(map<string, string>& thisName, vector<Sequence>& thisSeqs){
+int PreClusterCommand::loadSeqs(map<string, string>& thisName, vector<Sequence>& thisSeqs, string group){
         try {
                 set<int> lengths;
                 alignSeqs.clear();
                 map<string, string>::iterator it;
                 bool error = false;
-                       
+        map<string, int> thisCount;
+        if (countfile != "") { thisCount = cparser->getCountTable(group);  }
+               
                 for (int i = 0; i < thisSeqs.size(); i++) {
                         
                         if (m->control_pressed) { return 0; }
@@ -641,12 +683,20 @@ int PreClusterCommand::loadSeqs(map<string, string>& thisName, vector<Sequence>&
                      lengths.insert(thisSeqs[i].getAligned().length());
                                 }       
                         }else { //no names file, you are identical to yourself 
-                               seqPNode tempNode(1, thisSeqs[i], thisSeqs[i].getName());
+                int numRep = 1;
+                if (countfile != "") { 
+                    map<string, int>::iterator it2 = thisCount.find(thisSeqs[i].getName());
+                    
+                    //should never be true since parser checks for this
+                    if (it2 == thisCount.end()) { m->mothurOut(thisSeqs[i].getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); error = true; }
+                    else { numRep = it2->second;  }
+                }
+                               seqPNode tempNode(numRep, thisSeqs[i], thisSeqs[i].getName());
                                 alignSeqs.push_back(tempNode);
                                 lengths.insert(thisSeqs[i].getAligned().length());
                         }
                 }
-               
+    
          if (lengths.size() > 1) { error = true; m->mothurOut("[ERROR]: your sequences are not all the same length. pre.cluster requires sequences to be aligned."); m->mothurOutEndLine(); }
          else if (lengths.size() == 1) { length = *(lengths.begin()); }
          
@@ -683,10 +733,84 @@ int PreClusterCommand::calcMisMatches(string seq1, string seq2){
                 exit(1);
         }
  }
+/**************************************************************************************************/
+
+//Folds the per group cluster results back into the count table and writes it out.
+//  newcount - file the updated count table is printed to
+//  newname  - temporary file with one line per cluster: group, representative, comma separated merged names. Removed when done.
+//  newfasta - fasta file of representatives; when running by group, repeated sequence names are reduced to their first copy
+//Returns 0.
+int PreClusterCommand::mergeGroupCounts(string newcount, string newname, string newfasta){
+       try {
+               ifstream inNames;
+               m->openInputFile(newname, inNames);
+               
+               string group, first, second;
+               while (!inNames.eof()) {
+                       if (m->control_pressed) { break; }
+                       inNames >> group; m->gobble(inNames);
+                       inNames >> first; m->gobble(inNames);
+                       inNames >> second;
+                       
+                       //an empty or truncated file fails the extraction, do not merge a partial record
+                       if (inNames.fail()) { break; }
+                       m->gobble(inNames);
+                       
+                       vector<string> names;
+                       m->splitAtComma(second, names);
+                       
+                       //names[0] is skipped, the representative's own count is looked up through 'first'
+                       int total = ct.getGroupCount(first, group);
+                       for (int i = 1; i < names.size(); i++) {
+                               total += ct.getGroupCount(names[i], group);
+                               ct.setAbund(names[i], group, 0);
+                       }
+                       ct.setAbund(first, group, total);
+               }
+               inNames.close();
+               
+               //sequences left with a total of zero were merged into another sequence in every group
+               vector<string> namesOfSeqs = ct.getNamesOfSeqs();
+               for (int i = 0; i < namesOfSeqs.size(); i++) {
+                       if (ct.getNumSeqs(namesOfSeqs[i]) == 0) {
+                               ct.remove(namesOfSeqs[i]);
+                       }
+               }
+               
+               ct.printTable(newcount); 
+               m->mothurRemove(newname);
+               
+               if (bygroup) { //if by group, must remove the duplicate seqs that are named the same
+                       ifstream in;
+                       m->openInputFile(newfasta, in);
+                       
+                       ofstream out;
+                       m->openOutputFile(newfasta+"temp", out);
+                       
+                       set<string> already;
+                       while(!in.eof()) {
+                               if (m->control_pressed) { break; }
+                               
+                               Sequence seq(in); m->gobble(in);
+                               
+                               //insert reports whether the name is new, so only the first copy of a name is printed
+                               if (seq.getName() != "") {
+                                       if (already.insert(seq.getName()).second) { seq.printSequence(out); }
+                               }
+                       }
+                       in.close();
+                       out.close();
+                       m->mothurRemove(newfasta);
+                       m->renameFile(newfasta+"temp", newfasta);
+               }
+               return 0;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "PreClusterCommand", "mergeGroupCounts");
+               exit(1);
+       }
+}
  
  /**************************************************************************************************/
  
-void PreClusterCommand::printData(string newfasta, string newname){
+void PreClusterCommand::printData(string newfasta, string newname, string group){
         try {
                 ofstream outFasta;
                 ofstream outNames;
@@ -699,10 +823,14 @@ void PreClusterCommand::printData(string newfasta, string newname){
                         m->openOutputFile(newname, outNames);
                 }
                 
+        if ((countfile != "") && (group == ""))  { outNames << "Representative_Sequence\ttotal\n";  }
                 for (int i = 0; i < alignSeqs.size(); i++) {
                         if (alignSeqs[i].numIdentical != 0) {
                                 alignSeqs[i].seq.printSequence(outFasta); 
-                               outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl;
+                               if (countfile != "") {  
+                    if (group != "") {  outNames << group << '\t' << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; }
+                    else {  outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].numIdentical << endl;  }
+                }else {  outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl;  }
                         }
                 }
                 
diff --git a/preclustercommand.h b/preclustercommand.h

index 082bff27e25209fc7db92404388ae40592d45bff..e63afa6c8af8ee442e267714fcfccd140d4f79dd 100644 (file)
--- a/preclustercommand.h
+++ b/preclustercommand.h
@@ -15,6 +15,7 @@
  #include "command.hpp"
  #include "sequence.hpp"
  #include "sequenceparser.h"
+#include "sequencecountparser.h"
  
  /************************************************************/
  struct seqPNode {
@@ -28,7 +29,13 @@ struct seqPNode {
         ~seqPNode() {}
  };
  /************************************************************/
-inline bool comparePriority(seqPNode first, seqPNode second) {  return (first.numIdentical > second.numIdentical); }
+//Sort predicate: larger numIdentical first, ties broken by descending sequence name.
+inline bool comparePriority(seqPNode first, seqPNode second) {  
+    if (first.numIdentical != second.numIdentical) { return (first.numIdentical > second.numIdentical); }
+    return (first.seq.getName() > second.seq.getName());
+}
  //************************************************************/
  
  class PreClusterCommand : public Command {
@@ -58,9 +65,13 @@ private:
                 linePair(int i, int j) : start(i), end(j) {}
         };
         
+    SequenceParser* parser;
+    SequenceCountParser* cparser;
+    CountTable ct;
+    
         int diffs, length, processors;
         bool abort, bygroup;
-       string fastafile, namefile, outputDir, groupfile;
+       string fastafile, namefile, outputDir, groupfile, countfile;
         vector<seqPNode> alignSeqs; //maps the number of identical seqs to a sequence
         map<string, string> names; //represents the names file first column maps to second column
         map<string, int> sizes;  //this map a seq name to the number of identical seqs in the names file
@@ -73,11 +84,12 @@ private:
         void readNameFile();
         //int readNamesFASTA();
         int calcMisMatches(string, string);
-       void printData(string, string); //fasta filename, names file name
+       void printData(string, string, string); //fasta filename, names file name
         int process(string);
-       int loadSeqs(map<string, string>&, vector<Sequence>&);
-       int driverGroups(SequenceParser*, string, string, string, int, int, vector<string> groups);
-       int createProcessesGroups(SequenceParser*, string, string, string, vector<string>);
+       int loadSeqs(map<string, string>&, vector<Sequence>&, string);
+       int driverGroups(string, string, string, int, int, vector<string> groups);
+       int createProcessesGroups(string, string, string, vector<string>);
+    int mergeGroupCounts(string, string, string);
  };
  
  /**************************************************************************************************/
@@ -87,7 +99,7 @@ private:
  struct preClusterData {
         string fastafile; 
         string namefile; 
-       string groupfile;
+       string groupfile, countfile;
         string newFName, newNName, newMName;
         MothurOut* m;
         int start;
@@ -97,7 +109,7 @@ struct preClusterData {
         vector<string> mapFileNames;
         
         preClusterData(){}
-       preClusterData(string f, string n, string g, string nff,  string nnf, string nmf, vector<string> gr, MothurOut* mout, int st, int en, int d, int tid) {
+       preClusterData(string f, string n, string g, string c, string nff,  string nnf, string nmf, vector<string> gr, MothurOut* mout, int st, int en, int d, int tid) {
                 fastafile = f;
                 namefile = n;
                 groupfile = g;
@@ -110,6 +122,7 @@ struct preClusterData {
                 diffs = d;
                 threadID = tid;
                 groups = gr;
+        countfile = c;
         }
  };
  
@@ -124,10 +137,15 @@ static DWORD WINAPI MyPreclusterThreadFunction(LPVOID lpParam){
                 
                 //parse fasta and name file by group
                 SequenceParser* parser;
-               if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile);      }
-               else                                                    { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile);                                            }
-               
-               int numSeqs = 0;
+        SequenceCountParser* cparser;
+        if (pDataArray->countfile != "") {
+            cparser = new SequenceCountParser(pDataArray->countfile, pDataArray->fastafile);
+        }else {
+            if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile); }
+            else                               { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile);                    }
+        }
+        
+               int numSeqs = 0;
                 vector<seqPNode> alignSeqs;
                 //clear out old files
                 ofstream outF; pDataArray->m->openOutputFile(pDataArray->newFName, outF); outF.close();
@@ -143,8 +161,13 @@ static DWORD WINAPI MyPreclusterThreadFunction(LPVOID lpParam){
                         pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Processing group " + pDataArray->groups[k] + ":"); pDataArray->m->mothurOutEndLine();
                         
                         map<string, string> thisNameMap;
-                       if (pDataArray->namefile != "") { thisNameMap = parser->getNameMap(pDataArray->groups[k]); }
-                       vector<Sequence> thisSeqs = parser->getSeqs(pDataArray->groups[k]);
+            vector<Sequence> thisSeqs;
+                       if (pDataArray->groupfile != "") { 
+                thisSeqs = parser->getSeqs(pDataArray->groups[k]);
+            }else if (pDataArray->countfile != "") {
+                thisSeqs = cparser->getSeqs(pDataArray->groups[k]);
+            }
+                       if (pDataArray->namefile != "") {  thisNameMap = parser->getNameMap(pDataArray->groups[k]); }
                         
                         //fill alignSeqs with this groups info.
                         ////////////////////////////////////////////////////
@@ -154,6 +177,9 @@ static DWORD WINAPI MyPreclusterThreadFunction(LPVOID lpParam){
                         alignSeqs.clear();
                         map<string, string>::iterator it;
                         bool error = false;
+            map<string, int> thisCount;
+            if (pDataArray->countfile != "") { thisCount = cparser->getCountTable(pDataArray->groups[k]);  }
+
                         
                         for (int i = 0; i < thisSeqs.size(); i++) {
                                 
@@ -176,8 +202,16 @@ static DWORD WINAPI MyPreclusterThreadFunction(LPVOID lpParam){
                                                 if (thisSeqs[i].getAligned().length() > length) {  length = thisSeqs[i].getAligned().length();  }
                                         }       
                                 }else { //no names file, you are identical to yourself 
-                                       seqPNode tempNode(1, thisSeqs[i], thisSeqs[i].getName());
-                                       alignSeqs.push_back(tempNode);
+                                       int numRep = 1;
+                    if (pDataArray->countfile != "") { 
+                        map<string, int>::iterator it2 = thisCount.find(thisSeqs[i].getName());
+                        
+                        //should never be true since parser checks for this
+                        if (it2 == thisCount.end()) { pDataArray->m->mothurOut(thisSeqs[i].getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); error = true; }
+                        else { numRep = it2->second;  }
+                    }
+                    seqPNode tempNode(numRep, thisSeqs[i], thisSeqs[i].getName());
+                    alignSeqs.push_back(tempNode);
                                         if (thisSeqs[i].getAligned().length() > length) {  length = thisSeqs[i].getAligned().length();  }
                                 }
                         }
@@ -274,7 +308,9 @@ static DWORD WINAPI MyPreclusterThreadFunction(LPVOID lpParam){
                         for (int i = 0; i < alignSeqs.size(); i++) {
                                 if (alignSeqs[i].numIdentical != 0) {
                                         alignSeqs[i].seq.printSequence(outFasta); 
-                                       outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl;
+                                       if (pDataArray->countfile != "") {  outNames << pDataArray->groups[k] << '\t' << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; 
+                    }else {  outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl;  }
+
                                 }
                         }
                         
diff --git a/randomforest.cpp b/randomforest.cpp

new file mode 100644 (file)

index 0000000..36a2c1a
--- /dev/null
+++ b/randomforest.cpp
@@ -0,0 +1,156 @@
+//
+//  randomforest.cpp
+//  Mothur
+//
+//  Created by Sarah Westcott on 10/2/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "randomforest.hpp" 
+
+/***********************************************************************/
+
+//Forwards the data set, tree count and split criterion to AbstractRandomForest and grabs the shared MothurOut instance.
+RandomForest::RandomForest(const vector <vector<int> > dataSet,const int numDecisionTrees,
+             const string treeSplitCriterion = "informationGain") : AbstractRandomForest(dataSet, numDecisionTrees, treeSplitCriterion),
+             m(MothurOut::getInstance()) {
+}
+
+/***********************************************************************/
+// DONE
+//Reports the out of bag error rate of the forest through mothurOut. Returns 0.
+int RandomForest::calcForrestErrorRate() {
+    try {
+        int numCorrect = 0;
+        map<int, vector<int> >::iterator it;
+        for (it = globalOutOfBagEstimates.begin(); it != globalOutOfBagEstimates.end(); it++) {
+            
+            if (m->control_pressed) { return 0; }
+            
+            //the position of the largest vote count is the forest's prediction for this sample
+            const vector<int>& votes = it->second;
+            int majorityVotedOutcome = (int)(max_element(votes.begin(), votes.end()) - votes.begin());
+            
+            //the real outcome is read from column numFeatures of the sample's row
+            int realOutcome = dataSet[it->first][numFeatures];
+            if (realOutcome == majorityVotedOutcome) { numCorrect++; }
+        }
+        
+        // TODO: save or return forrestErrorRate for future use;
+        double fractionCorrect = (double)numCorrect / (double)globalOutOfBagEstimates.size();
+        double forrestErrorRate = 1 - fractionCorrect;
+        
+        m->mothurOut("numCorrect = " + toString(numCorrect)+ "\n");
+        m->mothurOut("forrestErrorRate = " + toString(forrestErrorRate)+ "\n");
+    
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "RandomForest", "calcForrestErrorRate");
+               exit(1);
+       } 
+}
+
+/***********************************************************************/
+// DONE
+//Averages each feature's importance over all trees and writes the features with positive
+//importance to filename as "OTU<tab>Rank", ordered by VariableRankDescendingSorterDouble.
+//Returns 0. If a stored tree is not a DecisionTree an error is reported, control_pressed is set and nothing is written.
+int RandomForest::calcForrestVariableImportance(string filename) {
+    try {
+        //NOTE: if you are going to dynamically cast, aren't you undoing the advantage of abstraction. Why abstract at all?
+        //could cause maintenance issues later if other types of Abstract decision trees are created that cannot be cast as a decision tree.
+        for (int i = 0; i < decisionTrees.size(); i++) {
+            if (m->control_pressed) { return 0; }
+            DecisionTree* decisionTree = dynamic_cast<DecisionTree*>(decisionTrees[i]);
+            
+            //dynamic_cast yields NULL when the tree is not a DecisionTree, stop instead of dereferencing it
+            if (decisionTree == NULL) {
+                m->mothurOut("[ERROR]: decision tree " + toString(i) + " is not a DecisionTree, unable to calculate variable importance.\n");
+                m->control_pressed = true;
+                return 0;
+            }
+            
+            for (int j = 0; j < numFeatures; j++) {
+                globalVariableImportanceList[j] += (double)decisionTree->variableImportanceList[j];
+            }
+        }
+        
+        //the summed value is echoed to stdout before it is turned into a per tree average
+        for (int i = 0;  i < numFeatures; i++) {
+            cout << "[" << i << ',' << globalVariableImportanceList[i] << "], ";
+            globalVariableImportanceList[i] /= (double)numDecisionTrees;
+        }
+        
+        //each rank is a pair: [0] = feature index, [1] = averaged importance. Features with no importance are left out.
+        vector< vector<double> > globalVariableRanks;
+        for (int i = 0; i < globalVariableImportanceList.size(); i++) {
+            if (globalVariableImportanceList[i] > 0) {
+                vector<double> globalVariableRank(2, 0);
+                globalVariableRank[0] = i; globalVariableRank[1] = globalVariableImportanceList[i];
+                globalVariableRanks.push_back(globalVariableRank);
+            }
+        }
+        
+        VariableRankDescendingSorterDouble variableRankDescendingSorter;
+        sort(globalVariableRanks.begin(), globalVariableRanks.end(), variableRankDescendingSorter);
+        
+        ofstream out;
+        m->openOutputFile(filename, out);
+        out <<"OTU\tRank\n";
+        for (int i = 0; i < globalVariableRanks.size(); i++) {
+            //the index is stored as a double in the rank pair, convert it once before indexing
+            int featureIndex = (int)globalVariableRanks[i][0];
+            out << m->currentBinLabels[featureIndex] << '\t' << globalVariableImportanceList[featureIndex] << endl;
+        }
+        out.close();
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "RandomForest", "calcForrestVariableImportance");
+               exit(1);
+       }  
+}
+/***********************************************************************/
+// DONE
+//Builds numDecisionTrees decision trees, folding each tree's out of bag estimates into the
+//forest wide tallies before storing it in decisionTrees. Returns 0, also when interrupted by control_pressed.
+int RandomForest::populateDecisionTrees() {
+    try {
+        
+        for (int i = 0; i < numDecisionTrees; i++) {
+            if (m->control_pressed) { return 0; }
+            if (((i+1) % 10) == 0) {  m->mothurOut("Creating " + toString(i+1) + " (th) Decision tree\n");  }
+            // TODO: need to first fix if we are going to use pointer based system or anything else
+            DecisionTree* decisionTree = new DecisionTree(dataSet, globalDiscardedFeatureIndices, OptimumFeatureSubsetSelector("log2"), treeSplitCriterion);
+            
+            //until the tree is stored in decisionTrees nothing else owns it, so free it before bailing out
+            decisionTree->calcTreeVariableImportanceAndError();
+            if (m->control_pressed) { delete decisionTree; return 0; }
+            updateGlobalOutOfBagEstimates(decisionTree);
+            if (m->control_pressed) { delete decisionTree; return 0; }
+            decisionTree->purgeDataSetsFromTree();
+            if (m->control_pressed) { delete decisionTree; return 0; }
+            decisionTrees.push_back(decisionTree);
+        }
+        
+        if (m->debug) {
+            // m->mothurOut("globalOutOfBagEstimates = " + toStringVectorMap(globalOutOfBagEstimates)+ "\n");
+        }
+        
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "RandomForest", "populateDecisionTrees");
+        exit(1);
+    }  
+}
+/***********************************************************************/
+// TODO: need to finalize between reference and pointer for DecisionTree [partially solved]
+// TODO: make this pure virtual in superclass
+// DONE
+//Adds one vote per out of bag sample of decisionTree to the forest wide tally for the class the tree predicted. Returns 0.
+int RandomForest::updateGlobalOutOfBagEstimates(DecisionTree* decisionTree) {
+    try {
+        map<int, int>::iterator estimate;
+        for (estimate = decisionTree->outOfBagEstimates.begin(); estimate != decisionTree->outOfBagEstimates.end(); estimate++) {
+            
+            if (m->control_pressed) { return 0; }
+            
+            int sample = estimate->first;
+            int predictedClass = estimate->second;
+            
+            //a sample seen for the first time starts with zero votes for every output class
+            map<int, vector<int> >::iterator votes = globalOutOfBagEstimates.find(sample);
+            if (votes == globalOutOfBagEstimates.end()) {
+                votes = globalOutOfBagEstimates.insert(make_pair(sample, vector<int>(decisionTree->numOutputClasses, 0))).first;
+            }
+            
+            votes->second[predictedClass] += 1;
+        }
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "RandomForest", "updateGlobalOutOfBagEstimates");
+        exit(1);
+    }  
+}
+/***********************************************************************/
+
+
diff --git a/randomforest.hpp b/randomforest.hpp

new file mode 100755 (executable)

index 0000000..716d1a1
--- /dev/null
+++ b/randomforest.hpp
@@ -0,0 +1,45 @@
+//
+//  randomforest.hpp
+//  rrf-fs-prototype
+//
+//  Created by Abu Zaher Faridee on 7/20/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#ifndef rrf_fs_prototype_randomforest_hpp
+#define rrf_fs_prototype_randomforest_hpp
+
+#include "macros.h"
+#include "abstractrandomforest.hpp"
+#include "decisiontree.hpp"
+
+//Random forest built from DecisionTree objects on top of AbstractRandomForest.
+class RandomForest: public AbstractRandomForest {
+    
+public:
+    
+    // DONE
+    RandomForest(const vector <vector<int> > dataSet,const int numDecisionTrees, const string);
+    
+    
+    //NOTE:: if you are going to dynamically cast, aren't you undoing the advantage of abstraction. Why abstract at all?
+    //could cause maintenance issues later if other types of Abstract decision trees are created that cannot be cast as a decision tree.
+    virtual ~RandomForest() {
+        for (int i = 0; i < decisionTrees.size(); i++) {
+            // the trees are expected to be DecisionTrees, so each one is cast with dynamic_cast<DecisionTree*>
+            // and destroyed through that type
+            delete dynamic_cast<DecisionTree*>(decisionTrees[i]);
+        }
+    }
+    
+    int calcForrestErrorRate();                     //reports the out of bag error rate
+    int calcForrestVariableImportance(string);      //writes the ranked feature importances to the named file
+    int populateDecisionTrees();                    //builds the trees of the forest
+    int updateGlobalOutOfBagEstimates(DecisionTree* decisionTree);
+    
+private:
+    MothurOut* m;
+    
+};
+
+#endif
diff --git a/readcluster.cpp b/readcluster.cpp

index b6cb71de5fdbb888f4a7c1614987f74cd0bf8b1a..a6adabb5b8aa155d3b633f885fd37f7ba051465f 100644 (file)
--- a/readcluster.cpp
+++ b/readcluster.cpp
@@ -42,6 +42,26 @@ int ReadCluster::read(NameAssignment*& nameMap){
         }
  }
  /***********************************************************************/
+//Builds the list vector for a count table based run and sets OutPutFile, the distance file to be used. Returns 0.
+int ReadCluster::read(CountTable*& ct){
+       try {
+               
+               //phylip matrices are converted to column format, the conversion builds the list itself
+               if (format != "phylip") { list = new ListVector(ct->getListVector());  }
+               else { convertPhylip2Column(ct); }
+               
+               if (m->control_pressed) { return 0; }
+               
+               if (!sortWanted) {  OutPutFile = distFile;   } //for use by clusters splitMatrix to convert a phylip matrix to column
+               else {  OutPutFile = m->sortFile(distFile, outputDir);  }
+               
+               return 0;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ReadCluster", "read");
+               exit(1);
+       }
+}
+/***********************************************************************/
  
  int ReadCluster::convertPhylip2Column(NameAssignment*& nameMap){
         try {   
@@ -224,6 +244,181 @@ int ReadCluster::convertPhylip2Column(NameAssignment*& nameMap){
  }
  /***********************************************************************/
  
+//Converts the phylip matrix in distFile (square or lower triangle) into a column formatted
+//distance file named <root of distFile>column.dist, keeping only distances below cutoff, and
+//points distFile at the new file. Builds the list vector from ct, or from the matrix names when
+//ct is NULL, in which case a new CountTable holding those names is created and handed back through ct.
+//Returns 0, also when interrupted by control_pressed.
+int ReadCluster::convertPhylip2Column(CountTable*& ct){
+       try {   
+               //convert phylip file to column file
+               map<int, string> rowToName;
+               
+               ifstream in;
+               ofstream out;
+               string tempFile = distFile + ".column.temp";
+               
+               m->openInputFile(distFile, in);  m->gobble(in);
+               m->openOutputFile(tempFile, out);
+               
+               float distance;
+               int nseqs;
+               int square = 0; //stays lower triangle if the file ends before the first row can be classified
+               string name;
+               vector<string> matrixNames;
+               
+               string numTest;
+               in >> numTest >> name;
+               
+               if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting."); m->mothurOutEndLine(); exit(1); }
+               else { convert(numTest, nseqs); }
+               
+               rowToName[0] = name;
+               matrixNames.push_back(name);
+               
+               if(ct == NULL){
+                       list = new ListVector(nseqs);
+                       list->set(0, name);
+               }
+               else{  list = new ListVector(ct->getListVector()); }
+               
+               //distances after the first name mean a square matrix, a bare newline means lower triangle.
+               //get() returns an int so that EOF can be told apart from every valid character.
+               int d;
+               while((d=in.get()) != EOF){
+                       
+                       if(isalnum(d)){
+                               square = 1;
+                               in.putback((char)d);
+                               for(int i=0;i<nseqs;i++){
+                                       in >> distance;
+                               }
+                               break;
+                       }
+                       if(d == '\n'){
+                               square = 0;
+                               break;
+                       }
+               }
+               
+               //row i holds i distances in a lower triangle matrix and nseqs distances in a square one,
+               //either way only the entries below the diagonal (j < i) are written out
+               for(int i=1;i<nseqs;i++){
+                       in >> name;
+                       rowToName[i] = name;
+                       matrixNames.push_back(name);
+                       
+                       if(ct == NULL){ list->set(i, name); }
+                       
+                       int numInRow = (square == 0) ? i : nseqs;
+                       for(int j=0;j<numInRow;j++){
+                               
+                               if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(tempFile); return 0; }
+                               
+                               in >> distance;
+                               
+                               if (distance == -1) { distance = 1000000; }
+                               
+                               if(distance < cutoff && j < i){
+                                       out << i << '\t' << j << '\t' << distance << endl;
+                               }
+                       }
+               }
+               
+               list->setLabel("0");
+               in.close();
+               out.close();
+               
+               if(ct == NULL){
+                       ct = new CountTable();
+                       for(int i=0;i<matrixNames.size();i++){
+                               ct->push_back(matrixNames[i]);
+                       }
+               }
+               
+               //second pass: replace the row indices in the temp file with the sequence names
+               ifstream in2;
+               ofstream out2;
+               
+               string outputFile = m->getRootName(distFile) + "column.dist";
+               m->openInputFile(tempFile, in2);
+               m->openOutputFile(outputFile, out2);
+               
+               int first, second;
+               float dist;
+               
+               while (in2) {
+                       if (m->control_pressed) { in2.close(); out2.close(); m->mothurRemove(tempFile); m->mothurRemove(outputFile); return 0; }
+                       
+                       //only write a record that was really read, the temp file is empty when no distance is below the cutoff
+                       if (!(in2 >> first >> second >> dist)) { break; }
+                       out2 << rowToName[first] << '\t' << rowToName[second] << '\t' << dist << endl;
+                       m->gobble(in2);
+               }
+               in2.close();
+               out2.close();
+               
+               m->mothurRemove(tempFile);
+               distFile = outputFile;
+               
+               if (m->control_pressed) {  m->mothurRemove(outputFile);  }
+               
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ReadCluster", "convertPhylip2Column");
+               exit(1);
+       }
+}
+/***********************************************************************/
+
  ReadCluster::~ReadCluster(){}
  /***********************************************************************/
  
diff --git a/readcluster.h b/readcluster.h

index a838dace17f35547421c03b0dfc8dcc6efdc2317..7ea579c2fc1a250bba3500f7d6dc59d3ebed26f8 100644 (file)
--- a/readcluster.h
+++ b/readcluster.h
@@ -13,6 +13,7 @@
  #include "mothur.h"
  #include "nameassignment.hpp"
  #include "listvector.hpp"
+#include "counttable.h"
  
  
  /******************************************************/
@@ -23,6 +24,7 @@ public:
         ReadCluster(string, float, string, bool);
         ~ReadCluster();
         int read(NameAssignment*&);
+    int read(CountTable*&);
         string getOutputFile() { return OutPutFile; }
         void setFormat(string f) { format = f;  }
         ListVector* getListVector()             {       return list;    }
@@ -36,6 +38,7 @@ private:
         bool sortWanted;
         
         int convertPhylip2Column(NameAssignment*&);
+    int convertPhylip2Column(CountTable*&);
  };
  
  /******************************************************/
diff --git a/readmatrix.hpp b/readmatrix.hpp

index 90d5b430de4a1ce73e5a9c03880333c4b39febad..bc3874e7f424d24a4a3dbf1465559c916d7423f0 100644 (file)
--- a/readmatrix.hpp
+++ b/readmatrix.hpp
@@ -16,7 +16,6 @@
  #include "counttable.h"
  #include "sparsedistancematrix.h"
  
-class SparseMatrix;
  
  class ReadMatrix {
  
diff --git a/readtree.cpp b/readtree.cpp

index 6fa4c3da814347cc60c0807bd16cb373fff8c962..71c4bd5b96fcd1cc435f2954616899fdde3eedb8 100644 (file)
--- a/readtree.cpp
+++ b/readtree.cpp
@@ -20,12 +20,12 @@ ReadTree::ReadTree() {
         }
  }
  /***********************************************************************/
-int ReadTree::AssembleTrees(map<string, string> nameMap) {
+int ReadTree::AssembleTrees() {
          try {
                  //assemble users trees
                  for (int i = 0; i < Trees.size(); i++) {
                          if (m->control_pressed) { return 0;  }
-                        Trees[i]->assembleTree(nameMap);
+                        Trees[i]->assembleTree();
                  }
                  return 0;
          }
@@ -107,7 +107,7 @@ float ReadTree::readBranchLength(istream& f) {
  /***********************************************************************/
  //This class reads a file in Newick form and stores it in a tree.
  
-int ReadNewickTree::read(TreeMap* tmap) {
+int ReadNewickTree::read(CountTable* ct) {
         try {
                 holder = "";
                 int c, error;
@@ -129,12 +129,12 @@ int ReadNewickTree::read(TreeMap* tmap) {
                                 }
  
                                 //make new tree
-                               T = new Tree(tmap); 
+                               T = new Tree(ct); 
  
                                 numNodes = T->getNumNodes();
                                 numLeaves = T->getNumLeaves();
                                 
-                               error = readTreeString(tmap); 
+                               error = readTreeString(ct); 
                                 
                                 //save trees for later commands
                                 Trees.push_back(T); 
@@ -143,9 +143,9 @@ int ReadNewickTree::read(TreeMap* tmap) {
                 //if you are a nexus file
                 }else if ((c = filehandle.peek()) == '#') {
                         //get right number of seqs from nexus file.
-                       Tree* temp = new Tree(tmap);  delete temp;
+                       Tree* temp = new Tree(ct);  delete temp;
                         
-                       nexusTranslation(tmap);  //reads file through the translation and updates treemap
+                       nexusTranslation(ct);  //reads file through the translation block and renames the sequences in the count table
                         while((c = filehandle.peek()) != EOF) { 
                                 // get past comments
                                 while ((c = filehandle.peek()) != EOF) {        
@@ -166,12 +166,12 @@ int ReadNewickTree::read(TreeMap* tmap) {
                                 filehandle.putback(c);  //put back first ( of tree.
                                 
                                 //make new tree
-                               T = new Tree(tmap); 
+                               T = new Tree(ct); 
                                 numNodes = T->getNumNodes();
                                 numLeaves = T->getNumLeaves();
                                 
                                 //read tree info
-                               error = readTreeString(tmap); 
+                               error = readTreeString(ct); 
                                  
                                 //save trees for later commands
                                 Trees.push_back(T); 
@@ -191,7 +191,7 @@ int ReadNewickTree::read(TreeMap* tmap) {
  }
  /**************************************************************************************************/
  //This function read the file through the translation of the sequences names and updates treemap.
-string ReadNewickTree::nexusTranslation(TreeMap* tmap) {
+string ReadNewickTree::nexusTranslation(CountTable* ct) {
         try {
                 
                 holder = "";
@@ -209,42 +209,14 @@ string ReadNewickTree::nexusTranslation(TreeMap* tmap) {
                         filehandle >> holder; 
                         if(holder == "tree" && comment != 1){return holder;}
                 }
-               
-               //update treemap
-               tmap->namesOfSeqs.clear();
-               
-               /*char c;
-               string number, name;
-               while ((c = filehandle.peek()) != EOF) {        
-                       
-                       filehandle >> number; 
-                       
-                       if ((number == "tree") || (number == ";") ) {  name = number; break;  }
-                       
-                       filehandle >> name; 
-                       
-                       char lastChar;
-                       if (name.length() != 0) { lastChar = name[name.length()-1]; }
-                       
-                       if ((name == "tree") || (name == ";") ) {  break;  }
-                       
-                       if (lastChar == ',') {  name.erase(name.end()-1); } //erase the comma
-                       */      
-               
+    
                 string number, name;
                 for(int i=0;i<numSeqs;i++){
                         
                         filehandle >> number;
                         filehandle >> name;
                         name.erase(name.end()-1);  //erase the comma
-                       
-                       //insert new one with new name
-                       string group = tmap->getGroup(name);
-                       tmap->treemap[toString(number)].groupname = group;
-                       tmap->treemap[toString(number)].vectorIndex = tmap->getIndex(name);
-                       //erase old one.  so treemap[sarah].groupnumber is now treemap[1].groupnumber. if number is 1 and name is sarah.
-                       tmap->treemap.erase(name);
-                       tmap->namesOfSeqs.push_back(number);
+                       ct->renameSeq(name, toString(number));
                 }
                 
                 return name;
@@ -256,7 +228,7 @@ string ReadNewickTree::nexusTranslation(TreeMap* tmap) {
  }
  
  /**************************************************************************************************/
-int ReadNewickTree::readTreeString(TreeMap* tmap) {
+int ReadNewickTree::readTreeString(CountTable* ct) {
         try {
                 
                 int n = 0;
@@ -269,7 +241,7 @@ int ReadNewickTree::readTreeString(TreeMap* tmap) {
                 if(ch == '('){
                         n = numLeaves;  //number of leaves / sequences, we want node 1 to start where the leaves left off
  
-                       lc = readNewickInt(filehandle, n, T, tmap);
+                       lc = readNewickInt(filehandle, n, T, ct);
                         if (lc == -1) { m->mothurOut("error with lc"); m->mothurOutEndLine(); return -1; } //reports an error in reading
         
                         if(filehandle.peek()==','){                                                     
@@ -281,7 +253,7 @@ int ReadNewickTree::readTreeString(TreeMap* tmap) {
                         }       
                 
                         if(rooted != 1){                                                                
-                               rc = readNewickInt(filehandle, n, T, tmap);
+                               rc = readNewickInt(filehandle, n, T, ct);
                                 if (rc == -1) { m->mothurOut("error with rc"); m->mothurOutEndLine(); return -1; } //reports an error in reading
                                 if(filehandle.peek() == ')'){                                   
                                         readSpecialChar(filehandle,')',"right parenthesis");
@@ -326,7 +298,7 @@ int ReadNewickTree::readTreeString(TreeMap* tmap) {
  }
  /**************************************************************************************************/
  
-int ReadNewickTree::readNewickInt(istream& f, int& n, Tree* T, TreeMap* tmap) {
+int ReadNewickTree::readNewickInt(istream& f, int& n, Tree* T, CountTable* ct) {
         try {
                 
                 if (m->control_pressed) { return -1; } 
@@ -339,7 +311,7 @@ int ReadNewickTree::readNewickInt(istream& f, int& n, Tree* T, TreeMap* tmap) {
                         //read all children
                         vector<int> childrenNodes;
                         while(f.peek() != ')'){
-                               int child = readNewickInt(f, n, T, tmap);
+                               int child = readNewickInt(f, n, T, ct);
                                 if (child == -1) { return -1; } //reports an error in reading
                 //cout << "child = " << child << endl;          
                                 childrenNodes.push_back(child);
@@ -387,12 +359,7 @@ int ReadNewickTree::readNewickInt(istream& f, int& n, Tree* T, TreeMap* tmap) {
                         }else{
                                 T->tree[n].setBranchLength(0.0); 
                         }                                               
-                       
-                       //T->tree[n].setChildren(lc,rc);
-                       //T->tree[lc].setParent(n);
-                       //T->tree[rc].setParent(n);
-                       //T->printTree();  cout << endl;
-                       
+                                               
                         return n++;
                 
                 }else{
@@ -410,33 +377,27 @@ int ReadNewickTree::readNewickInt(istream& f, int& n, Tree* T, TreeMap* tmap) {
                         f.putback(d);
                 
                         //set group info
-                       string group = tmap->getGroup(name);
+                       vector<string> group = ct->getGroups(name);
                         
                         //find index in tree of name
                         int n1 = T->getIndex(name);
                         
                         //adds sequence names that are not in group file to the "xxx" group
-                       if(group == "not found") {
+                       if(group.size() == 0) {
                                 m->mothurOut("Name: " + name + " is not in your groupfile, and will be disregarded. \n");  //readOk = -1; return n1;
                                 
-                               tmap->namesOfSeqs.push_back(name);
-                               tmap->treemap[name].groupname = "xxx";
-                               
-                               map<string, int>::iterator it;
-                               it = tmap->seqsPerGroup.find("xxx");
-                               if (it == tmap->seqsPerGroup.end()) { //its a new group
-                                       tmap->addGroup("xxx");
-                                       tmap->seqsPerGroup["xxx"] = 1;
-                               }else {
-                                       tmap->seqsPerGroup["xxx"]++;
-                               }
-                               
-                               group = "xxx";
-                       }
-                       
-                       vector<string> tempGroup; tempGroup.push_back(group);
-                       
-                       T->tree[n1].setGroup(tempGroup);
+                vector<string> currentGroups = ct->getNamesOfGroups();
+                if (!m->inUsersGroups("xxx", currentGroups)) {  ct->addGroup("xxx");  }
+                currentGroups = ct->getNamesOfGroups();
+                vector<int> thisCounts; thisCounts.resize(currentGroups.size(), 0);
+                for (int h = 0; h < currentGroups.size(); h++) {  
+                    if (currentGroups[h] == "xxx") {  thisCounts[h] = 1;  break; }
+                }
+                ct->push_back(name, thisCounts);
+                
+                               group.push_back("xxx");
+                       }                       
+                       T->tree[n1].setGroup(group);
                         T->tree[n1].setChildren(-1,-1);
                 
                         if(blen == 1){  
diff --git a/readtree.h b/readtree.h

index 6b074de839070a86bd3e010bbf222a7ec4011dce..8a692432c7cda431c6f5d73f409a0fa8e9152671 100644 (file)
--- a/readtree.h
+++ b/readtree.h
@@ -11,6 +11,7 @@
  
  #include "mothur.h"
  #include "tree.h"
+#include "counttable.h"
  
  #define MAX_LINE               513
  #define SKIPLINE(f,c)  {while((c=f.get())!=EOF && ((c) != '\n')){}}
@@ -24,17 +25,17 @@ class ReadTree {
                 ReadTree(); 
                 virtual ~ReadTree() {};
                 
-               virtual int read(TreeMap*) = 0;
+               virtual int read(CountTable*) = 0;
                 int readSpecialChar(istream&, char, string);
                 int readNodeChar(istream& f);
                 float readBranchLength(istream& f);
         
                 vector<Tree*> getTrees() { return Trees; }
-               int AssembleTrees(map<string, string>);
+               int AssembleTrees();
                 
         protected:
                 vector<Tree*> Trees;
-               TreeMap* treeMap;
+               CountTable* ct;
                 int numNodes, numLeaves;
                 MothurOut* m;
                 
@@ -48,13 +49,13 @@ class ReadNewickTree : public ReadTree {
  public:
         ReadNewickTree(string file) : treeFile(file) { m->openInputFile(file, filehandle); readOk = 0; } 
         ~ReadNewickTree() {};
-       int read(TreeMap*);
+       int read(CountTable*);
         
  private:
         Tree* T;
-       int readNewickInt(istream&, int&, Tree*, TreeMap*);
-       int readTreeString(TreeMap*);
-       string nexusTranslation(TreeMap*);
+       int readNewickInt(istream&, int&, Tree*, CountTable*);
+       int readTreeString(CountTable*);
+       string nexusTranslation(CountTable*);
         ifstream filehandle;
         string treeFile;
         string holder;
diff --git a/removegroupscommand.cpp b/removegroupscommand.cpp

index 05b1170bf4501ebd3131a616fdcd58a9b49e59f2..a29906c5f41769c93b3675b88eb1267fa2eb7127 100644 (file)
--- a/removegroupscommand.cpp
+++ b/removegroupscommand.cpp
@@ -18,9 +18,9 @@ vector<string> RemoveGroupsCommand::setParameters(){
         try {
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "FNGLT",false,false); parameters.push_back(pfasta);
                 CommandParameter pshared("shared", "InputTypes", "", "", "none", "sharedGroup", "none",false,false); parameters.push_back(pshared);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "sharedGroup", "FNGLT",false,false); parameters.push_back(pgroup);
-        CommandParameter pdesign("design", "InputTypes", "", "", "none", "sharedGroup", "FNGLT",false,false); parameters.push_back(pdesign);
+        CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "sharedGroup", "FNGLT",false,false); parameters.push_back(pgroup);         CommandParameter pdesign("design", "InputTypes", "", "", "none", "sharedGroup", "FNGLT",false,false); parameters.push_back(pdesign);
                 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "FNGLT",false,false); parameters.push_back(plist);
                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FNGLT",false,false); parameters.push_back(ptaxonomy);
                 CommandParameter paccnos("accnos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(paccnos);
@@ -41,9 +41,9 @@ vector<string> RemoveGroupsCommand::setParameters(){
  string RemoveGroupsCommand::getHelpString(){   
         try {
                 string helpString = "";
-               helpString += "The remove.groups command removes sequences from a specfic group or set of groups from the following file types: fasta, name, group, list, taxonomy, design or sharedfile.\n";
+               helpString += "The remove.groups command removes sequences from a specfic group or set of groups from the following file types: fasta, name, group, count, list, taxonomy, design or sharedfile.\n";
                 helpString += "It outputs a file containing the sequences NOT in the those specified groups, or with a sharedfile eliminates the groups you selected.\n";
-               helpString += "The remove.groups command parameters are accnos, fasta, name, group, list, taxonomy, shared, design and groups. The group parameter is required, unless you have a current group file or are using a sharedfile.\n";
+               helpString += "The remove.groups command parameters are accnos, fasta, name, group, list, taxonomy, shared, design and groups. The group or count parameter is required, unless you have a current group or count file or are using a sharedfile.\n";
                 helpString += "You must also provide an accnos containing the list of groups to remove or set the groups parameter to the groups you wish to remove.\n";
                 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like removed.  You can separate group names with dashes.\n";
                 helpString += "The remove.groups command should be in the following format: remove.groups(accnos=yourAccnos, fasta=yourFasta, group=yourGroupFile).\n";
@@ -71,6 +71,7 @@ string RemoveGroupsCommand::getOutputFileNameTag(string type, string inputName="
              else if (type == "taxonomy")    {   outputFileName =  "pick" + m->getExtension(inputName);   }
              else if (type == "name")        {   outputFileName =  "pick" + m->getExtension(inputName);   }
              else if (type == "group")       {   outputFileName =  "pick" + m->getExtension(inputName);   }
+            else if (type == "count")       {   outputFileName =  "pick.count_table";   }
              else if (type == "list")        {   outputFileName =  "pick" + m->getExtension(inputName);   }
              else if (type == "shared")      {   outputFileName =  "pick" + m->getExtension(inputName);   }
              else if (type == "design")      {   outputFileName =  "pick" + m->getExtension(inputName);   }
@@ -96,6 +97,7 @@ RemoveGroupsCommand::RemoveGroupsCommand(){
                 outputTypes["list"] = tempOutNames;
                 outputTypes["shared"] = tempOutNames;
          outputTypes["design"] = tempOutNames;
+        outputTypes["count"] = tempOutNames;
         }
         catch(exception& e) {
                 m->errorOut(e, "RemoveGroupsCommand", "RemoveGroupsCommand");
@@ -134,6 +136,7 @@ RemoveGroupsCommand::RemoveGroupsCommand(string option)  {
                         outputTypes["list"] = tempOutNames;
                         outputTypes["shared"] = tempOutNames;
              outputTypes["design"] = tempOutNames;
+            outputTypes["count"] = tempOutNames;
                         
                         
                         //if the user changes the output directory command factory will send this info to us in the output parameter 
@@ -207,6 +210,14 @@ RemoveGroupsCommand::RemoveGroupsCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["design"] = inputDir + it->second;           }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a count file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
                         
                         
@@ -258,12 +269,22 @@ RemoveGroupsCommand::RemoveGroupsCommand(string option)  {
                         else if (sharedfile == "not found") {  sharedfile = "";  }
                         else { m->setSharedFile(sharedfile); }
                         
-                       groupfile = validParameter.validFile(parameters, "group", true);
-                       if (groupfile == "not open") { groupfile = ""; abort = true; }
-                       else if (groupfile == "not found") {    groupfile = ""; }
-                       else { m->setGroupFile(groupfile); }    
                         
-                       if ((sharedfile == "") && (groupfile == "") && (designfile == "")) { 
+                       countfile = validParameter.validFile(parameters, "count", true);
+            if (countfile == "not open") { countfile = ""; abort = true; }
+            else if (countfile == "not found") { countfile = "";  }    
+            else { m->setCountTableFile(countfile); }
+            
+            if ((namefile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
+            }
+            
+            if ((groupfile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
+            }
+            
+                       
+                       if ((sharedfile == "") && (groupfile == "") && (designfile == "") && (countfile == "")) { 
                                 //is there are current file available for any of these?
                                 if ((namefile != "") || (fastafile != "") || (listfile != "") || (taxfile != "")) {
                                         //give priority to group, then shared
@@ -273,7 +294,11 @@ RemoveGroupsCommand::RemoveGroupsCommand(string option)  {
                                                 sharedfile = m->getSharedFile(); 
                                                 if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); }
                                                 else { 
-                                                       m->mothurOut("You have no current groupfile or sharedfile and one is required."); m->mothurOutEndLine(); abort = true;
+                                                       countfile = m->getCountTableFile(); 
+                            if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
+                            else { 
+                                m->mothurOut("You have no current groupfile, countfile or sharedfile and one is required."); m->mothurOutEndLine(); abort = true;
+                            }
                                                 }
                                         }
                                 }else {
@@ -287,7 +312,12 @@ RemoveGroupsCommand::RemoveGroupsCommand(string option)  {
                                                         designfile = m->getDesignFile(); 
                              if (designfile != "") { m->mothurOut("Using " + designfile + " as input file for the design parameter."); m->mothurOutEndLine(); }
                              else { 
-                                m->mothurOut("You have no current groupfile or sharedfile or designfile and one is required."); m->mothurOutEndLine(); abort = true;
+                                countfile = m->getCountTableFile(); 
+                                if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
+                                else { 
+                                    m->mothurOut("You have no current groupfile, designfile, countfile or sharedfile and one is required."); m->mothurOutEndLine(); abort = true;
+                                }
+                                
                              }
                                                 }
                                         }
@@ -296,14 +326,15 @@ RemoveGroupsCommand::RemoveGroupsCommand(string option)  {
                         
                         if ((accnosfile == "") && (Groups.size() == 0)) { m->mothurOut("You must provide an accnos file containing group names or specify groups using the groups parameter."); m->mothurOutEndLine(); abort = true; }
                         
-                       if ((fastafile == "") && (namefile == "") && (groupfile == "")  && (sharedfile == "") && (designfile == "") && (listfile == "") && (taxfile == ""))  { m->mothurOut("You must provide at least one of the following: fasta, name, taxonomy, group, shared, design or list."); m->mothurOutEndLine(); abort = true; }
-                       if ((groupfile == "") && ((namefile != "") || (fastafile != "") || (listfile != "") || (taxfile != "")))  { m->mothurOut("If using a fasta, name, taxonomy, group or list, then you must provide a group file."); m->mothurOutEndLine(); abort = true; }
-                       
-                       if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){
-                               vector<string> files; files.push_back(fastafile); files.push_back(taxfile);
-                               parser.getNameFile(files);
-                       }
-               
+                       if ((fastafile == "") && (namefile == "") && (countfile == "") && (groupfile == "")  && (designfile == "") && (sharedfile == "") && (listfile == "") && (taxfile == ""))  { m->mothurOut("You must provide at least one of the following: fasta, name, taxonomy, group, shared, design, count or list."); m->mothurOutEndLine(); abort = true; }
+                       if (((groupfile == "") && (countfile == "")) && ((namefile != "") || (fastafile != "") || (listfile != "") || (taxfile != "")))  { m->mothurOut("If using a fasta, name, taxonomy, group or list, then you must provide a group or count file."); m->mothurOutEndLine(); abort = true; }
+            
+            if (countfile == "") {
+                if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){
+                    vector<string> files; files.push_back(fastafile); files.push_back(taxfile);
+                    parser.getNameFile(files);
+                }
+            }
                 }
                 
         }
@@ -337,7 +368,28 @@ int RemoveGroupsCommand::execute(){
                         fillNames();
                         
                         delete groupMap;
-               }
+               }else if (countfile != ""){
+            if ((fastafile != "") || (listfile != "") || (taxfile != "")) { 
+                m->mothurOut("\n[NOTE]: The count file should contain only unique names, so mothur assumes your fasta, list and taxonomy files also contain only uniques.\n\n");
+            }
+            CountTable ct;
+            ct.readTable(countfile);
+            if (!ct.hasGroupInfo()) { m->mothurOut("[ERROR]: your count file does not contain group info, aborting.\n"); return 0; }
+            
+            vector<string> gNamesOfGroups = ct.getNamesOfGroups();
+            SharedUtil util;
+            util.setGroups(Groups, gNamesOfGroups);
+            vector<string> namesOfSeqs = ct.getNamesOfSeqs();
+            sort(Groups.begin(), Groups.end());
+            
+            for (int i = 0; i < namesOfSeqs.size(); i++) {
+                vector<string> thisSeqsGroups = ct.getGroups(namesOfSeqs[i]);
+                if (m->isSubset(Groups, thisSeqsGroups)) { //this sequence only occurs in the groups being removed, so add it to names for removal
+                    names.insert(namesOfSeqs[i]);
+                }
+            }
+        }
+
                                 
                 if (m->control_pressed) { return 0; }
                 
@@ -345,6 +397,7 @@ int RemoveGroupsCommand::execute(){
                 if (namefile != "")                     {               readName();             }
                 if (fastafile != "")            {               readFasta();    }
                 if (groupfile != "")            {               readGroup();    }
+        if (countfile != "")           {               readCount();    }
                 if (listfile != "")                     {               readList();             }
                 if (taxfile != "")                      {               readTax();              }
                 if (sharedfile != "")           {               readShared();   }
@@ -394,6 +447,11 @@ int RemoveGroupsCommand::execute(){
                         if (itTypes != outputTypes.end()) {
                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setDesignFile(current); }
                         }
+            
+            itTypes = outputTypes.find("count");
+                       if (itTypes != outputTypes.end()) {
+                               if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
+                       }
                 }
                 
                 return 0;               
@@ -762,6 +820,87 @@ int RemoveGroupsCommand::readGroup(){
         }
  }
  //**********************************************************************************************************************
+// Writes a filtered copy of countfile, named with the "count" output tag: the group
+// columns listed in Groups are dropped, sequences whose names are in the names set
+// are skipped, and each kept row's total is recomputed from the columns that remain.
+// The new file is recorded in outputTypes["count"] and outputNames.
+// Returns 0, including when control_pressed interrupts the run (the partial output is removed first).
+int RemoveGroupsCommand::readCount(){
+       try {
+               string thisOutputDir = outputDir;
+               if (outputDir == "") {  thisOutputDir += m->hasPath(countfile);  }
+               string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile);
+               
+               ofstream out;
+               m->openOutputFile(outputFileName, out);
+               
+               ifstream in;
+               m->openInputFile(countfile, in);
+               
+               bool wroteSomething = false;
+               int removedCount = 0;
+               
+        //the first line holds the column headers: name, total, then one column per group
+        string headers = m->getline(in); m->gobble(in);
+        vector<string> columnHeaders = m->splitWhiteSpace(headers);
+        
+        vector<string> groups;
+        map<int, string> originalGroupIndexes; //file column position -> group name
+        map<string, int> GroupIndexes;         //group name -> position in the sorted group list
+        set<int> indexOfGroupsChosen;          //sorted positions of the groups we are keeping
+        for (int i = 2; i < columnHeaders.size(); i++) {  groups.push_back(columnHeaders[i]);  originalGroupIndexes[i-2] = columnHeaders[i]; }
+        //sort groups to keep consistent with how we store the groups in groupmap
+        sort(groups.begin(), groups.end());
+        for (int i = 0; i < groups.size(); i++) {  GroupIndexes[groups[i]] = i; }
+
+               //every group the user did not ask to remove stays in the output
+               vector<string> groupsToKeep;
+               for (int i = 0; i < groups.size(); i++) {
+                       if (!m->inUsersGroups(groups[i], Groups)) { groupsToKeep.push_back(groups[i]); }
+               }
+        sort(groupsToKeep.begin(), groupsToKeep.end());
+        out << "Representative_Sequence\ttotal\t";
+        for (int i = 0; i < groupsToKeep.size(); i++) { out << groupsToKeep[i] << '\t'; indexOfGroupsChosen.insert(GroupIndexes[groupsToKeep[i]]); }
+        out << endl;
+        
+        string name; int oldTotal;
+        while (!in.eof()) {
+            
+            if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
+            
+            in >> name; m->gobble(in); in >> oldTotal; m->gobble(in);
+            if (m->debug) { m->mothurOut("[DEBUG]: " + name + '\t' + toString(oldTotal) + "\n"); }
+            
+            if (names.count(name) == 0) {
+                //read every group column, filing the counts we keep under their sorted position so the
+                //values are written in the same order as the sorted header, whatever the input column order was
+                vector<int> keptCounts(groups.size(), 0); int thisTotal = 0;
+                for (int i = 0; i < groups.size(); i++) {  
+                    int thisIndex = GroupIndexes[originalGroupIndexes[i]]; 
+                    int temp = 0;
+                    in >> temp;  m->gobble(in);
+                    if (indexOfGroupsChosen.count(thisIndex) != 0) { //we want this group
+                        keptCounts[thisIndex] = temp; thisTotal += temp;
+                    }
+                }
+                
+                out << name << '\t' << thisTotal << '\t';
+                //a set iterates in ascending order, which is the order the header columns were written in
+                for (set<int>::iterator it = indexOfGroupsChosen.begin(); it != indexOfGroupsChosen.end(); it++) {  out << keptCounts[*it] << '\t'; }
+                out << endl;
+                
+                wroteSomething = true;
+                //only the counts from the dropped group columns were removed from this sequence
+                removedCount+= (oldTotal - thisTotal);
+            }else {  m->getline(in); removedCount += oldTotal; } //skip the rest of the row, the whole sequence is removed
+            
+            m->gobble(in);
+        }
+        in.close();
+               out.close();
+               
+               if (wroteSomething == false) {  m->mothurOut("Your file contains only sequences from the groups you wish to remove."); m->mothurOutEndLine();  }
+               outputTypes["count"].push_back(outputFileName); outputNames.push_back(outputFileName);
+               
+               m->mothurOut("Removed " + toString(removedCount) + " sequences from your count file."); m->mothurOutEndLine();
+        
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "RemoveGroupsCommand", "readCount");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
  int RemoveGroupsCommand::readDesign(){
         try {
                 string thisOutputDir = outputDir;
diff --git a/removegroupscommand.h b/removegroupscommand.h

index c6db38057f8c206ecbba0806486078eccdcc7278..c36998ad8bb52349d3c5d0ee73812dd8a6f50c07 100644 (file)
--- a/removegroupscommand.h
+++ b/removegroupscommand.h
@@ -36,7 +36,7 @@ public:
         
  private:
         set<string> names;
-       string accnosfile, fastafile, namefile, groupfile, designfile, listfile, taxfile, outputDir, groups, sharedfile;
+       string accnosfile, fastafile, namefile, groupfile, countfile, designfile, listfile, taxfile, outputDir, groups, sharedfile;
         bool abort;
         vector<string> outputNames, Groups;
         GroupMap* groupMap;
@@ -49,6 +49,7 @@ private:
         int readShared();
         int readName();
         int readGroup();
+    int readCount();
         int readList();
         int readTax();
         int fillNames();
diff --git a/removelineagecommand.cpp b/removelineagecommand.cpp

index 4cec90f567c764e8770987c150b67b5ececc0d35..2b930b5e2450cc15b7871923fb02dbcc1c5824ee 100644 (file)
--- a/removelineagecommand.cpp
+++ b/removelineagecommand.cpp
@@ -10,13 +10,15 @@
  #include "removelineagecommand.h"
  #include "sequence.hpp"
  #include "listvector.hpp"
+#include "counttable.h"
  
  //**********************************************************************************************************************
  vector<string> RemoveLineageCommand::setParameters(){  
         try {
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pfasta);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pname);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pgroup);
+        CommandParameter pname("name", "InputTypes", "", "", "NameCount", "FNGLT", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "FNGLT", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "FNGLT", "none",false,false); parameters.push_back(pgroup);
                 CommandParameter plist("list", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(plist);
                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "FNGLT", "none",false,true); parameters.push_back(ptaxonomy);
                 CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(palignreport);
@@ -38,9 +40,9 @@ vector<string> RemoveLineageCommand::setParameters(){
  string RemoveLineageCommand::getHelpString(){  
         try {
                 string helpString = "";
-               helpString += "The remove.lineage command reads a taxonomy file and any of the following file types: fasta, name, group, list or alignreport file.\n";
+               helpString += "The remove.lineage command reads a taxonomy file and any of the following file types: fasta, name, group, count, list or alignreport file.\n";
                 helpString += "It outputs a file containing only the sequences from the taxonomy file that are not from the taxon you requested to be removed.\n";
-               helpString += "The remove.lineage command parameters are taxon, fasta, name, group, list, taxonomy, alignreport and dups.  You must provide taxonomy unless you have a valid current taxonomy file.\n";
+               helpString += "The remove.lineage command parameters are taxon, fasta, name, group, list, taxonomy, count, alignreport and dups.  You must provide taxonomy unless you have a valid current taxonomy file.\n";
                 helpString += "The dups parameter allows you to add the entire line from a name file if you add any name from the line. default=false. \n";
                 helpString += "The taxon parameter allows you to select the taxons you would like to remove, and is required.\n";
                 helpString += "You may enter your taxons with confidence scores, doing so will remove only those sequences that belong to the taxonomy and whose cofidence scores fall below the scores you give.\n";
@@ -72,6 +74,7 @@ string RemoveLineageCommand::getOutputFileNameTag(string type, string inputName=
              else if (type == "name")        {   outputFileName =  "pick" + m->getExtension(inputName);   }
              else if (type == "group")       {   outputFileName =  "pick" + m->getExtension(inputName);   }
              else if (type == "list")        {   outputFileName =  "pick" + m->getExtension(inputName);   }
+            else if (type == "count")       {   outputFileName =  "pick.count_table";   }
              else if (type == "alignreport")      {   outputFileName =  "pick.align.report";   }
              else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
          }
@@ -94,6 +97,7 @@ RemoveLineageCommand::RemoveLineageCommand(){
                 outputTypes["group"] = tempOutNames;
                 outputTypes["alignreport"] = tempOutNames;
                 outputTypes["list"] = tempOutNames;
+        outputTypes["count"] = tempOutNames;
         }
         catch(exception& e) {
                 m->errorOut(e, "RemoveLineageCommand", "RemoveLineageCommand");
@@ -131,6 +135,7 @@ RemoveLineageCommand::RemoveLineageCommand(string option)  {
                         outputTypes["group"] = tempOutNames;
                         outputTypes["alignreport"] = tempOutNames;
                         outputTypes["list"] = tempOutNames;
+            outputTypes["count"] = tempOutNames;
                         
                         //if the user changes the output directory command factory will send this info to us in the output parameter 
                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
@@ -187,6 +192,14 @@ RemoveLineageCommand::RemoveLineageCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
  
                         
@@ -223,6 +236,19 @@ RemoveLineageCommand::RemoveLineageCommand(string option)  {
                                 else {  m->mothurOut("You have no current taxonomy file and the taxonomy parameter is required."); m->mothurOutEndLine(); abort = true; }
                         }else { m->setTaxonomyFile(taxfile); }
                         
+            countfile = validParameter.validFile(parameters, "count", true);
+            if (countfile == "not open") { countfile = ""; abort = true; }
+            else if (countfile == "not found") { countfile = "";  }    
+            else { m->setCountTableFile(countfile); }
+            
+            if ((namefile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
+            }
+            
+            if ((groupfile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
+            }
+            
                         string usedDups = "true";
                         string temp = validParameter.validFile(parameters, "dups", false);      
                         if (temp == "not found") { 
@@ -240,14 +266,16 @@ RemoveLineageCommand::RemoveLineageCommand(string option)  {
                         }
                         m->splitAtChar(taxons, listOfTaxons, '-');
                         
-                       if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == ""))  { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy or listfile."); m->mothurOutEndLine(); abort = true; }
+                       if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (countfile == ""))  { m->mothurOut("You must provide one of the following: fasta, name, group, count, alignreport, taxonomy or listfile."); m->mothurOutEndLine(); abort = true; }
                 
                         if ((usedDups != "") && (namefile == "")) {  m->mothurOut("You may only use dups with the name option."); m->mothurOutEndLine();  abort = true; }                       
                         
-                       if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){
-                               vector<string> files; files.push_back(fastafile); files.push_back(taxfile);
-                               parser.getNameFile(files);
-                       }
+                       if (countfile == "") {
+                if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){
+                    vector<string> files; files.push_back(fastafile); files.push_back(taxfile);
+                    parser.getNameFile(files);
+                }
+            }
                         
                 }
  
@@ -265,6 +293,12 @@ int RemoveLineageCommand::execute(){
                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
                 
                 if (m->control_pressed) { return 0; }
+        
+        if (countfile != "") {
+            if ((fastafile != "") || (listfile != "") || (taxfile != "")) { 
+                m->mothurOut("\n[NOTE]: The count file should contain only unique names, so mothur assumes your fasta, list and taxonomy files also contain only uniques.\n\n");
+            }
+        }
                 
                 //read through the correct file and output lines you want to keep
                 if (taxfile != "")                      {               readTax();              }  //fills the set of names to remove
@@ -273,6 +307,7 @@ int RemoveLineageCommand::execute(){
                 if (groupfile != "")            {               readGroup();    }
                 if (alignfile != "")            {               readAlign();    }
                 if (listfile != "")                     {               readList();             }
+        if (countfile != "")           {               readCount();    }
                 
                 
                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);  } return 0; }
@@ -309,6 +344,11 @@ int RemoveLineageCommand::execute(){
                         if (itTypes != outputTypes.end()) {
                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
                         }
+            
+            itTypes = outputTypes.find("count");
+                       if (itTypes != outputTypes.end()) {
+                               if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
+                       }
                 }
                 
                 return 0;               
@@ -511,7 +551,59 @@ int RemoveLineageCommand::readName(){
                 exit(1);
         }
  }
-
+//**********************************************************************************************************************
+// Copies countfile to a new file, named with the "count" output tag, leaving out every
+// sequence whose name is in the names set. The header line is copied unchanged.
+// The new file is recorded in outputTypes["count"] and outputNames.
+// Returns 0, including when control_pressed interrupts the run (the partial output is removed first).
+int RemoveLineageCommand::readCount(){
+       try {
+               string thisOutputDir = outputDir;
+               if (outputDir == "") {  thisOutputDir += m->hasPath(countfile);  }
+               string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile);
+               
+               ofstream out;
+               m->openOutputFile(outputFileName, out);
+               
+               ifstream in;
+               m->openInputFile(countfile, in);
+               
+               bool wroteSomething = false;
+               
+        string headers = m->getline(in); m->gobble(in);
+        out << headers << endl;
+        
+        string name, rest; int thisTotal;
+        while (!in.eof()) {
+            
+            if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
+            
+            in >> name; m->gobble(in); 
+            in >> thisTotal;
+            //read what is left of THIS line only. Skipping whitespace before this read could run past the
+            //end of a line that has no group columns and swallow the next sequence's record into rest.
+            //NOTE(review): assumes m->gobble() skips newlines as well as blanks - confirm.
+            rest = m->getline(in); m->gobble(in);
+            //drop the separator between the total and the first group count (erases everything if rest is all blanks)
+            rest.erase(0, rest.find_first_not_of(" \t"));
+            if (m->debug) { m->mothurOut("[DEBUG]: " + name + '\t' + rest + "\n"); }
+            
+            if (names.count(name) == 0) {
+                out << name << '\t' << thisTotal << '\t' << rest << endl;
+                wroteSomething = true;
+            }
+        }
+        in.close();
+               out.close();
+        
+        //check for groups that have been eliminated
+        //NOTE(review): presumably testGroups reports whether the table needs rewriting because a group lost all of its sequences - confirm against CountTable
+        CountTable ct;
+        if (ct.testGroups(outputFileName)) {
+            ct.readTable(outputFileName);
+            ct.printTable(outputFileName);
+        }
+               
+               if (wroteSomething == false) {  m->mothurOut("Your count file contains only sequences from " + taxons + "."); m->mothurOutEndLine();  }
+               outputTypes["count"].push_back(outputFileName); outputNames.push_back(outputFileName);
+        
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "RemoveLineageCommand", "readCount");
+               exit(1);
+       }
+}
  //**********************************************************************************************************************
  int RemoveLineageCommand::readGroup(){
         try {
@@ -594,15 +686,17 @@ int RemoveLineageCommand::readTax(){
                         
                         bool remove = false;
                         
+            string noQuotesTax = m->removeQuotes(tax);
+            
                         for (int j = 0; j < listOfTaxons.size(); j++) {
-                               string newtax = tax;
+                               string newtax = noQuotesTax;
                                 
                                 //if the users file contains confidence scores we want to ignore them when searching for the taxons, unless the taxon has them
                                 if (!taxonsHasConfidence[j]) {
                                         
-                                       int hasConfidences = tax.find_first_of('(');
+                                       int hasConfidences = noQuotesTax.find_first_of('(');
                                         if (hasConfidences != string::npos) { 
-                                               newtax = tax;
+                                               newtax = noQuotesTax;
                                                 m->removeConfidences(newtax);
                                         }
                                         
@@ -617,7 +711,7 @@ int RemoveLineageCommand::readTax(){
                                         }
                                         
                                 }else{//if taxons has them and you don't them remove taxons
-                                       int hasConfidences = tax.find_first_of('(');
+                                       int hasConfidences = noQuotesTax.find_first_of('(');
                                         if (hasConfidences == string::npos) { 
                                                 
                                                 int pos = newtax.find(noConfidenceTaxons[j]);
@@ -632,10 +726,10 @@ int RemoveLineageCommand::readTax(){
                                         }else { //both have confidences so we want to make sure the users confidences are greater then or equal to the taxons
                                                 //first remove confidences from both and see if the taxonomy exists
                                                 
-                                               string noNewTax = tax;
-                                               int hasConfidences = tax.find_first_of('(');
+                                               string noNewTax = noQuotesTax;
+                                               int hasConfidences = noQuotesTax.find_first_of('(');
                                                 if (hasConfidences != string::npos) { 
-                                                       noNewTax = tax;
+                                                       noNewTax = noQuotesTax;
                                                         m->removeConfidences(noNewTax);
                                                 }
                                                 
diff --git a/removelineagecommand.h b/removelineagecommand.h

index a5caec8e4c3e0f04ec7744489c3f27c0a495e38d..a756d24f583be82eac03634ebcd8496c4fd687b6 100644 (file)
--- a/removelineagecommand.h
+++ b/removelineagecommand.h
@@ -34,12 +34,13 @@ class RemoveLineageCommand : public Command {
         private:
                 set<string> names;
                 vector<string> outputNames, listOfTaxons;
-               string fastafile, namefile, groupfile, alignfile, listfile, taxfile, outputDir, taxons;
+               string fastafile, namefile, groupfile, alignfile, listfile, countfile, taxfile, outputDir, taxons;
                 bool abort, dups;
                 
                 int readFasta();
                 int readName();
                 int readGroup();
+        int readCount();
                 int readAlign();
                 int readList();
                 int readTax();  
diff --git a/removerarecommand.cpp b/removerarecommand.cpp

index 923ca72eccbd2f88bad0b36b0c15fba58f1b829c..ded26bbae7b81082f45710c03066dd27145c95a9 100644 (file)
--- a/removerarecommand.cpp
+++ b/removerarecommand.cpp
@@ -20,7 +20,8 @@ vector<string> RemoveRareCommand::setParameters(){
                 CommandParameter prabund("rabund", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(prabund);
                 CommandParameter psabund("sabund", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(psabund);
                 CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pshared);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pgroup);
+        CommandParameter pcount("count", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
                 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
                 CommandParameter pnseqs("nseqs", "Number", "", "0", "", "", "",false,true); parameters.push_back(pnseqs);
@@ -41,7 +42,7 @@ vector<string> RemoveRareCommand::setParameters(){
  string RemoveRareCommand::getHelpString(){     
         try {
                 string helpString = "";
-               helpString += "The remove.rare command parameters are list, rabund, sabund, shared, group, label, groups, bygroup and nseqs.\n";
+               helpString += "The remove.rare command parameters are list, rabund, sabund, shared, group, count, label, groups, bygroup and nseqs.\n";
                 helpString += "The remove.rare command reads one of the following file types: list, rabund, sabund or shared file. It outputs a new file after removing the rare otus.\n";
                 helpString += "The groups parameter allows you to specify which of the groups you would like analyzed.  Default=all. You may separate group names with dashes.\n";
                 helpString += "The label parameter is used to analyze specific labels in your input. default=all. You may separate label names with dashes.\n";
@@ -72,6 +73,7 @@ string RemoveRareCommand::getOutputFileNameTag(string type, string inputName="")
              else if (type == "sabund")    {   outputFileName =  "pick" + m->getExtension(inputName);   }
              else if (type == "shared")        {   outputFileName =  "pick" + m->getExtension(inputName);   }
              else if (type == "group")       {   outputFileName =  "pick" + m->getExtension(inputName);   }
+            else if (type == "count")       {   outputFileName =  "pick" + m->getExtension(inputName);   }
              else if (type == "list")        {   outputFileName =  "pick" + m->getExtension(inputName);   }
              else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
          }
@@ -93,6 +95,7 @@ RemoveRareCommand::RemoveRareCommand(){
                 outputTypes["sabund"] = tempOutNames;
                 outputTypes["list"] = tempOutNames;
                 outputTypes["group"] = tempOutNames;
+        outputTypes["count"] = tempOutNames;
                 outputTypes["shared"] = tempOutNames;
         }
         catch(exception& e) {
@@ -131,6 +134,7 @@ RemoveRareCommand::RemoveRareCommand(string option)  {
                         outputTypes["list"] = tempOutNames;
                         outputTypes["group"] = tempOutNames;
                         outputTypes["shared"] = tempOutNames;   
+            outputTypes["count"] = tempOutNames;
                         
                         //if the user changes the output directory command factory will send this info to us in the output parameter 
                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
@@ -179,6 +183,14 @@ RemoveRareCommand::RemoveRareCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["shared"] = inputDir + it->second;           }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
                         
                         
@@ -207,6 +219,15 @@ RemoveRareCommand::RemoveRareCommand(string option)  {
                         if (sharedfile == "not open") { sharedfile = "";  abort = true; }
                         else if (sharedfile == "not found") {  sharedfile = "";  }
                         else { m->setSharedFile(sharedfile); }
+            
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not open") { countfile = ""; abort = true; }
+                       else if (countfile == "not found") { countfile = "";  } 
+                       else { m->setCountTableFile(countfile); }
+                               
+            if ((groupfile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
+            }
                         
                         if ((sharedfile == "") && (listfile == "") && (rabundfile == "") && (sabundfile == "")) { 
                                 //is there are current file available for any of these?
@@ -252,7 +273,7 @@ RemoveRareCommand::RemoveRareCommand(string option)  {
                         
                         if (byGroup && (sharedfile == "")) { m->mothurOut("The byGroup parameter is only valid with a shared file."); m->mothurOutEndLine(); }
                         
-                       if ((groupfile != "") && (listfile == "")) { m->mothurOut("A groupfile is only valid with a list file."); m->mothurOutEndLine(); groupfile = ""; }
+                       if (((groupfile != "") || (countfile != "")) && (listfile == "")) { m->mothurOut("A group or count file is only valid with a list file."); m->mothurOutEndLine(); groupfile = ""; countfile = ""; }
                 }
                 
         }
@@ -310,6 +331,11 @@ int RemoveRareCommand::execute(){
                         if (itTypes != outputTypes.end()) {
                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSharedFile(current); }
                         }
+            
+            itTypes = outputTypes.find("count");
+                       if (itTypes != outputTypes.end()) {
+                               if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
+                       }
                 }
                 
                 return 0;               
@@ -327,7 +353,9 @@ int RemoveRareCommand::processList(){
                 string thisOutputDir = outputDir;
                 if (outputDir == "") {  thisOutputDir += m->hasPath(listfile);  }
                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(listfile)) + getOutputFileNameTag("list", listfile);
-               string outputGroupFileName = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)) + getOutputFileNameTag("group", groupfile);            
+               string outputGroupFileName = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)) + getOutputFileNameTag("group", groupfile);
+        string outputCountFileName = thisOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile);
+        
                 ofstream out, outGroup;
                 m->openOutputFile(outputFileName, out);
                 
@@ -374,13 +402,21 @@ int RemoveRareCommand::processList(){
                 
                 //if groupfile is given then use it
                 GroupMap* groupMap;
+        CountTable ct;
                 if (groupfile != "") { 
                         groupMap = new GroupMap(groupfile); groupMap->readMap(); 
                         SharedUtil util;
                         vector<string> namesGroups = groupMap->getNamesOfGroups();
                         util.setGroups(Groups, namesGroups);
                         m->openOutputFile(outputGroupFileName, outGroup);
-               }
+               }else if (countfile != "") {
+            ct.readTable(countfile);
+            if (ct.hasGroupInfo()) {
+                vector<string> namesGroups = ct.getNamesOfGroups();
+                SharedUtil util;
+                util.setGroups(Groups, namesGroups);
+            }
+        }
                 
                 
                 if (list != NULL) {     
@@ -397,6 +433,7 @@ int RemoveRareCommand::processList(){
                                 vector<string> names;
                                 string saveBinNames = binnames;
                                 m->splitAtComma(binnames, names);
+                int binsize = names.size();
                                 
                                 vector<string> newGroupFile;
                                 if (groupfile != "") {
@@ -412,14 +449,38 @@ int RemoveRareCommand::processList(){
                                                         saveBinNames += names[k] + ",";
                                                 }
                                         }
-                                       names = newNames;
+                                       names = newNames; binsize = names.size();
                                         saveBinNames = saveBinNames.substr(0, saveBinNames.length()-1);
-                               }
+                               }else if (countfile != "") {
+                                       saveBinNames = "";
+                    binsize = 0;
+                                       for(int k = 0; k < names.size(); k++) {
+                        if (ct.hasGroupInfo()) {
+                            vector<string> thisSeqsGroups = ct.getGroups(names[k]);
+                            
+                            int thisSeqsCount = 0;
+                            for (int n = 0; n < thisSeqsGroups.size(); n++) {
+                                if (m->inUsersGroups(thisSeqsGroups[n], Groups)) {
+                                    thisSeqsCount += ct.getGroupCount(names[k], thisSeqsGroups[n]);
+                                }
+                            }
+                            binsize += thisSeqsCount;
+                            //if you don't have any seqs from the groups the user wants, then remove you.
+                            if (thisSeqsCount == 0) { newGroupFile.push_back(names[k]); }
+                            else { saveBinNames += names[k] + ","; }
+                        }else {
+                            binsize += ct.getNumSeqs(names[k]); 
+                            saveBinNames += names[k] + ",";
+                        }
+                                       }
+                                       saveBinNames = saveBinNames.substr(0, saveBinNames.length()-1);
+                }
  
-                               if (names.size() > nseqs) { //keep bin
+                               if (binsize > nseqs) { //keep bin
                                         newList.push_back(saveBinNames);
-                                       for(int k = 0; k < newGroupFile.size(); k++) { outGroup << newGroupFile[k] << endl; }
-                               }
+                                       if (groupfile != "") {  for(int k = 0; k < newGroupFile.size(); k++) { outGroup << newGroupFile[k] << endl; }  }
+                    else if (countfile != "") { for(int k = 0; k < newGroupFile.size(); k++) {  ct.remove(newGroupFile[k]); } }  
+                               }else {  if (countfile != "") {  for(int k = 0; k < names.size(); k++) {  ct.remove(names[k]); } }  }
                         }
                         
                         //print new listvector
@@ -431,6 +492,17 @@ int RemoveRareCommand::processList(){
                 
                 out.close();
                 if (groupfile != "") { outGroup.close(); outputTypes["group"].push_back(outputGroupFileName); outputNames.push_back(outputGroupFileName); }
+        if (countfile != "") { 
+            if (ct.hasGroupInfo()) {
+                vector<string> allGroups = ct.getNamesOfGroups();
+                for (int i = 0; i < allGroups.size(); i++) {
+                    if (!m->inUsersGroups(allGroups[i], Groups)) { ct.removeGroup(allGroups[i]); }
+                }
+
+            }
+            ct.printTable(outputCountFileName);
+            outputTypes["count"].push_back(outputCountFileName); outputNames.push_back(outputCountFileName); 
+        }
                 
                 if (wroteSomething == false) {  m->mothurOut("Your file contains only rare sequences."); m->mothurOutEndLine();  }
                 outputTypes["list"].push_back(outputFileName); outputNames.push_back(outputFileName);
diff --git a/removerarecommand.h b/removerarecommand.h

index 2d70ba784bdf9b413e8726f1cd160f62b85980d7..7b4c6fb44f80b4d2ac1a96e9a11d3a417e2cc918 100644 (file)
--- a/removerarecommand.h
+++ b/removerarecommand.h
@@ -36,7 +36,7 @@ public:
         void help() { m->mothurOut(getHelpString()); }  
         
  private:
-       string sabundfile, rabundfile, sharedfile, groupfile, listfile, outputDir, groups, label;
+       string sabundfile, rabundfile, sharedfile, groupfile, countfile, listfile, outputDir, groups, label;
         int nseqs, allLines;
         bool abort, byGroup;
         vector<string> outputNames, Groups;
diff --git a/removeseqscommand.cpp b/removeseqscommand.cpp

index 0d53c1a95ed5b968d0a8281f1f3d612d7d4c2047..00b94a9dac842ceeca4a2a0362d87f14ba12ca1b 100644 (file)
--- a/removeseqscommand.cpp
+++ b/removeseqscommand.cpp
@@ -10,13 +10,15 @@
  #include "removeseqscommand.h"
  #include "sequence.hpp"
  #include "listvector.hpp"
+#include "counttable.h"
  
  //**********************************************************************************************************************
  vector<string> RemoveSeqsCommand::setParameters(){     
         try {
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pfasta);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pname);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pgroup);
+        CommandParameter pname("name", "InputTypes", "", "", "NameCount", "FNGLT", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "FNGLT", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "FNGLT", "none",false,false); parameters.push_back(pgroup);
                 CommandParameter plist("list", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(plist);
                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(ptaxonomy);
                 CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(palignreport);
@@ -39,9 +41,9 @@ vector<string> RemoveSeqsCommand::setParameters(){
  string RemoveSeqsCommand::getHelpString(){     
         try {
                 string helpString = "";
-               helpString += "The remove.seqs command reads an .accnos file and at least one of the following file types: fasta, name, group, list, taxonomy, quality or alignreport file.\n";
+               helpString += "The remove.seqs command reads an .accnos file and at least one of the following file types: fasta, name, group, count, list, taxonomy, quality or alignreport file.\n";
                 helpString += "It outputs a file containing the sequences NOT in the .accnos file.\n";
-               helpString += "The remove.seqs command parameters are accnos, fasta, name, group, list, taxonomy, qfile, alignreport and dups.  You must provide accnos and at least one of the file parameters.\n";
+               helpString += "The remove.seqs command parameters are accnos, fasta, name, group, count, list, taxonomy, qfile, alignreport and dups.  You must provide accnos and at least one of the file parameters.\n";
                 helpString += "The dups parameter allows you to remove the entire line from a name file if you remove any name from the line. default=true. \n";
                 helpString += "The remove.seqs command should be in the following format: remove.seqs(accnos=yourAccnos, fasta=yourFasta).\n";
                 helpString += "Example remove.seqs(accnos=amazon.accnos, fasta=amazon.fasta).\n";
@@ -70,6 +72,7 @@ string RemoveSeqsCommand::getOutputFileNameTag(string type, string inputName="")
              else if (type == "list")        {   outputFileName =  "pick" + m->getExtension(inputName);   }
              else if (type == "qfile")       {   outputFileName =  "pick" + m->getExtension(inputName);   }
              else if (type == "alignreport") {   outputFileName =  "pick.align.report";                   }
+            else if (type == "count")       {   outputFileName =  "pick.count_table";   }
              else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
          }
          return outputFileName;
@@ -93,6 +96,7 @@ RemoveSeqsCommand::RemoveSeqsCommand(){
                 outputTypes["alignreport"] = tempOutNames;
                 outputTypes["list"] = tempOutNames;
                 outputTypes["qfile"] = tempOutNames;
+        outputTypes["count"] = tempOutNames;
         }
         catch(exception& e) {
                 m->errorOut(e, "RemoveSeqsCommand", "RemoveSeqsCommand");
@@ -131,6 +135,7 @@ RemoveSeqsCommand::RemoveSeqsCommand(string option)  {
                         outputTypes["alignreport"] = tempOutNames;
                         outputTypes["list"] = tempOutNames;
                         outputTypes["qfile"] = tempOutNames;
+            outputTypes["count"] = tempOutNames;
                         
                         //if the user changes the output directory command factory will send this info to us in the output parameter 
                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
@@ -203,6 +208,14 @@ RemoveSeqsCommand::RemoveSeqsCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
  
                         
@@ -259,13 +272,28 @@ RemoveSeqsCommand::RemoveSeqsCommand(string option)  {
                                 else                            {  temp = "false"; usedDups = "";       }
                         }
                         dups = m->isTrue(temp);
-                       
-                       if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (qualfile == ""))  { m->mothurOut("You must provide at least one of the following: fasta, name, group, taxonomy, quality, alignreport or list."); m->mothurOutEndLine(); abort = true; }
-                       
-                       if ((fastafile != "") && (namefile == "")) {
-                               vector<string> files; files.push_back(fastafile);
-                               parser.getNameFile(files);
-                       }
+            
+            countfile = validParameter.validFile(parameters, "count", true);
+            if (countfile == "not open") { countfile = ""; abort = true; }
+            else if (countfile == "not found") { countfile = "";  }    
+            else { m->setCountTableFile(countfile); }
+            
+            if ((namefile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
+            }
+            
+            if ((groupfile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
+            }
+                       
+                       if ((countfile == "") && (fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (qualfile == ""))  { m->mothurOut("You must provide at least one of the following: fasta, name, group, taxonomy, quality, alignreport or list."); m->mothurOutEndLine(); abort = true; }
+                       
+            if (countfile == "") {
+                if ((fastafile != "") && (namefile == "")) {
+                    vector<string> files; files.push_back(fastafile);
+                    parser.getNameFile(files);
+                }
+            }
                 }
  
         }
@@ -285,6 +313,12 @@ int RemoveSeqsCommand::execute(){
                 names = m->readAccnos(accnosfile);
                 
                 if (m->control_pressed) { return 0; }
+        
+        if (countfile != "") {
+            if ((fastafile != "") || (listfile != "") || (taxfile != "")) { 
+                m->mothurOut("\n[NOTE]: The count file should contain only unique names, so mothur assumes your fasta, list and taxonomy files also contain only uniques.\n\n");
+            }
+        }
                 
                 //read through the correct file and output lines you want to keep
                 if (namefile != "")                     {               readName();             }
@@ -294,6 +328,7 @@ int RemoveSeqsCommand::execute(){
                 if (listfile != "")                     {               readList();             }
                 if (taxfile != "")                      {               readTax();              }
                 if (qualfile != "")                     {               readQual();             }
+        if (countfile != "")           {               readCount();            }
                 
                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); } return 0; }
         
@@ -333,7 +368,12 @@ int RemoveSeqsCommand::execute(){
                         itTypes = outputTypes.find("qfile");
                         if (itTypes != outputTypes.end()) {
                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
-                       }                       
+                       }       
+            
+            itTypes = outputTypes.find("count");
+                       if (itTypes != outputTypes.end()) {
+                               if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
+                       }
                 }
                 
                 return 0;               
@@ -366,6 +406,12 @@ int RemoveSeqsCommand::readFasta(){
                         if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
                         
                         Sequence currSeq(in);
+            
+            if (!dups) {//adjust name if needed
+                map<string, string>::iterator it = uniqueMap.find(currSeq.getName());
+                if (it != uniqueMap.end()) { currSeq.setName(it->second); }
+            }
+
                         name = currSeq.getName();
                         
                         if (name != "") {
@@ -373,7 +419,7 @@ int RemoveSeqsCommand::readFasta(){
                                 if (names.count(name) == 0) {
                                         wroteSomething = true;
                                         
-                                       currSeq.printSequence(out);
+                    currSeq.printSequence(out);
                                 }else {  removedCount++;  }
                         }
                         m->gobble(in);
@@ -437,6 +483,11 @@ int RemoveSeqsCommand::readQual(){
                         
                         m->gobble(in);
                         
+            if (!dups) {//adjust name if needed
+                map<string, string>::iterator it = uniqueMap.find(saveName);
+                if (it != uniqueMap.end()) { name = ">" + it->second; saveName = it->second; }
+            }
+            
                         if (names.count(saveName) == 0) {
                                 wroteSomething = true;
                                 
@@ -463,6 +514,64 @@ int RemoveSeqsCommand::readQual(){
         }
  }
  //**********************************************************************************************************************
+/*
+ * Writes a copy of the count file that leaves out every sequence named in the
+ * accnos set (names).
+ *
+ * The header line is copied through unchanged.  Each following record is read as
+ * a sequence name, its total, and whatever else is on that line (the per group
+ * counts, when the table has them).  Records whose name is not in names are
+ * written to the output; the totals of the removed records are summed and reported.
+ * Afterwards, if CountTable::testGroups() accepts the new file, it is read and
+ * printed again through CountTable (see the "groups that have been eliminated" step).
+ *
+ * The output file name is registered under outputTypes["count"] and outputNames.
+ * Returns 0.  If a record cannot be parsed, or m->control_pressed is set, the
+ * partial output file is removed and the function returns early.
+ */
+int RemoveSeqsCommand::readCount(){
+       try {
+        
+               string thisOutputDir = outputDir;
+               if (outputDir == "") {  thisOutputDir += m->hasPath(countfile);  }
+               string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile);
+               
+               ofstream out;
+               m->openOutputFile(outputFileName, out);
+               
+               ifstream in;
+               m->openInputFile(countfile, in);
+               
+               bool wroteSomething = false;
+               int removedCount = 0;
+               
+        //first line holds the column headers, copy it through untouched
+        string headers = m->getline(in); m->gobble(in);
+        out << headers << endl;
+        
+        string name, rest;
+        while (!in.eof()) {
+            
+            if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
+            
+            int thisTotal = 0;
+            in >> name >> thisTotal;
+            
+            //a record without a name and a numeric total cannot be processed, and the failed stream would never reach eof
+            if (in.fail()) {
+                m->mothurOut("[ERROR]: unable to read a sequence name and total from your count file, quitting."); m->mothurOutEndLine();
+                in.close();  out.close();  m->mothurRemove(outputFileName);
+                m->control_pressed = true;
+                return 0;
+            }
+            
+            //read what is left of THIS line only.  Do not gobble between the total and the rest of the line:
+            //for a table with no group columns that could skip the line break and pull the next record into rest.
+            rest = m->getline(in); m->gobble(in);
+            
+            //drop the separator(s) that followed the total
+            string::size_type firstData = rest.find_first_not_of(" \t");
+            if (firstData == string::npos) { rest = ""; }
+            else { rest = rest.substr(firstData); }
+            
+            if (m->debug) { m->mothurOut("[DEBUG]: " + name + '\t' + rest + "\n"); }
+            
+            if (names.count(name) == 0) { //not in the accnos file, so keep it
+                out << name << '\t' << thisTotal;
+                if (rest != "") { out << '\t' << rest; }
+                out << endl;
+                wroteSomething = true;
+            }else { removedCount += thisTotal; }
+        }
+        in.close();
+               out.close();
+        
+        //check for groups that have been eliminated
+        CountTable ct;
+        if (ct.testGroups(outputFileName)) {
+            ct.readTable(outputFileName);
+            ct.printTable(outputFileName);
+        }
+
+               
+               if (wroteSomething == false) {  m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine();  }
+               outputTypes["count"].push_back(outputFileName); outputNames.push_back(outputFileName);
+               
+               m->mothurOut("Removed " + toString(removedCount) + " sequences from your count file."); m->mothurOutEndLine();
+        
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "RemoveSeqsCommand", "readCount");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
  int RemoveSeqsCommand::readList(){
         try {
                 string thisOutputDir = outputDir;
@@ -597,6 +706,8 @@ int RemoveSeqsCommand::readName(){
                                                 wroteSomething = true;
                                                 
                                                 out << validSecond[0] << '\t';
+                        //we are changing the unique name in the fasta file
+                        uniqueMap[firstCol] = validSecond[0];
                                                 
                                                 //you know you have at least one valid second since first column is valid
                                                 for (int i = 0; i < validSecond.size()-1; i++) {  out << validSecond[i] << ',';  }
@@ -690,9 +801,15 @@ int RemoveSeqsCommand::readTax(){
                         in >> name;                             //read from first column
                         in >> tax;                      //read from second column
                         
+            if (!dups) {//adjust name if needed
+                map<string, string>::iterator it = uniqueMap.find(name);
+                if (it != uniqueMap.end()) { name = it->second; }
+            }
+            
                         //if this name is in the accnos file
                         if (names.count(name) == 0) {
                                 wroteSomething = true;
+            
                                 out << name << '\t' << tax << endl;
                         }else {  removedCount++;  }
                                         
@@ -742,6 +859,11 @@ int RemoveSeqsCommand::readAlign(){
                         if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
                         
                         in >> name;                             //read from first column
+            
+            if (!dups) {//adjust name if needed
+                map<string, string>::iterator it = uniqueMap.find(name);
+                if (it != uniqueMap.end()) { name = it->second; }
+            }
                         
                         //if this name is in the accnos file
                         if (names.count(name) == 0) {
diff --git a/removeseqscommand.h b/removeseqscommand.h

index 474951a9feaee7d497417cf2a03f206367056241..e26e751032402cd39dac933acf3f57814ab2fc0f 100644 (file)
--- a/removeseqscommand.h
+++ b/removeseqscommand.h
@@ -34,13 +34,15 @@ class RemoveSeqsCommand : public Command {
         
         private:
                 set<string> names;
-               string accnosfile, fastafile, namefile, groupfile, alignfile, listfile, taxfile, qualfile, outputDir;
+               string accnosfile, fastafile, namefile, groupfile, countfile, alignfile, listfile, taxfile, qualfile, outputDir;
                 bool abort, dups;
                 vector<string> outputNames;
+        map<string, string> uniqueMap;
                 
                 int readFasta();
                 int readName();
                 int readGroup();
+        int readCount();
                 int readAlign();
                 int readList();
                 int readTax();
diff --git a/rftreenode.cpp b/rftreenode.cpp

new file mode 100644 (file)

index 0000000..170cfb1
--- /dev/null
+++ b/rftreenode.cpp
@@ -0,0 +1,92 @@
+//
+//  rftreenode.cpp
+//  Mothur
+//
+//  Created by Sarah Westcott on 10/2/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "rftreenode.hpp"
+
+/***********************************************************************/
+/*
+ * Builds a tree node from its bootstrapped training samples.
+ *
+ * bootstrappedTrainingSamples   one row per sample; the row is read as numFeatures
+ *                               feature values followed by the sample's output class
+ *                               at index numFeatures.
+ * globalDiscardedFeatureIndices feature indices that are skipped when the local
+ *                               discard list is built.
+ * numFeatures, numSamples       dimensions used to index bootstrappedTrainingSamples.
+ * numOutputClasses              number of counters used when computing the node entropy.
+ * generation                    stored as given.
+ *
+ * The node starts as a non-leaf with no children or parent and with the output
+ * class, split feature index/value and entropies set to -1.  The constructor then
+ * fills the per-feature vectors and the output vector, builds the local discarded
+ * feature list and computes the node's own entropy.
+ *
+ * The initializers are listed in the order the members are declared in the class,
+ * which is the order they are actually run in.  Inside this constructor the names
+ * numFeatures, numSamples and bootstrappedTrainingSamples refer to the parameters.
+ */
+RFTreeNode::RFTreeNode(vector< vector<int> > bootstrappedTrainingSamples,
+           vector<int> globalDiscardedFeatureIndices,
+           int numFeatures,
+           int numSamples,
+           int numOutputClasses,
+           int generation)
+
+: bootstrappedTrainingSamples(bootstrappedTrainingSamples),
+globalDiscardedFeatureIndices(globalDiscardedFeatureIndices),
+bootstrappedFeatureVectors(numFeatures, vector<int>(numSamples, 0)),
+bootstrappedOutputVector(numSamples, 0),
+numFeatures(numFeatures),
+numSamples(numSamples),
+numOutputClasses(numOutputClasses),
+generation(generation),
+isLeaf(false),
+outputClass(-1),
+splitFeatureIndex(-1),
+splitFeatureValue(-1),
+splitFeatureEntropy(-1.0),
+ownEntropy(-1.0),
+leftChildNode(NULL),
+rightChildNode(NULL),
+parentNode(NULL) {
+    m = MothurOut::getInstance();
+    
+    // transpose the sample matrix: bootstrappedFeatureVectors[feature][sample]
+    for (int i = 0; i < numSamples; i++) {
+        if (m->control_pressed) { break; }
+        for (int j = 0; j < numFeatures; j++) { bootstrappedFeatureVectors[j][i] = bootstrappedTrainingSamples[i][j]; }
+    }
+    
+    // the entry after the last feature of each sample is its output class
+    for (int i = 0; i < numSamples; i++) { if (m->control_pressed) { break; } bootstrappedOutputVector[i] = bootstrappedTrainingSamples[i][numFeatures]; }
+    
+    createLocalDiscardedFeatureList();
+    updateNodeEntropy();
+}
+/***********************************************************************/
+/*
+ * Appends to localDiscardedFeatureIndices every feature index that is absent from
+ * globalDiscardedFeatureIndices and whose bootstrapped values have a standard
+ * deviation that is not positive.
+ * Returns 0; stops early (also returning 0) when m->control_pressed is set.
+ */
+int RFTreeNode::createLocalDiscardedFeatureList(){
+    try {
+
+        for (int featureIndex = 0; featureIndex < numFeatures; featureIndex++) {
+            if (m->control_pressed) { return 0; }
+            
+            // features already discarded globally are not examined again
+            bool discardedGlobally = (find(globalDiscardedFeatureIndices.begin(), globalDiscardedFeatureIndices.end(), featureIndex) != globalDiscardedFeatureIndices.end());
+            if (discardedGlobally) { continue; }
+            
+            double spread = m->getStandardDeviation(bootstrappedFeatureVectors[featureIndex]);
+            if (spread <= 0) {
+                localDiscardedFeatureIndices.push_back(featureIndex);
+            }
+        }
+        
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "RFTreeNode", "createLocalDiscardedFeatureList");
+        exit(1);
+    }  
+}
+/***********************************************************************/
+/*
+ * Computes the base 2 entropy of the output classes in bootstrappedOutputVector
+ * and stores it in ownEntropy.  Classes that do not occur contribute nothing.
+ * Each value in bootstrappedOutputVector is used directly as an index into a
+ * table of numOutputClasses counters.
+ * Returns 0; when m->control_pressed is set it returns early and ownEntropy is
+ * left as it was.
+ */
+int RFTreeNode::updateNodeEntropy() {
+    try {
+        
+        // tally how many samples carry each output class
+        vector<int> countsPerClass(numOutputClasses, 0);
+        for (int sample = 0; sample < bootstrappedOutputVector.size(); sample++) {
+            int label = bootstrappedOutputVector[sample];
+            countsPerClass[label]++;
+        }
+        int sampleTotal = accumulate(countsPerClass.begin(), countsPerClass.end(), 0);
+        
+        double entropySum = 0.0;
+        for (int label = 0; label < countsPerClass.size(); label++) {
+            if (m->control_pressed) { return 0; }
+            if (countsPerClass[label] != 0) {
+                double fraction = (double)countsPerClass[label] / (double)sampleTotal;
+                entropySum += -(fraction * log2(fraction));
+            }
+        }
+        ownEntropy = entropySum;
+        
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "RFTreeNode", "updateNodeEntropy");
+        exit(1);
+    } 
+}
+
+/***********************************************************************/
diff --git a/rftreenode.hpp b/rftreenode.hpp

new file mode 100755 (executable)

index 0000000..8987ebc
--- /dev/null
+++ b/rftreenode.hpp
@@ -0,0 +1,91 @@
+//
+//  rftreenode.hpp
+//  rrf-fs-prototype
+//
+//  Created by Abu Zaher Faridee on 5/29/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#ifndef rrf_fs_prototype_treenode_hpp
+#define rrf_fs_prototype_treenode_hpp
+
+#include "mothurout.h"
+#include "macros.h"
+
+// A node of a decision tree.  It keeps the bootstrapped training samples it was
+// built from, the description of its split, its entropy and the links to its
+// parent and child nodes.  DecisionTree and AbstractDecisionTree are friends and
+// access the private members directly.
+class RFTreeNode{
+    
+public:
+    
+    RFTreeNode(vector< vector<int> > bootstrappedTrainingSamples, vector<int> globalDiscardedFeatureIndices, int numFeatures, int numSamples, int numOutputClasses, int generation);
+    
+    // the destructor does not delete the child or parent nodes
+    // NOTE(review): presumably the tree that owns the nodes frees them - confirm
+    virtual ~RFTreeNode(){}
+    
+    // getters
+    // scalar values are returned by value.  containers are returned by const reference,
+    // so the caller reads the member itself (no copy is made) and cannot modify it.
+    // all getters are const member functions so they can be called on a const node.
+    int getSplitFeatureIndex() const { return splitFeatureIndex; }
+    const vector< vector<int> >& getBootstrappedTrainingSamples() const { return bootstrappedTrainingSamples; }
+    int getSplitFeatureValue() const { return splitFeatureValue; }
+    int getGeneration() const { return generation; }
+    bool checkIsLeaf() const { return isLeaf; }
+    // TODO: fix this const pointer dilemma
+    // we do not want callers to modify the data pointed to by getLeftChildNode
+    RFTreeNode* getLeftChildNode() const { return leftChildNode; }
+    RFTreeNode* getRightChildNode() const { return rightChildNode; }
+    int getOutputClass() const { return outputClass; }
+    int getNumSamples() const { return numSamples; }
+    int getNumFeatures() const { return numFeatures; }
+    const vector<int>& getLocalDiscardedFeatureIndices() const { return localDiscardedFeatureIndices; }
+    const vector< vector<int> >& getBootstrappedFeatureVectors() const { return bootstrappedFeatureVectors; }
+    const vector<int>& getBootstrappedOutputVector() const { return bootstrappedOutputVector; }
+    const vector<int>& getFeatureSubsetIndices() const { return featureSubsetIndices; }
+    double getOwnEntropy() const { return ownEntropy; }
+    
+    // setters
+    void setIsLeaf(bool isLeaf) { this->isLeaf = isLeaf; }
+    void setOutputClass(int outputClass) { this->outputClass = outputClass; }
+    void setFeatureSubsetIndices(vector<int> featureSubsetIndices) { this->featureSubsetIndices = featureSubsetIndices; }
+    void setLeftChildNode(RFTreeNode* leftChildNode) { this->leftChildNode = leftChildNode; }
+    void setRightChildNode(RFTreeNode* rightChildNode) { this->rightChildNode = rightChildNode; }
+    void setParentNode(RFTreeNode* parentNode) { this->parentNode = parentNode; }
+    void setSplitFeatureIndex(int splitFeatureIndex) { this->splitFeatureIndex = splitFeatureIndex; }
+    void setSplitFeatureValue(int splitFeatureValue) { this->splitFeatureValue = splitFeatureValue; }
+    void setSplitFeatureEntropy(double splitFeatureEntropy) { this->splitFeatureEntropy = splitFeatureEntropy; }
+    
+    // TODO: need to remove this mechanism of friend class
+    //NOTE: friend classes can be useful for testing purposes, but I would avoid using them otherwise.
+    friend class DecisionTree;
+    friend class AbstractDecisionTree;
+    
+private:
+    // NOTE: members are initialized in the order they are declared here
+    vector<vector<int> > bootstrappedTrainingSamples;   // one row per sample, as passed to the constructor
+    vector<int> globalDiscardedFeatureIndices;          // as passed to the constructor
+    vector<int> localDiscardedFeatureIndices;           // filled by createLocalDiscardedFeatureList()
+    vector<vector<int> > bootstrappedFeatureVectors;    // transpose of the samples: [feature][sample]
+    vector<int> bootstrappedOutputVector;               // output class of each sample
+    vector<int> featureSubsetIndices;                   // only set through setFeatureSubsetIndices()
+
+    int numFeatures;
+    int numSamples;
+    int numOutputClasses;
+    int generation;
+    bool isLeaf;
+    int outputClass;
+    int splitFeatureIndex;
+    int splitFeatureValue;
+    double splitFeatureEntropy;
+    double ownEntropy;                                  // set by updateNodeEntropy()
+    
+    RFTreeNode* leftChildNode;
+    RFTreeNode* rightChildNode;
+    RFTreeNode* parentNode;
+    
+    MothurOut* m;
+    
+    int createLocalDiscardedFeatureList();
+    int updateNodeEntropy();
+    
+};
+
+#endif
diff --git a/screenseqscommand.cpp b/screenseqscommand.cpp

index 6a9a61323b50a4ea688ba9faa7ea855a233bbab9..2b5ebc1a844420e699b04df0dcd6e144befbf1fe 100644 (file)
--- a/screenseqscommand.cpp
+++ b/screenseqscommand.cpp
@@ -8,14 +8,15 @@
   */
  
  #include "screenseqscommand.h"
-
+#include "counttable.h"
  
  //**********************************************************************************************************************
  vector<string> ScreenSeqsCommand::setParameters(){     
         try {
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
+        CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
                 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
                 CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(palignreport);
                 CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(ptax);
@@ -44,8 +45,8 @@ vector<string> ScreenSeqsCommand::setParameters(){
  string ScreenSeqsCommand::getHelpString(){     
         try {
                 string helpString = "";
-               helpString += "The screen.seqs command reads a fastafile and creates .....\n";
-               helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, qfile, alignreport, taxonomy, optimize, criteria and processors.\n";
+               helpString += "The screen.seqs command reads a fastafile and screens sequences.\n";
+               helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, count, qfile, alignreport, taxonomy, optimize, criteria and processors.\n";
                 helpString += "The fasta parameter is required.\n";
                 helpString += "The alignreport and taxonomy parameters allow you to remove bad seqs from taxonomy and alignreport files.\n";
                 helpString += "The start parameter is used to set a position the \"good\" sequences must start by. The default is -1.\n";
@@ -83,6 +84,7 @@ string ScreenSeqsCommand::getOutputFileNameTag(string type, string inputName="")
              if (type == "fasta")            {   outputFileName =  "good" + m->getExtension(inputName);   }
              else if (type == "taxonomy")    {   outputFileName =  "good" + m->getExtension(inputName);   }
              else if (type == "name")        {   outputFileName =  "good" + m->getExtension(inputName);   }
+            else if (type == "count")        {   outputFileName =  "good" + m->getExtension(inputName);   }
              else if (type == "group")       {   outputFileName =  "good" + m->getExtension(inputName);   }
              else if (type == "accnos")      {   outputFileName =  "bad.accnos";   }
              else if (type == "qfile")       {   outputFileName =  "good" + m->getExtension(inputName);   }
@@ -110,6 +112,7 @@ ScreenSeqsCommand::ScreenSeqsCommand(){
                 outputTypes["accnos"] = tempOutNames;
                 outputTypes["qfile"] = tempOutNames;
                 outputTypes["taxonomy"] = tempOutNames;
+        outputTypes["count"] = tempOutNames;
         }
         catch(exception& e) {
                 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
@@ -149,6 +152,7 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option)  {
                         outputTypes["accnos"] = tempOutNames;
                         outputTypes["qfile"] = tempOutNames;
                         outputTypes["taxonomy"] = tempOutNames;
+            outputTypes["count"] = tempOutNames;
                         
                         //if the user changes the input directory command factory will send this info to us in the output parameter 
                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
@@ -202,6 +206,14 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
  
                         //check for required parameters
@@ -229,6 +241,19 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option)  {
                         else if (namefile == "not found") { namefile = ""; }    
                         else { m->setNameFile(namefile); }
                         
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not open") { countfile = ""; abort = true; }
+                       else if (countfile == "not found") { countfile = "";  } 
+                       else { m->setCountTableFile(countfile); }
+            
+            if ((namefile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
+            }
+                       
+            if ((groupfile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
+            }
+            
                         alignreport = validParameter.validFile(parameters, "alignreport", true);
                         if (alignreport == "not open") { abort = true; }
                         else if (alignreport == "not found") { alignreport = ""; }
@@ -288,10 +313,12 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option)  {
                         temp = validParameter.validFile(parameters, "criteria", false); if (temp == "not found"){       temp = "90";                            }
                         m->mothurConvert(temp, criteria); 
                         
-                       if (namefile == "") {
-                               vector<string> files; files.push_back(fastafile);
-                               parser.getNameFile(files);
-                       }
+                       if (countfile == "") { 
+                if (namefile == "") {
+                    vector<string> files; files.push_back(fastafile);
+                    parser.getNameFile(files);
+                }
+            }
                 }
  
         }
@@ -312,6 +339,11 @@ int ScreenSeqsCommand::execute(){
                 if (optimize.size() != 0) {  //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here
                         //use the namefile to optimize correctly
                         if (namefile != "") { nameMap = m->readNames(namefile); }
+            else if (countfile != "") {
+                CountTable ct;
+                ct.readTable(countfile);
+                nameMap = ct.getNameMap();
+            }
                         getSummary(positions); 
                 } 
                 else { 
@@ -472,7 +504,9 @@ int ScreenSeqsCommand::execute(){
                         screenNameGroupFile(badSeqNames);
                         if (m->control_pressed) {  m->mothurRemove(goodSeqFile);  return 0; }   
                 }else if(groupfile != "")                               {       screenGroupFile(badSeqNames);           }       // this screens just the group
-               
+               else if (countfile != "") {     screenCountFile(badSeqNames);           }
+            
+                
                 if (m->control_pressed) { m->mothurRemove(goodSeqFile);  return 0; }
  
                 if(alignreport != "")                                   {       screenAlignReport(badSeqNames);         }
@@ -519,6 +553,11 @@ int ScreenSeqsCommand::execute(){
                 if (itTypes != outputTypes.end()) {
                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
                 }
+        
+        itTypes = outputTypes.find("count");
+               if (itTypes != outputTypes.end()) {
+                       if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
+               }
  
                 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
                 m->mothurOutEndLine();
@@ -962,7 +1001,69 @@ int ScreenSeqsCommand::screenGroupFile(set<string> badSeqNames){
                 exit(1);
         }
  }
+//***************************************************************************************************************
+int ScreenSeqsCommand::screenCountFile(set<string> badSeqNames){
+       // Writes a screened copy of countfile: the header row is copied through, every
+       // row whose first field (the sequence name) is in badSeqNames is dropped, and all
+       // other rows are kept exactly as they were read.
+       //
+       // badSeqNames is taken by value; names are erased from the local copy as they are
+       // matched so that whatever is left over can be reported as missing from the count file.
+       //
+       // The output file is registered in outputNames and outputTypes["count"].
+       // Always returns 0; if m->control_pressed is set the output file is removed.
+       try {
+               ifstream in;
+               m->openInputFile(countfile, in);
+               set<string>::iterator it;
+               
+               string goodCountFile = outputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile);
+               outputNames.push_back(goodCountFile);  outputTypes["count"].push_back(goodCountFile);
+               ofstream goodCountOut;  m->openOutputFile(goodCountFile, goodCountOut);
+               
+               //the header row is copied through unchanged
+               string headers = m->getline(in); m->gobble(in);
+               goodCountOut << headers << endl;
+               
+               //Rows are read one whole line at a time instead of field by field with >>.
+               //A non-numeric total would otherwise leave the stream failed but not at eof,
+               //so an eof-only loop test could never end.
+               //NOTE(review): gobble() is defined elsewhere; if it skips line breaks as well as
+               //blanks, a field by field read also pulls the following row into the current one
+               //whenever a row has nothing after its total - confirm against gobble().
+               const string blanks = " \t\r";
+               while (in.good()) {
+                       
+                       if (m->control_pressed) { goodCountOut.close(); in.close(); m->mothurRemove(goodCountFile); return 0; }
+                       
+                       string row = m->getline(in); m->gobble(in);
+                       
+                       string::size_type nameStart = row.find_first_not_of(blanks);
+                       if (nameStart == string::npos) { continue; } //nothing on this row
+                       
+                       //first field is the sequence name
+                       string::size_type nameEnd = row.find_first_of(blanks, nameStart);
+                       string name = (nameEnd == string::npos) ? row.substr(nameStart) : row.substr(nameStart, nameEnd - nameStart);
+                       
+                       //a usable row needs at least one more field (the total) after the name
+                       string::size_type totalStart = (nameEnd == string::npos) ? string::npos : row.find_first_not_of(blanks, nameEnd);
+                       if (totalStart == string::npos) {
+                               m->mothurOut("[ERROR]: '" + name + "' has no total in your count file, please correct."); m->mothurOutEndLine();
+                               m->control_pressed = true; //handled by the clean up check after the loop
+                               break;
+                       }
+                       
+                       it = badSeqNames.find(name);
+                       
+                       if (it != badSeqNames.end()) {
+                               //bad sequence: drop the row and remember that we found it
+                               badSeqNames.erase(it);
+                       }
+                       else {
+                               goodCountOut << row << endl;
+                       }
+               }
+               
+               if (m->control_pressed) { goodCountOut.close();  in.close(); m->mothurRemove(goodCountFile);  return 0; }
+               
+               //we were unable to remove some of the bad sequences
+               if (badSeqNames.size() != 0) {
+                       for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
+                               m->mothurOut("Your count file does not include the sequence " + *it + " please correct."); 
+                               m->mothurOutEndLine();
+                       }
+               }
+               
+               in.close();
+               goodCountOut.close();
+               
+               //check for groups that have been eliminated; when testGroups() says so the
+               //table is read back in and printed again over the same file
+               CountTable ct;
+               if (ct.testGroups(goodCountFile)) {
+                       ct.readTable(goodCountFile);
+                       ct.printTable(goodCountFile);
+               }
+               
+               if (m->control_pressed) { m->mothurRemove(goodCountFile);   }
+               
+               return 0;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ScreenSeqsCommand", "screenCountFile");
+               exit(1);
+       }
+}
  //***************************************************************************************************************
  
  int ScreenSeqsCommand::screenAlignReport(set<string> badSeqNames){
diff --git a/screenseqscommand.h b/screenseqscommand.h

index 771113da1bda0ee34859faaa39b2fe7be1bc2672..b0d7c7c18f3852dd40395b0f699749fc885cc14b 100644 (file)
--- a/screenseqscommand.h
+++ b/screenseqscommand.h
@@ -44,6 +44,7 @@ private:
  
         int screenNameGroupFile(set<string>);
         int screenGroupFile(set<string>);
+    int screenCountFile(set<string>);
         int screenAlignReport(set<string>);
         int screenQual(set<string>);
         int screenTaxonomy(set<string>);
@@ -56,7 +57,7 @@ private:
         #endif
  
         bool abort;
-       string fastafile, namefile, groupfile, alignreport, outputDir, qualfile, taxonomy;
+       string fastafile, namefile, groupfile, alignreport, outputDir, qualfile, taxonomy, countfile;
         int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, processors, criteria;
         vector<string> outputNames;
         vector<string> optimize;
diff --git a/secondarystructurecommand.cpp b/secondarystructurecommand.cpp

index ee50ab14e6183af4b2fe9979c97336ac1d9b75a5..4d04270ce649f8b2e579625eae7c074b322d10cb 100644 (file)
--- a/secondarystructurecommand.cpp
+++ b/secondarystructurecommand.cpp
@@ -9,13 +9,16 @@
  
  #include "secondarystructurecommand.h"
  #include "sequence.hpp"
+#include "counttable.h"
  
  //**********************************************************************************************************************
  vector<string> AlignCheckCommand::setParameters(){     
         try {
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
                 CommandParameter pmap("map", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pmap);
-               CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
+               CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount);
+        CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
                 
                 vector<string> myArray;
@@ -31,7 +34,7 @@ vector<string> AlignCheckCommand::setParameters(){
  string AlignCheckCommand::getHelpString(){     
         try {
                 string helpString = "";
-               helpString += "The align.check command reads a fasta file and map file.\n";
+               helpString += "The align.check command reads a fasta file and map file as well as an optional name or count file.\n";
                 helpString += "It outputs a file containing the secondary structure matches in the .align.check file.\n";
                 helpString += "The align.check command parameters are fasta and map, both are required.\n";
                 helpString += "The align.check command should be in the following format: align.check(fasta=yourFasta, map=yourMap).\n";
@@ -134,6 +137,14 @@ AlignCheckCommand::AlignCheckCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
  
                         //check for required parameters
@@ -154,16 +165,25 @@ AlignCheckCommand::AlignCheckCommand(string option)  {
                         else if (namefile == "not found") { namefile = "";  }   
                         else { m->setNameFile(namefile); }
                         
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not open") { abort = true; countfile = ""; }  
+                       else if (countfile == "not found") { countfile = ""; }
+                       else { m->setCountTableFile(countfile); }
+                       
+            if ((countfile != "") && (namefile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
+            
                         //if the user changes the output directory command factory will send this info to us in the output parameter 
                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
                                 outputDir = ""; 
                                 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it       
                         }
                         
-                       if ((namefile == "") && (fastafile != "")){
-                               vector<string> files; files.push_back(fastafile); 
-                               parser.getNameFile(files);
-                       }
+            if (countfile == "") {
+                if ((namefile == "") && (fastafile != "")){
+                    vector<string> files; files.push_back(fastafile); 
+                    parser.getNameFile(files);
+                }
+            }
                 }
  
         }
@@ -183,6 +203,11 @@ int AlignCheckCommand::execute(){
                 readMap();
                 
                 if (namefile != "") { nameMap = m->readNames(namefile); }
+        else if (countfile != "") {
+            CountTable ct;
+            ct.readTable(countfile);
+            nameMap = ct.getNameMap();
+        }
                 
                 if (m->control_pressed) { return 0; }
                 
@@ -216,7 +241,7 @@ int AlignCheckCommand::execute(){
                                 if (haderror == 1) { m->control_pressed = true; break; }
                                 
                                 int num = 1;
-                               if (namefile != "") {
+                               if ((namefile != "") || (countfile != "")) {
                                         //make sure this sequence is in the namefile, else error 
                                         map<string, int>::iterator it = nameMap.find(seq.getName());
                                         
@@ -273,7 +298,7 @@ int AlignCheckCommand::execute(){
                 m->mothurOut("75%-tile:\t" + toString(pound[ptile75]) + "\t" + toString(dash[ptile75]) + "\t" + toString(plus[ptile75]) + "\t" + toString(equal[ptile75]) + "\t" + toString(loop[ptile75]) + "\t" + toString(tilde[ptile75]) + "\t" + toString(total[ptile75])); m->mothurOutEndLine();
                 m->mothurOut("97.5%-tile:\t" + toString(pound[ptile97_5]) + "\t" + toString(dash[ptile97_5]) + "\t" + toString(plus[ptile97_5]) + "\t" + toString(equal[ptile97_5]) + "\t" + toString(loop[ptile97_5]) + "\t" + toString(tilde[ptile97_5]) + "\t" + toString(total[ptile97_5])); m->mothurOutEndLine();
                 m->mothurOut("Maximum:\t" + toString(pound[ptile100]) + "\t" + toString(dash[ptile100]) + "\t" + toString(plus[ptile100]) + "\t" + toString(equal[ptile100]) + "\t" + toString(loop[ptile100]) + "\t" + toString(tilde[ptile100]) + "\t" + toString(total[ptile100])); m->mothurOutEndLine();
-               if (namefile == "") {  m->mothurOut("# of Seqs:\t" + toString(count)); m->mothurOutEndLine(); }
+               if ((namefile == "") && (countfile == "")) {  m->mothurOut("# of Seqs:\t" + toString(count)); m->mothurOutEndLine(); }
                 else { m->mothurOut("# of unique seqs:\t" + toString(count)); m->mothurOutEndLine(); m->mothurOut("total # of seqs:\t" + toString(size)); m->mothurOutEndLine(); }
                 
                 
diff --git a/secondarystructurecommand.h b/secondarystructurecommand.h

index 110f019f3beeafdd2a877c423b250232a488e62c..becafc5bb3f29073d0e72859e495f0364c351dfb 100644 (file)
--- a/secondarystructurecommand.h
+++ b/secondarystructurecommand.h
@@ -50,7 +50,7 @@ class AlignCheckCommand : public Command {
         
         private:
                 vector<int> structMap;
-               string mapfile, fastafile, outputDir, namefile;
+               string mapfile, fastafile, outputDir, namefile, countfile;
                 bool abort;
                 int seqLength, haderror;
                 vector<string> outputNames;
diff --git a/sensspeccommand.cpp b/sensspeccommand.cpp

index cfa1f5b6dc0649925ff4762abdef070e93fd7688..b62bb00548f3e1ee66f0c1cf2b4bee117c5416cf 100644 (file)
--- a/sensspeccommand.cpp
+++ b/sensspeccommand.cpp
@@ -14,7 +14,6 @@ vector<string> SensSpecCommand::setParameters(){
         try {
                 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pphylip);
-               //CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname);
                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pcolumn);
                 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
                 CommandParameter pcutoff("cutoff", "Number", "", "-1.00", "", "", "",false,false); parameters.push_back(pcutoff);
@@ -136,16 +135,7 @@ SensSpecCommand::SensSpecCommand(string option)  {
                                         path = m->hasPath(it->second);
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
-                               }
-                               
-                               //it = parameters.find("name");
-                               //user has given a template file
-                               //if(it != parameters.end()){ 
-                                       //path = m->hasPath(it->second);
-                                       //if the user has not given a path then, add inputdir. else leave path alone.
-                                       //if (path == "") {     parameters["name"] = inputDir + it->second;             }
-                               //}
-                               
+                               }                               
                         }
                         //check for required parameters
                         listFile = validParameter.validFile(parameters, "list", true);
@@ -196,12 +186,6 @@ SensSpecCommand::SensSpecCommand(string option)  {
                         else if(!m->isTrue(temp))       {       hard = 0;       }
                         else if(m->isTrue(temp))        {       hard = 1;       }
                         
-//                     temp = validParameter.validFile(parameters, "name", true);
-//                     if (temp == "not found")        {       nameFile = "";          }
-//                     else if(temp == "not open")     {       abort = true;           }
-//                     else                                            {       nameFile = temp;        }
-//                     cout << "name:\t" << nameFile << endl;
-
                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found") { temp = "-1.00"; }
                         m->mothurConvert(temp, cutoff);  
  //                     cout << cutoff << endl;
diff --git a/seqsummarycommand.cpp b/seqsummarycommand.cpp

index c328a041267d01e21ed2e99780f18de486571685..a9bb5737eac832a203969460f7f4e2759849a975 100644 (file)
--- a/seqsummarycommand.cpp
+++ b/seqsummarycommand.cpp
@@ -8,13 +8,14 @@
   */
  
  #include "seqsummarycommand.h"
-
+#include "counttable.h"
  
  //**********************************************************************************************************************
  vector<string> SeqSummaryCommand::setParameters(){     
         try {
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
+               CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount);
                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
@@ -33,8 +34,9 @@ string SeqSummaryCommand::getHelpString(){
         try {
                 string helpString = "";
                 helpString += "The summary.seqs command reads a fastafile and summarizes the sequences.\n";
-               helpString += "The summary.seqs command parameters are fasta, name and processors, fasta is required, unless you have a valid current fasta file.\n";
+               helpString += "The summary.seqs command parameters are fasta, name, count and processors, fasta is required, unless you have a valid current fasta file.\n";
                 helpString += "The name parameter allows you to enter a name file associated with your fasta file. \n";
+        helpString += "The count parameter allows you to enter a count file associated with your fasta file. \n";
                 helpString += "The summary.seqs command should be in the following format: \n";
                 helpString += "summary.seqs(fasta=yourFastaFile, processors=2) \n";
                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";       
@@ -123,6 +125,14 @@ SeqSummaryCommand::SeqSummaryCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
                         
                         //initialize outputTypes
@@ -142,6 +152,13 @@ SeqSummaryCommand::SeqSummaryCommand(string option)  {
                         if (namefile == "not open") { namefile = ""; abort = true; }
                         else if (namefile == "not found") { namefile = "";  }   
                         else { m->setNameFile(namefile); }
+            
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not open") { abort = true; countfile = ""; }  
+                       else if (countfile == "not found") { countfile = ""; }
+                       else { m->setCountTableFile(countfile); }
+                       
+            if ((countfile != "") && (namefile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
                         
                         //if the user changes the output directory command factory will send this info to us in the output parameter 
                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
@@ -153,11 +170,12 @@ SeqSummaryCommand::SeqSummaryCommand(string option)  {
                         m->setProcessors(temp);
                         m->mothurConvert(temp, processors);
                         
-                       if (namefile == "") {
-                               vector<string> files; files.push_back(fastafile);
-                               parser.getNameFile(files);
-                       }
-                       
+            if (countfile == "") {
+                if (namefile == "") {
+                    vector<string> files; files.push_back(fastafile);
+                    parser.getNameFile(files);
+                }
+            }
                 }
         }
         catch(exception& e) {
@@ -186,6 +204,11 @@ int SeqSummaryCommand::execute(){
                 vector<int> longHomoPolymer;
                 
                 if (namefile != "") { nameMap = m->readNames(namefile); }
+        else if (countfile != "") {
+            CountTable ct;
+            ct.readTable(countfile);
+            nameMap = ct.getNameMap();
+        }
                 
                 if (m->control_pressed) { return 0; }
                         
@@ -344,7 +367,7 @@ int SeqSummaryCommand::execute(){
                 int size = startPosition.size();
                 
                 //find means
-               float meanStartPosition, meanEndPosition, meanSeqLength, meanAmbigBases, meanLongHomoPolymer;
+               double meanStartPosition, meanEndPosition, meanSeqLength, meanAmbigBases, meanLongHomoPolymer;
                 meanStartPosition = 0; meanEndPosition = 0; meanSeqLength = 0; meanAmbigBases = 0; meanLongHomoPolymer = 0;
                 for (int i = 0; i < size; i++) {
                         meanStartPosition += startPosition[i];
@@ -353,6 +376,7 @@ int SeqSummaryCommand::execute(){
                         meanAmbigBases += ambigBases[i];
                         meanLongHomoPolymer += longHomoPolymer[i];
                 }
+                
                 //this is an int divide so the remainder is lost
                 meanStartPosition /= (float) size; meanEndPosition /= (float) size; meanLongHomoPolymer /= (float) size; meanSeqLength /= (float) size; meanAmbigBases /= (float) size;
                                 
@@ -380,7 +404,7 @@ int SeqSummaryCommand::execute(){
                 m->mothurOut("Maximum:\t" + toString(startPosition[ptile100]) + "\t" + toString(endPosition[ptile100]) + "\t" + toString(seqLength[ptile100]) + "\t" + toString(ambigBases[ptile100]) + "\t" + toString(longHomoPolymer[ptile100]) + "\t" + toString(ptile100+1)); m->mothurOutEndLine();
                 m->mothurOut("Mean:\t" + toString(meanStartPosition) + "\t" + toString(meanEndPosition) + "\t" + toString(meanSeqLength) + "\t" + toString(meanAmbigBases) + "\t" + toString(meanLongHomoPolymer)); m->mothurOutEndLine();
  
-               if (namefile == "") {  m->mothurOut("# of Seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); }
+               if ((namefile == "") && (countfile == "")) {  m->mothurOut("# of Seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); }
                 else { m->mothurOut("# of unique seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); m->mothurOut("total # of seqs:\t" + toString(startPosition.size())); m->mothurOutEndLine(); }
                 
                 if (m->control_pressed) {  m->mothurRemove(summaryFile); return 0; }
@@ -420,7 +444,7 @@ int SeqSummaryCommand::driverCreateSummary(vector<int>& startPosition, vector<in
  
                 bool done = false;
                 int count = 0;
-       
+       
                 while (!done) {
                                 
                         if (m->control_pressed) { in.close(); outSummary.close(); return 1; }
@@ -430,11 +454,11 @@ int SeqSummaryCommand::driverCreateSummary(vector<int>& startPosition, vector<in
                         if (current.getName() != "") {
                                 
                                 int num = 1;
-                               if (namefile != "") {
+                               if ((namefile != "") || (countfile != "")) {
                                         //make sure this sequence is in the namefile, else error 
                                         map<string, int>::iterator it = nameMap.find(current.getName());
                                         
-                                       if (it == nameMap.end()) { m->mothurOut("[ERROR]: '" + current.getName() + "' is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
+                                       if (it == nameMap.end()) { m->mothurOut("[ERROR]: '" + current.getName() + "' is not in your name or count file, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
                                         else { num = it->second; }
                                 }
                                 
@@ -505,11 +529,11 @@ int SeqSummaryCommand::MPICreateSummary(int start, int num, vector<int>& startPo
                         if (current.getName() != "") {
                                 
                                 int num = 1;
-                               if (namefile != "") {
+                               if ((namefile != "") || (countfile != "")) {
                                         //make sure this sequence is in the namefile, else error 
                                         map<string, int>::iterator it = nameMap.find(current.getName());
                                         
-                                       if (it == nameMap.end()) { cout << "[ERROR]: " << current.getName() << " is not in your namefile, please correct." << endl; m->control_pressed = true; }
+                                       if (it == nameMap.end()) { cout << "[ERROR]: " << current.getName() << " is not in your name or count file, please correct." << endl; m->control_pressed = true; }
                                         else { num = it->second; }
                                 }
                                 
@@ -626,14 +650,17 @@ int SeqSummaryCommand::createProcessesCreateSummary(vector<int>& startPosition,
                 vector<seqSumData*> pDataArray; 
                 DWORD   dwThreadIdArray[processors-1];
                 HANDLE  hThreadArray[processors-1]; 
-               
+        
+               bool hasNameMap = false;
+        if ((namefile !="") || (countfile != "")) { hasNameMap = true; }
+        
                 //Create processor worker threads.
                 for( int i=0; i<processors-1; i++ ){
              
              string extension = "";
              if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); }
                         // Allocate memory for thread data.
-                       seqSumData* tempSum = new seqSumData(filename, (sumFile+extension), m, lines[i]->start, lines[i]->end, namefile, nameMap);
+                       seqSumData* tempSum = new seqSumData(filename, (sumFile+extension), m, lines[i]->start, lines[i]->end, hasNameMap, nameMap);
                         pDataArray.push_back(tempSum);
                         
                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
diff --git a/seqsummarycommand.h b/seqsummarycommand.h

index 79e8be96974141da315d1038a51096695212ad5c..3926e25f3c2ebb484b357464d1514dfc957c4000 100644 (file)
--- a/seqsummarycommand.h
+++ b/seqsummarycommand.h
@@ -34,7 +34,7 @@ public:
         void help() { m->mothurOut(getHelpString()); }          
  private:
         bool abort;
-       string fastafile, outputDir, namefile;
+       string fastafile, outputDir, namefile, countfile;
         int processors;
         vector<string> outputNames;
         map<string, int> nameMap;
@@ -74,18 +74,18 @@ struct seqSumData {
         unsigned long long end;
         int count;
         MothurOut* m;
-       string namefile;
+       bool hasNameMap;
         map<string, int> nameMap;
         
         
         seqSumData(){}
-       seqSumData(string f, string sf, MothurOut* mout, unsigned long long st, unsigned long long en, string na, map<string, int> nam) {
+       seqSumData(string f, string sf, MothurOut* mout, unsigned long long st, unsigned long long en, bool na, map<string, int> nam) {
                 filename = f;
                 sumFile = sf;
                 m = mout;
                 start = st;
                 end = en;
-               namefile = na;
+               hasNameMap = na;
                 nameMap = nam;
                 count = 0;
         }
@@ -123,11 +123,11 @@ static DWORD WINAPI MySeqSumThreadFunction(LPVOID lpParam){
                         if (current.getName() != "") {
                                 
                                 int num = 1;
-                               if (pDataArray->namefile != "") {
+                               if (pDataArray->hasNameMap){
                                         //make sure this sequence is in the namefile, else error 
                                         map<string, int>::iterator it = pDataArray->nameMap.find(current.getName());
                                         
-                                       if (it == pDataArray->nameMap.end()) { pDataArray->m->mothurOut("[ERROR]: " + current.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); pDataArray->m->control_pressed = true; }
+                                       if (it == pDataArray->nameMap.end()) { pDataArray->m->mothurOut("[ERROR]: " + current.getName() + " is not in your name or count file, please correct."); pDataArray->m->mothurOutEndLine(); pDataArray->m->control_pressed = true; }
                                         else { num = it->second; }
                                 }
                                 
diff --git a/sequencecountparser.cpp b/sequencecountparser.cpp

new file mode 100644 (file)

index 0000000..1300c0f
--- /dev/null
+++ b/sequencecountparser.cpp
@@ -0,0 +1,289 @@
+//
+//  sequencecountparser.cpp
+//  Mothur
+//
+//  Created by Sarah Westcott on 8/7/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "sequencecountparser.h"
+
+/************************************************************/
+SequenceCountParser::SequenceCountParser(string countfile, string fastafile) {
+       try {
+               
+               m = MothurOut::getInstance();
+               
+               //read count file
+               CountTable countTable;
+               countTable.readTable(countfile);
+               
+               //initialize maps
+               namesOfGroups = countTable.getNamesOfGroups();
+               for (int i = 0; i < namesOfGroups.size(); i++) {
+                       vector<Sequence> temp;
+                       map<string, int> tempMap;
+                       seqs[namesOfGroups[i]] = temp;
+                       countTablePerGroup[namesOfGroups[i]] = tempMap;
+               }
+               
+               //read fasta file making sure each sequence is in the group file
+               ifstream in;
+               m->openInputFile(fastafile, in);
+               
+        int fastaCount = 0;
+               while (!in.eof()) {
+                       
+                       if (m->control_pressed) { break; }
+                       
+                       Sequence seq(in); m->gobble(in);
+            fastaCount++;
+            if (m->debug) { if((fastaCount) % 1000 == 0){      m->mothurOut("[DEBUG]: reading seq " + toString(fastaCount) + "\n.");   } }
+                       
+            if (seq.getName() != "") {
+                               
+                allSeqsMap[seq.getName()] = seq.getName();
+                vector<int> groupCounts = countTable.getGroupCounts(seq.getName());
+                
+                for (int i = 0; i < namesOfGroups.size(); i++) {
+                    if (groupCounts[i] != 0) {
+                        seqs[namesOfGroups[i]].push_back(seq); 
+                        countTablePerGroup[namesOfGroups[i]][seq.getName()] = groupCounts[i];
+                    }
+                }
+                       }
+               }
+               in.close();                                     
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SequenceCountParser", "SequenceCountParser");
+               exit(1);
+       }
+}
+/************************************************************/
+SequenceCountParser::SequenceCountParser(string fastafile, CountTable& countTable) {
+       try {
+               
+               m = MothurOut::getInstance();
+                               
+               //initialize maps
+        if (countTable.hasGroupInfo()) {
+            namesOfGroups = countTable.getNamesOfGroups();
+            for (int i = 0; i < namesOfGroups.size(); i++) {
+                vector<Sequence> temp;
+                map<string, int> tempMap;
+                seqs[namesOfGroups[i]] = temp;
+                countTablePerGroup[namesOfGroups[i]] = tempMap;
+            }
+            
+            //read fasta file making sure each sequence is in the group file
+            ifstream in;
+            m->openInputFile(fastafile, in);
+            
+            int fastaCount = 0;
+            while (!in.eof()) {
+                
+                if (m->control_pressed) { break; }
+                
+                Sequence seq(in); m->gobble(in);
+                fastaCount++;
+                if (m->debug) { if((fastaCount) % 1000 == 0){  m->mothurOut("[DEBUG]: reading seq " + toString(fastaCount) + "\n.");   } }
+                
+                if (seq.getName() != "") {
+                    
+                    allSeqsMap[seq.getName()] = seq.getName();
+                    vector<int> groupCounts = countTable.getGroupCounts(seq.getName());
+                    
+                    for (int i = 0; i < namesOfGroups.size(); i++) {
+                        if (groupCounts[i] != 0) {
+                            seqs[namesOfGroups[i]].push_back(seq);     
+                            countTablePerGroup[namesOfGroups[i]][seq.getName()] = groupCounts[i];
+                        }
+                    }
+                }
+            }
+            in.close();        
+        }else {  m->control_pressed = true;  m->mothurOut("[ERROR]: cannot parse fasta file by group with a count table that does not include group data, please correct.\n"); }
+        
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SequenceCountParser", "SequenceCountParser");
+               exit(1);
+       }
+}
+/************************************************************/
+SequenceCountParser::~SequenceCountParser(){  } //empty body: no explicit cleanup is performed here
+/************************************************************/
+int SequenceCountParser::getNumGroups(){ return namesOfGroups.size(); } //number of group names taken from the count table by the constructor
+/************************************************************/
+vector<string> SequenceCountParser::getNamesOfGroups(){ return namesOfGroups; } //returns a copy of the stored group names
+/************************************************************/
+int SequenceCountParser::getNumSeqs(string g){ //returns how many sequences are stored for group g
+       try {
+               map<string, vector<Sequence> >::iterator it;
+               int num = 0;
+               //an unknown group is reported through mothurOut and leaves num at 0
+               it = seqs.find(g);
+               if(it == seqs.end()) {
+                       m->mothurOut("[ERROR]: " + g + " is not a valid group, please correct."); m->mothurOutEndLine();
+               }else {
+                       num = (it->second).size();
+               }
+               
+               return num; 
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SequenceCountParser", "getNumSeqs");
+               exit(1);
+       }
+}
+/************************************************************/
+vector<Sequence> SequenceCountParser::getSeqs(string g){ //returns a copy of the sequences stored for group g
+       try {
+               map<string, vector<Sequence> >::iterator it;
+               vector<Sequence> seqForThisGroup;
+               //an unknown group is reported through mothurOut and an empty vector is returned
+               it = seqs.find(g);
+               if(it == seqs.end()) {
+                       m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
+               }else {
+                       seqForThisGroup = it->second;
+            if (m->debug) {  m->mothurOut("[DEBUG]: group " + g + " fasta file has " + toString(seqForThisGroup.size()) + " sequences.");  }
+               }
+               
+               return seqForThisGroup; 
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SequenceCountParser", "getSeqs");
+               exit(1);
+       }
+}
+/************************************************************/
+int SequenceCountParser::getSeqs(string g, string filename, bool uchimeFormat=false){ //writes group g's sequences to filename; the default for uchimeFormat is declared only on this definition
+       try {
+               map<string, vector<Sequence> >::iterator it;
+               vector<Sequence> seqForThisGroup;
+               vector<seqPriorityNode> nameVector;
+               //returns 1 when writing is abandoned (the output file is removed first), 0 otherwise - including when g is unknown, which is only logged
+               it = seqs.find(g);
+               if(it == seqs.end()) {
+                       m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
+               }else {
+                       
+                       ofstream out;
+                       m->openOutputFile(filename, out);
+                       
+                       seqForThisGroup = it->second;
+                       
+                       if (uchimeFormat) {
+                               // format should look like 
+                               //>seqName/ab=numRedundantSeqs/
+                               //sequence
+                               
+                               map<string, int> countForThisGroup = getCountTable(g);
+                               map<string, int>::iterator itCount;
+                               int error = 0;
+                               //pair every sequence with its count for this group; a sequence without a count is an error
+                               for (int i = 0; i < seqForThisGroup.size(); i++) {
+                                       itCount = countForThisGroup.find(seqForThisGroup[i].getName());
+                                       
+                                       if (itCount == countForThisGroup.end()){
+                                               error = 1;
+                                               m->mothurOut("[ERROR]: " + seqForThisGroup[i].getName() + " is in your fastafile, but is not in your count file, please correct."); m->mothurOutEndLine();
+                                       }else {
+                        seqPriorityNode temp(itCount->second, seqForThisGroup[i].getAligned(), seqForThisGroup[i].getName());
+                                               nameVector.push_back(temp);
+                                       }
+                               }
+                               //all missing names are reported before giving up
+                               if (error == 1) { out.close(); m->mothurRemove(filename); return 1; }
+                               
+                               //sort by num represented (ordering is defined by compareSeqPriorityNodes, declared elsewhere)
+                               sort(nameVector.begin(), nameVector.end(), compareSeqPriorityNodes);
+                
+                               //print the sequences to the new file in sorted order
+                               for (int i = 0; i < nameVector.size(); i++) {
+                                       
+                                       if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
+                                       
+                                       out << ">" << nameVector[i].name << "/ab=" << nameVector[i].numIdentical << "/" << endl << nameVector[i].seq << endl; //
+                               }
+                               
+                       }else { 
+                //m->mothurOut("Group " + g +  " contains " + toString(seqForThisGroup.size()) + " unique seqs.\n");
+                               for (int i = 0; i < seqForThisGroup.size(); i++) {
+                                       
+                                       if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
+                                       //plain output: each sequence prints itself in stored order
+                                       seqForThisGroup[i].printSequence(out);  
+                               }
+                       }
+                       out.close();
+               }
+               
+               return 0; 
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SequenceCountParser", "getSeqs");
+               exit(1);
+       }
+}
+
+/************************************************************/
+map<string, int> SequenceCountParser::getCountTable(string g){ //returns a copy of the seqName -> count map stored for group g
+       try {
+               map<string, map<string, int> >::iterator it;
+               map<string, int> countForThisGroup;
+               //an unknown group is reported through mothurOut and an empty map is returned
+               it = countTablePerGroup.find(g);
+               if(it == countTablePerGroup.end()) {
+                       m->mothurOut("[ERROR]: No countTable available for group " + g + ", please correct."); m->mothurOutEndLine();
+               }else {
+                       countForThisGroup = it->second;
+            if (m->debug) {  m->mothurOut("[DEBUG]: group " + g + " count file has " + toString(countForThisGroup.size()) + " unique sequences.");  }
+               }
+               
+               return countForThisGroup; 
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SequenceCountParser", "getCountTable");
+               exit(1);
+       }
+}
+/************************************************************/
+int SequenceCountParser::getCountTable(string g, string filename){ 
+       try {
+               map<string, map<string, int> >::iterator it;
+               map<string, int> countForThisGroup;
+               
+               it = countTablePerGroup.find(g);
+               if(it == countTablePerGroup.end()) {
+                       m->mothurOut("[ERROR]: No countTable available for group " + g + ", please correct."); m->mothurOutEndLine();
+               }else {
+                       countForThisGroup = it->second;
+                       
+                       ofstream out;
+                       m->openOutputFile(filename, out);
+            out << "Representative_Sequence\ttotal\t" << g << endl;
+            
+                       for (map<string, int>::iterator itFile = countForThisGroup.begin(); itFile != countForThisGroup.end(); itFile++) {
+                               
+                               if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
+                               
+                               out << itFile->first << '\t' << itFile->second << '\t' << itFile->second << endl;
+                       }
+                       
+                       out.close();
+               }
+               
+               return 0; 
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SequenceParser", "getCountTable");
+               exit(1);
+       }
+}
+/************************************************************/
+
+
+
diff --git a/sequencecountparser.h b/sequencecountparser.h

new file mode 100644 (file)

index 0000000..4889ea6
--- /dev/null
+++ b/sequencecountparser.h
@@ -0,0 +1,59 @@
+#ifndef Mothur_sequencecountparser_h
+#define Mothur_sequencecountparser_h
+
+//
+//  sequencecountparser.h
+//  Mothur
+//
+//  Created by Sarah Westcott on 8/7/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "mothur.h"
+#include "mothurout.h"
+#include "sequence.hpp"
+#include "counttable.h"
+
+/* This class reads a fasta and count file and parses the data by group. The countfile must contain group information.
+ 
+ Note: The sum of all the groups unique sequences will be larger than the original number of unique sequences. 
+ This is because when we parse the count file we make a unique for each group instead of 1 unique for all
+ groups. 
+ 
+ */
+
+class SequenceCountParser {
+       
+public:
+       
+    SequenceCountParser(string, string);                       //count, fasta - file mismatches will set m->control_pressed = true
+    SequenceCountParser(string, CountTable&);          //fasta, counttable - sets m->control_pressed = true when the table has no group info
+    ~SequenceCountParser();
+    
+    //general operations
+    int getNumGroups();
+    vector<string> getNamesOfGroups(); 
+    
+    int getNumSeqs(string);            //returns the number of unique sequences in a specific group
+    vector<Sequence> getSeqs(string); //returns unique sequences in a specific group
+    map<string, int> getCountTable(string); //returns seqName -> numberOfRedundantSeqs for a specific group - the count file format, but each line is parsed by group.
+    
+    int getSeqs(string, string, bool); //prints unique sequences in a specific group to a file - group, filename, uchimeFormat (the =false default exists only on the definition in the .cpp)
+    int getCountTable(string, string); //print seqName -> numberRedundantSeqs for a specific group - group, filename
+    
+    map<string, string> getAllSeqsMap(){ return allSeqsMap; }  //returns map where the key=sequenceName and the value=representativeSequence (the constructors map every name to itself) - helps us remove duplicates after group by group processing
+private:
+       
+    CountTable countTable; //NOTE(review): the constructors use a local/parameter of the same name; this member is not referenced in sequencecountparser.cpp
+    MothurOut* m;
+       
+    int numSeqs; //NOTE(review): never assigned or read in sequencecountparser.cpp
+    map<string, string> allSeqsMap;
+    map<string, vector<Sequence> > seqs; //a vector for each group
+    map<string, map<string, int> > countTablePerGroup; //countTable for each group
+    vector<string> namesOfGroups;
+};
+
+
+
+#endif
diff --git a/sequenceparser.cpp b/sequenceparser.cpp

index 08e5ae8b859280da965566ea9bf4d6dce3b202c7..37891eb44a0b42c2b9f0879dfd084da1893c289b 100644 (file)
--- a/sequenceparser.cpp
+++ b/sequenceparser.cpp
@@ -59,7 +59,7 @@ SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFi
                 in.close();
                                  
                 if (error == 1) { m->control_pressed = true; }
-                                
+               
                 //read name file
                 ifstream inName;
                 m->openInputFile(nameFile, inName);
@@ -148,6 +148,78 @@ SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFi
              }
                 }
                 inName.close();
+        
+        //in case file does not end in white space
+        if (rest != "") {
+            vector<string> pieces = m->splitWhiteSpace(rest);
+            
+            for (int i = 0; i < pieces.size(); i++) {
+                if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
+                else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
+                
+                if (pairDone) { //save one line
+                    if (m->debug) { m->mothurOut("[DEBUG]: reading names: " + firstCol + '\t' + secondCol + ".\n"); }
+                    vector<string> names;
+                    m->splitAtChar(secondCol, names, ',');
+                    
+                    //get aligned string for these seqs from the fasta file
+                    string alignedString = "";
+                    map<string, string>::iterator itAligned = seqName.find(names[0]);
+                    if (itAligned == seqName.end()) {
+                        error = 1; m->mothurOut("[ERROR]: " + names[0] + " is in your name file and not in your fasta file, please correct."); m->mothurOutEndLine();
+                    }else {
+                        alignedString = itAligned->second;
+                    }
+                    
+                    //separate by group - parse one line in name file
+                    map<string, string> splitMap; //group -> name1,name2,...
+                    map<string, string>::iterator it;
+                    for (int i = 0; i < names.size(); i++) {
+                        
+                        string group = groupMap->getGroup(names[i]);
+                        if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + names[i] + " is in your name file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
+                        else { 
+                            
+                            it = splitMap.find(group);
+                            if (it != splitMap.end()) { //adding seqs to this group
+                                (it->second) += "," + names[i];
+                                thisnames1.insert(names[i]);
+                                countName++;
+                            }else { //first sighting of this group
+                                splitMap[group] = names[i];
+                                countName++;
+                                thisnames1.insert(names[i]);
+                                
+                                //is this seq in the fasta file?
+                                if (i != 0) { //if not then we need to add a duplicate sequence to the seqs for this group so the new "fasta" and "name" files will match
+                                    Sequence tempSeq(names[i], alignedString); //get the first guys sequence string since he's in the fasta file.
+                                    seqs[group].push_back(tempSeq);
+                                }
+                            }
+                        }
+                        
+                        allSeqsMap[names[i]] = names[0];
+                    }
+                    
+                    
+                    //fill nameMapPerGroup - holds all lines in namefile separated by group
+                    for (it = splitMap.begin(); it != splitMap.end(); it++) {
+                        //grab first name
+                        string firstName = "";
+                        for(int i = 0; i < (it->second).length(); i++) {
+                            if (((it->second)[i]) != ',') {
+                                firstName += ((it->second)[i]);
+                            }else { break; }
+                        }
+                        
+                        //group1 -> seq1 -> seq1,seq2,seq3
+                        nameMapPerGroup[it->first][firstName] = it->second;
+                    }
+                    
+                    pairDone = false; 
+                }
+            }
+        }
                 
                 if (error == 1) { m->control_pressed = true; }
                         
@@ -238,8 +310,6 @@ vector<string> SequenceParser::getNamesOfGroups(){ return groupMap->getNamesOfGr
  /************************************************************/
  bool SequenceParser::isValidGroup(string g){ return groupMap->isValidGroup(g); }
  /************************************************************/
-string SequenceParser::getGroup(string g){ return groupMap->getGroup(g); }
-/************************************************************/
  int SequenceParser::getNumSeqs(string g){ 
         try {
                 map<string, vector<Sequence> >::iterator it;
@@ -330,7 +400,7 @@ int SequenceParser::getSeqs(string g, string filename, bool uchimeFormat=false){
                                         
                                         if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
                                         
-                                       out << ">" << nameVector[i].name  << "/ab=" << nameVector[i].numIdentical << "/" << endl << nameVector[i].seq << endl;
+                                       out << ">" <<  nameVector[i].name << "/ab=" << nameVector[i].numIdentical << "/" << endl << nameVector[i].seq << endl; //
                                 }
                                 
                         }else { 
diff --git a/sequenceparser.h b/sequenceparser.h

index 23fcb9ecc983e1cf59ef1fde1f976da80773a152..98438f648b6f61d4c89192f7333aedfccfeced36 100644 (file)
--- a/sequenceparser.h
+++ b/sequenceparser.h
@@ -36,7 +36,6 @@ class SequenceParser {
                 int getNumGroups();
                 vector<string> getNamesOfGroups();      
                 bool isValidGroup(string);  //return true if string is a valid group
-               string getGroup(string);        //returns group of a specific sequence
                 
                 int getNumSeqs(string);         //returns the number of unique sequences in a specific group
                 vector<Sequence> getSeqs(string); //returns unique sequences in a specific group
diff --git a/sffinfocommand.cpp b/sffinfocommand.cpp

index 08cf21e5d6b543684cfebe56c0cdaf8697139125..c50255aeb2ae9c202074b78d89d460651cefdc43 100644 (file)
--- a/sffinfocommand.cpp
+++ b/sffinfocommand.cpp
@@ -9,17 +9,26 @@
  
  #include "sffinfocommand.h"
  #include "endiannessmacros.h"
+#include "trimoligos.h"
+#include "sequence.hpp"
+#include "qualityscores.h"
  
  //**********************************************************************************************************************
  vector<string> SffInfoCommand::setParameters(){        
         try {           
                 CommandParameter psff("sff", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(psff);
+        CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
                 CommandParameter paccnos("accnos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(paccnos);
                 CommandParameter psfftxt("sfftxt", "String", "", "", "", "", "",false,false); parameters.push_back(psfftxt);
                 CommandParameter pflow("flow", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pflow);
                 CommandParameter ptrim("trim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(ptrim);
                 CommandParameter pfasta("fasta", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pfasta);
                 CommandParameter pqfile("name", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqfile);
+        CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
+               CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
+        CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs);
+               CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs);
+        CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
                 
@@ -37,10 +46,16 @@ string SffInfoCommand::getHelpString(){
         try {
                 string helpString = "";
                 helpString += "The sffinfo command reads a sff file and extracts the sequence data, or you can use it to parse a sfftxt file.\n";
-               helpString += "The sffinfo command parameters are sff, fasta, qfile, accnos, flow, sfftxt, and trim. sff is required. \n";
+               helpString += "The sffinfo command parameters are sff, fasta, qfile, accnos, flow, sfftxt, oligos, bdiffs, tdiffs, ldiffs, sdiffs, pdiffs and trim. sff is required. \n";
                 helpString += "The sff parameter allows you to enter the sff file you would like to extract data from.  You may enter multiple files by separating them by -'s.\n";
                 helpString += "The fasta parameter allows you to indicate if you would like a fasta formatted file generated.  Default=True. \n";
                 helpString += "The qfile parameter allows you to indicate if you would like a quality file generated.  Default=True. \n";
+        helpString += "The oligos parameter allows you to provide an oligos file to split your sff file into separate sff files by barcode. \n";
+        helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n";
+               helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
+               helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
+        helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n";
+               helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n";
                 helpString += "The flow parameter allows you to indicate if you would like a flowgram file generated.  Default=True. \n";
                 helpString += "The sfftxt parameter allows you to indicate if you would like a sff.txt file generated.  Default=False. \n";
                 helpString += "If you want to parse an existing sfftxt file into flow, fasta and quality file, enter the file name using the sfftxt parameter. \n";
@@ -68,6 +83,7 @@ string SffInfoCommand::getOutputFileNameTag(string type, string inputName=""){
              if (type == "fasta")            {   outputFileName =  "fasta";   }
              else if (type == "flow")    {   outputFileName =  "flow";   }
              else if (type == "sfftxt")        {   outputFileName =  "sff.txt";   }
+            else if (type == "sff")        {   outputFileName =  "sff";   }
              else if (type == "qfile")       {   outputFileName =  "qual";   }
               else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
          }
@@ -90,6 +106,7 @@ SffInfoCommand::SffInfoCommand(){
                 outputTypes["flow"] = tempOutNames;
                 outputTypes["sfftxt"] = tempOutNames;
                 outputTypes["qfile"] = tempOutNames;
+        outputTypes["sff"] = tempOutNames;
         }
         catch(exception& e) {
                 m->errorOut(e, "SffInfoCommand", "SffInfoCommand");
@@ -101,7 +118,8 @@ SffInfoCommand::SffInfoCommand(){
  SffInfoCommand::SffInfoCommand(string option)  {
         try {
                 abort = false; calledHelp = false;   
-               hasAccnos = false;
+               hasAccnos = false; hasOligos = false;
+        split = 1;
                 
                 //allow user to run help
                 if(option == "help") { help(); abort = true; calledHelp = true; }
@@ -126,6 +144,7 @@ SffInfoCommand::SffInfoCommand(string option)  {
                         outputTypes["flow"] = tempOutNames;
                         outputTypes["sfftxt"] = tempOutNames;
                         outputTypes["qfile"] = tempOutNames;
+            outputTypes["sff"] = tempOutNames;
                         
                         //if the user changes the output directory command factory will send this info to us in the output parameter 
                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
@@ -268,7 +287,80 @@ SffInfoCommand::SffInfoCommand(string option)  {
                                 //make sure there is at least one valid file left
                                 if (accnosFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; }
                         }
-                       
+            
+            oligosfile = validParameter.validFile(parameters, "oligos", false);
+                       if (oligosfile == "not found") { oligosfile = "";  }
+                       else { 
+                               hasOligos = true;
+                               m->splitAtDash(oligosfile, oligosFileNames);
+                               
+                               //go through files and make sure they are good, if not, then disregard them
+                               for (int i = 0; i < oligosFileNames.size(); i++) {
+                                       bool ignore = false;
+                                       if (oligosFileNames[i] == "current") { 
+                                               oligosFileNames[i] = m->getOligosFile(); 
+                                               if (oligosFileNames[i] != "") {  m->mothurOut("Using " + oligosFileNames[i] + " as input file for the accnos parameter where you had given current."); m->mothurOutEndLine(); }
+                                               else {  
+                                                       m->mothurOut("You have no current oligosfile, ignoring current."); m->mothurOutEndLine(); ignore=true; 
+                                                       //erase from file list
+                                                       oligosFileNames.erase(oligosFileNames.begin()+i);
+                                                       i--;
+                                               }
+                                       }
+                                       
+                                       if (!ignore) {
+                        
+                                               if (inputDir != "") {
+                                                       string path = m->hasPath(oligosFileNames[i]);
+                                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                                       if (path == "") {       oligosFileNames[i] = inputDir + oligosFileNames[i];             }
+                                               }
+                        
+                                               ifstream in;
+                                               int ableToOpen = m->openInputFile(oligosFileNames[i], in, "noerror");
+                        
+                                               //if you can't open it, try default location
+                                               if (ableToOpen == 1) {
+                                                       if (m->getDefaultPath() != "") { //default path is set
+                                                               string tryPath = m->getDefaultPath() + m->getSimpleName(oligosFileNames[i]);
+                                                               m->mothurOut("Unable to open " + oligosFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
+                                                               ifstream in2;
+                                                               ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+                                                               in2.close();
+                                                               oligosFileNames[i] = tryPath;
+                                                       }
+                                               }
+                                               //if you still can't open it, try the output directory
+                                               if (ableToOpen == 1) {
+                                                       if (m->getOutputDir() != "") { //default path is set
+                                                               string tryPath = m->getOutputDir() + m->getSimpleName(oligosFileNames[i]);
+                                                               m->mothurOut("Unable to open " + oligosFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
+                                                               ifstream in2;
+                                                               ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+                                                               in2.close();
+                                                               oligosFileNames[i] = tryPath;
+                                                       }
+                                               }
+                                               in.close();
+                                               
+                                               if (ableToOpen == 1) { 
+                                                       m->mothurOut("Unable to open " + oligosFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine();
+                                                       //erase from file list
+                                                       oligosFileNames.erase(oligosFileNames.begin()+i);
+                                                       i--;
+                                               }
+                                       }
+                               }
+                               
+                               //make sure there is at least one valid file left
+                               if (oligosFileNames.size() == 0) { m->mothurOut("no valid oligos files."); m->mothurOutEndLine(); abort = true; }
+                       }
+
+                       if (hasOligos) {
+                split = 2;
+                               if (oligosFileNames.size() != filenames.size()) { abort = true; m->mothurOut("If you provide a oligos file, you must have one for each sff file."); m->mothurOutEndLine(); }
+                       }
+            
                         if (hasAccnos) {
                                 if (accnosFileNames.size() != filenames.size()) { abort = true; m->mothurOut("If you provide a accnos file, you must have one for each sff file."); m->mothurOutEndLine(); }
                         }
@@ -284,7 +376,24 @@ SffInfoCommand::SffInfoCommand(string option)  {
                         
                         temp = validParameter.validFile(parameters, "trim", false);                                     if (temp == "not found"){       temp = "T";                             }
                         trim = m->isTrue(temp); 
+            
+            temp = validParameter.validFile(parameters, "bdiffs", false);              if (temp == "not found") { temp = "0"; }
+                       m->mothurConvert(temp, bdiffs);
+                       
+                       temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found") { temp = "0"; }
+                       m->mothurConvert(temp, pdiffs);
+            
+            temp = validParameter.validFile(parameters, "ldiffs", false);              if (temp == "not found") { temp = "0"; }
+                       m->mothurConvert(temp, ldiffs);
+            
+            temp = validParameter.validFile(parameters, "sdiffs", false);              if (temp == "not found") { temp = "0"; }
+                       m->mothurConvert(temp, sdiffs);
                         
+                       temp = validParameter.validFile(parameters, "tdiffs", false);           if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs;  temp = toString(tempTotal); }
+                       m->mothurConvert(temp, tdiffs);
+                       
+                       if(tdiffs == 0){        tdiffs = bdiffs + pdiffs + ldiffs + sdiffs;     }
+            
                         temp = validParameter.validFile(parameters, "sfftxt", false);                           
                         if (temp == "not found")        {       temp = "F";      sfftxt = false; sfftxtFilename = "";           }
                         else if (m->isTrue(temp))       {       sfftxt = true;          sfftxtFilename = "";                            }
@@ -311,6 +420,8 @@ SffInfoCommand::SffInfoCommand(string option)  {
                                 if (filename != "") { filenames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the sff parameter."); m->mothurOutEndLine(); }
                                 else {  m->mothurOut("[ERROR]: you must provide a valid sff or sfftxt file."); m->mothurOutEndLine(); abort=true;  }
                         }
+            
+            
                 }
         }
         catch(exception& e) {
@@ -334,8 +445,11 @@ int SffInfoCommand::execute(){
                         
                         string accnos = "";
                         if (hasAccnos) { accnos = accnosFileNames[s]; }
+            
+            string oligos = "";
+            if (hasOligos) { oligos = oligosFileNames[s]; }
                         
-                       int numReads = extractSffInfo(filenames[s], accnos);
+                       int numReads = extractSffInfo(filenames[s], accnos, oligos);
  
                         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to extract " + toString(numReads) + ".");
                 }
@@ -375,13 +489,15 @@ int SffInfoCommand::execute(){
         }
  }
  //**********************************************************************************************************************
-int SffInfoCommand::extractSffInfo(string input, string accnos){
+int SffInfoCommand::extractSffInfo(string input, string accnos, string oligos){
         try {
-               
+               currentFileName = input;
                 if (outputDir == "") {  outputDir += m->hasPath(input); }
                 
                 if (accnos != "")       {  readAccnosFile(accnos);  }
                 else                            {       seqNames.clear();               }
+         
+        if (oligos != "")   {   readOligos(oligos);  split = 2;   }
  
                 ofstream outSfftxt, outFasta, outQual, outFlow;
                 string outFastaFileName, outQualFileName;
@@ -424,14 +540,10 @@ int SffInfoCommand::extractSffInfo(string input, string accnos){
                 while (!in.eof()) {
                         
                         bool print = true;
-                       
-                       //read header
-                       Header readheader;
-                       readHeader(in, readheader);
-                       
+                                               
                         //read data
-                       seqRead read; 
-                       readSeqData(in, read, header.numFlowsPerRead, readheader.numBases);
+                       seqRead read;  Header readheader;
+                       readSeqData(in, read, header.numFlowsPerRead, readheader);
              bool okay = sanityCheck(readheader, read);
              if (!okay) { break; }
              
@@ -448,7 +560,7 @@ int SffInfoCommand::extractSffInfo(string input, string accnos){
                         
                         count++;
                         mycount++;
-               
+        
                         //report progress
                         if((count+1) % 10000 == 0){     m->mothurOut(toString(count+1)); m->mothurOutEndLine();         }
                 
@@ -467,6 +579,48 @@ int SffInfoCommand::extractSffInfo(string input, string accnos){
                 if (qual)       {  outQual.close();             }
                 if (flow)       {  outFlow.close();             }
                 
+        if (split > 1) {
+            //create new common headers for each file with the correct number of reads
+            adjustCommonHeader(header);
+            
+                       map<string, string>::iterator it;
+                       set<string> namesToRemove;
+                       for(int i=0;i<filehandles.size();i++){
+                               for(int j=0;j<filehandles[0].size();j++){
+                                       if (filehandles[i][j] != "") {
+                                               if (namesToRemove.count(filehandles[i][j]) == 0) {
+                                                       if(m->isBlank(filehandles[i][j])){
+                                                               m->mothurRemove(filehandles[i][j]);
+                                m->mothurRemove(filehandlesHeaders[i][j]);
+                                                               namesToRemove.insert(filehandles[i][j]);
+                            }
+                                               }
+                                       }
+                               }
+                       }
+            
+            //append new header to reads
+            for (int i = 0; i < filehandles.size(); i++) {
+                for (int j = 0; j < filehandles[i].size(); j++) {
+                    m->appendFiles(filehandles[i][j], filehandlesHeaders[i][j]);
+                    m->renameFile(filehandlesHeaders[i][j], filehandles[i][j]);
+                    m->mothurRemove(filehandlesHeaders[i][j]);
+                    if (numSplitReads[i][j] == 0) { m->mothurRemove(filehandles[i][j]); }
+                }
+            }
+                       
+                       //remove names for outputFileNames, just cleans up the output
+                       for(int i = 0; i < outputNames.size(); i++) { 
+                if (namesToRemove.count(outputNames[i]) != 0) { 
+                    outputNames.erase(outputNames.begin()+i);
+                    i--;
+                } 
+            }
+            
+            if(m->isBlank(noMatchFile)){  m->mothurRemove(noMatchFile); }
+            else { outputNames.push_back(noMatchFile); outputTypes["sff"].push_back(noMatchFile); }
+        }
+        
                 return count;
         }
         catch(exception& e) {
@@ -477,20 +631,20 @@ int SffInfoCommand::extractSffInfo(string input, string accnos){
  //**********************************************************************************************************************
  int SffInfoCommand::readCommonHeader(ifstream& in, CommonHeader& header){
         try {
-
+        
                 if (!in.eof()) {
  
                         //read magic number
                         char buffer[4];
                         in.read(buffer, 4);
                         header.magicNumber = be_int4(*(unsigned int *)(&buffer));
-               
+            
                         //read version
                         char buffer9[4];
                         in.read(buffer9, 4);
                         header.version = "";
-                       for (int i = 0; i < 4; i++) {  header.version += toString((int)(buffer9[i])); }
-                               
+                       for (int i = 0; i < 4; i++) {  header.version += toString((int)(buffer9[i]));  }
+    
                         //read offset
                         char buffer2 [8];
                         in.read(buffer2, 8);
@@ -539,17 +693,18 @@ int SffInfoCommand::readCommonHeader(ifstream& in, CommonHeader& header){
                         header.keySequence = tempBuffer2;
                         if (header.keySequence.length() > header.keyLength) { header.keySequence = header.keySequence.substr(0, header.keyLength);  }
                         delete[] tempBuffer2;
-                               
+                       
                         /* Pad to 8 chars */
                         unsigned long long spotInFile = in.tellg();
                         unsigned long long spot = (spotInFile + 7)& ~7;  // ~ inverts
                         in.seekg(spot);
-                       
-               }else{
+            
+        }else{
                         m->mothurOut("Error reading sff common header."); m->mothurOutEndLine();
                 }
-
+        
                 return 0;
+        
         }
         catch(exception& e) {
                 m->errorOut(e, "SffInfoCommand", "readCommonHeader");
@@ -557,21 +712,207 @@ int SffInfoCommand::readCommonHeader(ifstream& in, CommonHeader& header){
         }
  }
  //**********************************************************************************************************************
-int SffInfoCommand::readHeader(ifstream& in, Header& header){
+int SffInfoCommand::adjustCommonHeader(CommonHeader header){ //header supplies numFlowsPerRead and keyLength, the sizes of the two variable-length fields
         try {
-       
-               if (!in.eof()) {
+        //Re-reads the common header of currentFileName and appends a copy of it to every file named in filehandlesHeaders; every field is copied unchanged except the read count, which is replaced by numSplitReads[i][j]. Always returns 0.
+        char* mybuffer = new char[4];
+        ifstream in;
+        in.open(currentFileName.c_str(), ios::binary);
+        
+        //magic number
+        in.read(mybuffer,4);
+        for (int i = 0; i < filehandlesHeaders.size(); i++) {  
+            for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
+                ofstream out;
+                m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+                out.write(mybuffer, in.gcount()); //gcount(): write only as many bytes as the read above actually extracted
+                out.close();
+            }
+        }
+        delete[] mybuffer;
+        
+        //version
+        mybuffer = new char[4];
+        in.read(mybuffer,4);
+        for (int i = 0; i < filehandlesHeaders.size(); i++) {  
+            for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
+                ofstream out;
+                m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+                out.write(mybuffer, in.gcount()); 
+                out.close();
+            }
+        }
+        delete[] mybuffer;
+        
+        //offset
+        mybuffer = new char[8];
+        in.read(mybuffer,8);
+        for (int i = 0; i < filehandlesHeaders.size(); i++) {  
+            for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
+                ofstream out;
+                m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+                out.write(mybuffer, in.gcount()); 
+                out.close();
+            }
+        }
+        delete[] mybuffer;
+            
                         
-                       //read header length
+        //read index length
+               mybuffer = new char[4];
+        in.read(mybuffer,4);
+        for (int i = 0; i < filehandlesHeaders.size(); i++) {  
+            for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
+                ofstream out;
+                m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+                out.write(mybuffer, in.gcount()); 
+                out.close();
+            }
+        }
+        delete[] mybuffer;
+               
+        //change num reads: consume the original 4-byte count from the input, then write each output file's own count in its place
+        mybuffer = new char[4];
+        in.read(mybuffer,4);
+        delete[] mybuffer;
+        for (int i = 0; i < filehandlesHeaders.size(); i++) {  
+            for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
+                ofstream out;
+                m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+                //convert number of reads to 4 byte char*, most significant byte first
+                char* thisbuffer = new char[4];
+                thisbuffer[0] = (numSplitReads[i][j] >> 24) & 0xFF;
+                thisbuffer[1] = (numSplitReads[i][j] >> 16) & 0xFF;
+                thisbuffer[2] = (numSplitReads[i][j] >> 8) & 0xFF;
+                thisbuffer[3] = numSplitReads[i][j] & 0xFF;
+                out.write(thisbuffer, 4);
+                out.close();
+                delete[] thisbuffer;
+            }
+        }
+            
+        //read header length
+        mybuffer = new char[2];
+        in.read(mybuffer,2);
+        for (int i = 0; i < filehandlesHeaders.size(); i++) {  
+            for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
+                ofstream out;
+                m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+                out.write(mybuffer, in.gcount()); 
+                out.close();
+            }
+        }
+        delete[] mybuffer;
+            
+        //read key length
+        mybuffer = new char[2];
+        in.read(mybuffer,2);
+        for (int i = 0; i < filehandlesHeaders.size(); i++) {  
+            for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
+                ofstream out;
+                m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+                out.write(mybuffer, in.gcount()); 
+                out.close();
+            }
+        }
+        delete[] mybuffer;
+                       
+        //read number of flow reads
+        mybuffer = new char[2];
+        in.read(mybuffer,2);
+        for (int i = 0; i < filehandlesHeaders.size(); i++) {  
+            for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
+                ofstream out;
+                m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+                out.write(mybuffer, in.gcount()); 
+                out.close();
+            }
+        }
+        delete[] mybuffer;
+            
+        //read format code
+        mybuffer = new char[1];
+        in.read(mybuffer,1);
+        for (int i = 0; i < filehandlesHeaders.size(); i++) {  
+            for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
+                ofstream out;
+                m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+                out.write(mybuffer, in.gcount()); 
+                out.close();
+            }
+        }
+        delete[] mybuffer;
+                       
+        //read flow chars
+        mybuffer = new char[header.numFlowsPerRead];
+        in.read(mybuffer,header.numFlowsPerRead);
+        for (int i = 0; i < filehandlesHeaders.size(); i++) {  
+            for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
+                ofstream out;
+                m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+                out.write(mybuffer, in.gcount()); 
+                out.close();
+            }
+        }
+        delete[] mybuffer;
+                       
+        //read key
+        mybuffer = new char[header.keyLength];
+        in.read(mybuffer,header.keyLength);
+        for (int i = 0; i < filehandlesHeaders.size(); i++) {  
+            for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
+                ofstream out;
+                m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+                out.write(mybuffer, in.gcount()); 
+                out.close();
+            }
+        }
+        delete[] mybuffer;
+        
+                       
+        /* Pad to 8 chars */
+        unsigned long long spotInFile = in.tellg();
+        unsigned long long spot = (spotInFile + 7)& ~7;  // ~ inverts
+        in.seekg(spot);
+        //the padding is never read from the input, so zero-fill it: a plain new char[n] would write uninitialized heap bytes into every header
+        mybuffer = new char[spot-spotInFile]();
+        for (int i = 0; i < filehandlesHeaders.size(); i++) { 
+            for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
+                ofstream out;
+                m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+                out.write(mybuffer, spot-spotInFile); 
+                out.close();
+            }
+        }
+        delete[] mybuffer;
+        in.close();
+               return 0;
+        
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SffInfoCommand", "adjustCommonHeader");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+int SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, Header& header){
+       try {
+        unsigned long long startSpotInFile = in.tellg();
+               if (!in.eof()) {
+            
+            /*****************************************/
+            //read header
+            
+            //read header length
                         char buffer [2];
                         in.read(buffer, 2);
                         header.headerLength = be_int2(*(unsigned short *)(&buffer));
-                                               
+            
                         //read name length
                         char buffer2 [2];
                         in.read(buffer2, 2);
                         header.nameLength = be_int2(*(unsigned short *)(&buffer2));
-
+            
                         //read num bases
                         char buffer3 [4];
                         in.read(buffer3, 4);
@@ -592,12 +933,12 @@ int SffInfoCommand::readHeader(ifstream& in, Header& header){
                         char buffer6 [2];
                         in.read(buffer6, 2);
                         header.clipAdapterLeft = be_int2(*(unsigned short *)(&buffer6));
-
+            
                         //read clipAdapterRight
                         char buffer7 [2];
                         in.read(buffer7, 2);
                         header.clipAdapterRight = be_int2(*(unsigned short *)(&buffer7));
-               
+            
                         //read name
                         char* tempBuffer = new char[header.nameLength];
                         in.read(&(*tempBuffer), header.nameLength);
@@ -612,24 +953,10 @@ int SffInfoCommand::readHeader(ifstream& in, Header& header){
                         unsigned long long spotInFile = in.tellg();
                         unsigned long long spot = (spotInFile + 7)& ~7;
                         in.seekg(spot);
-                       
-               }else{
-                       m->mothurOut("Error reading sff header info."); m->mothurOutEndLine();
-               }
  
-               return 0;
-       }
-       catch(exception& e) {
-               m->errorOut(e, "SffInfoCommand", "readHeader");
-               exit(1);
-       }
-}
-//**********************************************************************************************************************
-int SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, int numBases){
-       try {
-       
-               if (!in.eof()) {
-       
+            /*****************************************/
+            //sequence read 
+            
                         //read flowgram
                         read.flowgram.resize(numFlowReads);
                         for (int i = 0; i < numFlowReads; i++) {  
@@ -639,33 +966,62 @@ int SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, i
                         }
              
                         //read flowIndex
-                       read.flowIndex.resize(numBases);
-                       for (int i = 0; i < numBases; i++) {  
+                       read.flowIndex.resize(header.numBases);
+                       for (int i = 0; i < header.numBases; i++) {  
                                 char temp[1];
                                 in.read(temp, 1);
                                 read.flowIndex[i] = be_int1(*(unsigned char *)(&temp));
                         }
         
                         //read bases
-                       char* tempBuffer = new char[numBases];
-                       in.read(&(*tempBuffer), numBases);
-                       read.bases = tempBuffer;
-                       if (read.bases.length() > numBases) { read.bases = read.bases.substr(0, numBases);  }
-                       delete[] tempBuffer;
+                       char* tempBuffer6 = new char[header.numBases];
+                       in.read(&(*tempBuffer6), header.numBases);
+                       read.bases = tempBuffer6;
+                       if (read.bases.length() > header.numBases) { read.bases = read.bases.substr(0, header.numBases);  }
+                       delete[] tempBuffer6;
  
                         //read qual scores
-                       read.qualScores.resize(numBases);
-                       for (int i = 0; i < numBases; i++) {  
+                       read.qualScores.resize(header.numBases);
+                       for (int i = 0; i < header.numBases; i++) {  
                                 char temp[1];
                                 in.read(temp, 1);
                                 read.qualScores[i] = be_int1(*(unsigned char *)(&temp));
                         }
         
                         /* Pad to 8 chars */
-                       unsigned long long spotInFile = in.tellg();
-                       unsigned long long spot = (spotInFile + 7)& ~7;
+                       spotInFile = in.tellg();
+                       spot = (spotInFile + 7)& ~7;
                         in.seekg(spot);
-                       
+            
+            if (split > 1) {
+                char * mybuffer;
+                mybuffer = new char [spot-startSpotInFile];
+                ifstream in2;
+                m->openInputFile(currentFileName, in2);
+                in2.seekg(startSpotInFile);
+                in2.read(mybuffer,spot-startSpotInFile);
+                in2.close();
+                
+                int barcodeIndex, primerIndex;
+                int trashCodeLength = findGroup(header, read, barcodeIndex, primerIndex);
+                                
+                if(trashCodeLength == 0){
+                    ofstream out;
+                    m->openOutputFileAppend(filehandles[barcodeIndex][primerIndex], out);
+                    out.write(mybuffer, in2.gcount()); 
+                    out.close();
+                    delete[] mybuffer;
+                    numSplitReads[barcodeIndex][primerIndex]++;
+                               }
+                               else{
+                                       ofstream out;
+                    m->openOutputFileAppend(noMatchFile, out);
+                    out.write(mybuffer, in2.gcount()); 
+                    out.close();
+                    delete[] mybuffer;
+                               }
+                               
+                       }
                 }else{
                         m->mothurOut("Error reading."); m->mothurOutEndLine();
                 }
@@ -678,6 +1034,83 @@ int SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, i
         }
  }
  //**********************************************************************************************************************
+int SffInfoCommand::findGroup(Header header, seqRead read, int& barcode, int& primer) {
+       try {
+        //find group read belongs to
+        TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer);
+        
+        int success = 1;
+        string trashCode = "";
+        int currentSeqsDiffs = 0;
+        
+        string seq = read.bases;
+        
+        if (trim) {
+            if(header.clipQualRight < header.clipQualLeft){
+                seq = "NNNN";
+            }
+            else if((header.clipQualRight != 0) && ((header.clipQualRight-header.clipQualLeft) >= 0)){
+                seq = seq.substr((header.clipQualLeft-1), (header.clipQualRight-header.clipQualLeft));
+            }
+            else {
+                seq = seq.substr(header.clipQualLeft-1);
+            }
+        }else{
+            //if you wanted the sfftxt then you already converted the bases to the right case
+            if (!sfftxt) {
+                //make the bases you want to clip lowercase and the bases you want to keep upper case
+                if(header.clipQualRight == 0){ header.clipQualRight = seq.length();    }
+                for (int i = 0; i < (header.clipQualLeft-1); i++) { seq[i] = tolower(seq[i]);  }
+                for (int i = (header.clipQualLeft-1); i < (header.clipQualRight-1); i++)  {   seq[i] = toupper(seq[i]);  }
+                for (int i = (header.clipQualRight-1); i < seq.length(); i++) {   seq[i] = tolower(seq[i]);  }
+            }
+        }
+        
+        Sequence currSeq(header.name, seq);
+        QualityScores currQual;
+        
+        if(numLinkers != 0){
+            success = trimOligos.stripLinker(currSeq, currQual);
+            if(success > ldiffs)               {       trashCode += 'k';       }
+            else{ currentSeqsDiffs += success;  }
+            
+        }
+        
+        if(barcodes.size() != 0){
+            success = trimOligos.stripBarcode(currSeq, currQual, barcode);
+            if(success > bdiffs)               {       trashCode += 'b';       }
+            else{ currentSeqsDiffs += success;  }
+        }
+        
+        if(numSpacers != 0){
+            success = trimOligos.stripSpacer(currSeq, currQual);
+            if(success > sdiffs)               {       trashCode += 's';       }
+            else{ currentSeqsDiffs += success;  }
+            
+        }
+        
+        if(numFPrimers != 0){
+            success = trimOligos.stripForward(currSeq, currQual, primer, true);
+            if(success > pdiffs)               {       trashCode += 'f';       }
+            else{ currentSeqsDiffs += success;  }
+        }
+        
+        if (currentSeqsDiffs > tdiffs) {       trashCode += 't';   }
+        
+        if(revPrimer.size() != 0){
+            success = trimOligos.stripReverse(currSeq, currQual);
+            if(!success)                               {       trashCode += 'r';       }
+        }
+
+        
+        return trashCode.length();
+    }
+       catch(exception& e) {
+               m->errorOut(e, "SffInfoCommand", "findGroup");
+               exit(1);
+       }
+}     
+//**********************************************************************************************************************
  int SffInfoCommand::decodeName(string& timestamp, string& region, string& xy, string name) {
         try {
                 
@@ -1175,6 +1608,224 @@ vector<unsigned int> SffInfoCommand::parseHeaderLineToIntVector(ifstream& file,
                 exit(1);
         }
  }
+//***************************************************************************************************************
+
+bool SffInfoCommand::readOligos(string oligoFile){
+       try {
+        filehandles.clear();
+        numSplitReads.clear();
+        filehandlesHeaders.clear();
+        
+               ifstream inOligos;
+               m->openInputFile(oligoFile, inOligos);
+               
+               string type, oligo, group;
+        
+               int indexPrimer = 0;
+               int indexBarcode = 0;
+               
+               while(!inOligos.eof()){
+            
+                       inOligos >> type; 
+            
+                       if(type[0] == '#'){
+                               while (!inOligos.eof()) {       char c = inOligos.get();  if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
+                               m->gobble(inOligos);
+                       }
+                       else{
+                               m->gobble(inOligos);
+                               //make type case insensitive
+                               for(int i=0;i<type.length();i++){       type[i] = toupper(type[i]);  }
+                               
+                               inOligos >> oligo;
+                               
+                               for(int i=0;i<oligo.length();i++){
+                                       oligo[i] = toupper(oligo[i]);
+                                       if(oligo[i] == 'U')     {       oligo[i] = 'T'; }
+                               }
+                               
+                               if(type == "FORWARD"){
+                                       group = "";
+                                       
+                                       // get rest of line in case there is a primer name
+                                       while (!inOligos.eof()) {       
+                                               char c = inOligos.get(); 
+                                               if (c == 10 || c == 13){        break;  }
+                                               else if (c == 32 || c == 9){;} //space or tab
+                                               else {  group += c;  }
+                                       } 
+                                       
+                                       //check for repeat barcodes
+                                       map<string, int>::iterator itPrime = primers.find(oligo);
+                                       if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
+                                       
+                                       primers[oligo]=indexPrimer; indexPrimer++;              
+                                       primerNameVector.push_back(group);
+                               }else if(type == "REVERSE"){
+                                       //Sequence oligoRC("reverse", oligo);
+                                       //oligoRC.reverseComplement();
+                    string oligoRC = reverseOligo(oligo);
+                                       revPrimer.push_back(oligoRC);
+                               }
+                               else if(type == "BARCODE"){
+                                       inOligos >> group;
+                                       
+                                       //check for repeat barcodes
+                                       map<string, int>::iterator itBar = barcodes.find(oligo);
+                                       if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
+                    
+                                       barcodes[oligo]=indexBarcode; indexBarcode++;
+                                       barcodeNameVector.push_back(group);
+                               }else if(type == "LINKER"){
+                                       linker.push_back(oligo);
+                               }else if(type == "SPACER"){
+                                       spacer.push_back(oligo);
+                               }
+                               else{   m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
+                       }
+                       m->gobble(inOligos);
+               }       
+               inOligos.close();
+               
+               if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ split = 1;      }
+               
+               //add in potential combos
+               if(barcodeNameVector.size() == 0){
+                       barcodes[""] = 0;
+                       barcodeNameVector.push_back("");                        
+               }
+               
+               if(primerNameVector.size() == 0){
+                       primers[""] = 0;
+                       primerNameVector.push_back("");                 
+               }
+               
+               filehandles.resize(barcodeNameVector.size());
+               for(int i=0;i<filehandles.size();i++){
+                       filehandles[i].assign(primerNameVector.size(), "");
+               }
+                       
+               if(split > 1){
+                       set<string> uniqueNames; //used to cleanup outputFileNames
+                       for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
+                               for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
+                                       
+                                       string primerName = primerNameVector[itPrimer->second];
+                                       string barcodeName = barcodeNameVector[itBar->second];
+                                       
+                                       string comboGroupName = "";
+                                       string fastaFileName = "";
+                                       string qualFileName = "";
+                                       string nameFileName = "";
+                                       
+                                       if(primerName == ""){
+                                               comboGroupName = barcodeNameVector[itBar->second];
+                                       }
+                                       else{
+                                               if(barcodeName == ""){
+                                                       comboGroupName = primerNameVector[itPrimer->second];
+                                               }
+                                               else{
+                                                       comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second];
+                                               }
+                                       }
+                                       
+                                       ofstream temp;
+                                       string thisFilename = outputDir + m->getRootName(m->getSimpleName(currentFileName)) + comboGroupName + "." + getOutputFileNameTag("sff");
+                                       if (uniqueNames.count(thisFilename) == 0) {
+                                               outputNames.push_back(thisFilename);
+                                               outputTypes["sff"].push_back(thisFilename);
+                                               uniqueNames.insert(thisFilename);
+                                       }
+                                       
+                                       filehandles[itBar->second][itPrimer->second] = thisFilename;
+                                       m->openOutputFile(thisFilename, temp);          temp.close();
+                               }
+                       }
+               }
+               numFPrimers = primers.size();
+        numLinkers = linker.size();
+        numSpacers = spacer.size();
+               noMatchFile = outputDir + m->getRootName(m->getSimpleName(currentFileName)) + "scrap." + getOutputFileNameTag("sff");
+        m->mothurRemove(noMatchFile);
+        
+               bool allBlank = true;
+               for (int i = 0; i < barcodeNameVector.size(); i++) {
+                       if (barcodeNameVector[i] != "") {
+                               allBlank = false;
+                               break;
+                       }
+               }
+               for (int i = 0; i < primerNameVector.size(); i++) {
+                       if (primerNameVector[i] != "") {
+                               allBlank = false;
+                               break;
+                       }
+               }
+               
+        filehandlesHeaders.resize(filehandles.size());
+        numSplitReads.resize(filehandles.size());
+        for (int i = 0; i < filehandles.size(); i++) { 
+            numSplitReads[i].resize(filehandles[i].size(), 0); 
+            for (int j = 0; j < filehandles[i].size(); j++) {
+                filehandlesHeaders[i].push_back(filehandles[i][j]+"headers");
+            }
+        }
+                             
+               if (allBlank) {
+                       m->mothurOut("[WARNING]: your oligos file does not contain any group names.  mothur will not create a split the sff file."); m->mothurOutEndLine();
+                       split = 1;
+                       return false;
+               }
+               
+               return true;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SffInfoCommand", "readOligos");
+               exit(1);
+       }
+}
+//********************************************************************/
+string SffInfoCommand::reverseOligo(string oligo){
+       try {
+        string reverse = "";
+        
+        for(int i=oligo.length()-1;i>=0;i--){
+            
+            if(oligo[i] == 'A')                {       reverse += 'T'; }
+            else if(oligo[i] == 'T'){  reverse += 'A'; }
+            else if(oligo[i] == 'U'){  reverse += 'A'; }
+            
+            else if(oligo[i] == 'G'){  reverse += 'C'; }
+            else if(oligo[i] == 'C'){  reverse += 'G'; }
+            
+            else if(oligo[i] == 'R'){  reverse += 'Y'; }
+            else if(oligo[i] == 'Y'){  reverse += 'R'; }
+            
+            else if(oligo[i] == 'M'){  reverse += 'K'; }
+            else if(oligo[i] == 'K'){  reverse += 'M'; }
+            
+            else if(oligo[i] == 'W'){  reverse += 'W'; }
+            else if(oligo[i] == 'S'){  reverse += 'S'; }
+            
+            else if(oligo[i] == 'B'){  reverse += 'V'; }
+            else if(oligo[i] == 'V'){  reverse += 'B'; }
+            
+            else if(oligo[i] == 'D'){  reverse += 'H'; }
+            else if(oligo[i] == 'H'){  reverse += 'D'; }
+            
+            else                                               {       reverse += 'N'; }
+        }
+        
+        
+        return reverse;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "SffInfoCommand", "reverseOligo");
+               exit(1);
+       }
+}
  
  //**********************************************************************************************************************
  
diff --git a/sffinfocommand.h b/sffinfocommand.h

index 4e72a960a7bb07d4e94df795bea462519f6d777a..4917a274029f778e957791a744d66b5ff059890c 100644 (file)
--- a/sffinfocommand.h
+++ b/sffinfocommand.h
@@ -78,18 +78,24 @@ public:
         void help() { m->mothurOut(getHelpString()); }  
         
  private:
-       string sffFilename, sfftxtFilename, outputDir, accnosName;
-       vector<string> filenames, outputNames, accnosFileNames;
-       bool abort, fasta, qual, trim, flow, sfftxt, hasAccnos;
-       int mycount;
+       string sffFilename, sfftxtFilename, outputDir, accnosName, currentFileName, oligosfile, noMatchFile;
+       vector<string> filenames, outputNames, accnosFileNames, oligosFileNames;
+       bool abort, fasta, qual, trim, flow, sfftxt, hasAccnos, hasOligos;
+       int mycount, split, numFPrimers, numLinkers, numSpacers, pdiffs, bdiffs, ldiffs, sdiffs, tdiffs;
         set<string> seqNames;
+    map<string, int> barcodes;
+    map<string, int> primers;
+    vector<string> linker, spacer, primerNameVector, barcodeNameVector, revPrimer;
+    vector<vector<int> > numSplitReads;
+    vector<vector<string> > filehandles, filehandlesHeaders;
      
         //extract sff file functions
-       int extractSffInfo(string, string);
+       int extractSffInfo(string, string, string);
         int readCommonHeader(ifstream&, CommonHeader&);
-       int readHeader(ifstream&, Header&);
-       int readSeqData(ifstream&, seqRead&, int, int);
+       //int readHeader(ifstream&, Header&);
+       int readSeqData(ifstream&, seqRead&, int, Header&);
         int decodeName(string&, string&, string&, string);
+    bool readOligos(string oligosFile);
         
         int printCommonHeader(ofstream&, CommonHeader&); 
         int printHeader(ofstream&, Header&);
@@ -100,6 +106,9 @@ private:
         int readAccnosFile(string);
         int parseSffTxt();
         bool sanityCheck(Header&, seqRead&);
+    int adjustCommonHeader(CommonHeader);
+    int findGroup(Header header, seqRead read, int& barcode, int& primer);
+    string reverseOligo(string oligo);
      
         //parsesfftxt file functions
         int parseHeaderLineToInt(ifstream&);
diff --git a/sffmultiplecommand.cpp b/sffmultiplecommand.cpp

new file mode 100644 (file)

index 0000000..05bc9aa
--- /dev/null
+++ b/sffmultiplecommand.cpp
@@ -0,0 +1,836 @@
+//
+//  sffmultiplecommand.cpp
+//  Mothur
+//
+//  Created by Sarah Westcott on 8/14/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "sffmultiplecommand.h"
+
+
+
+//**********************************************************************************************************************
+// Registers every parameter accepted by this command in `parameters` and returns the
+// parameter names in registration order. The parameters are grouped by the command they
+// belong to (sffinfo, trim.flows, shhh.flows, trim.seqs) followed by the general ones.
+// The fourth CommandParameter argument appears to be the default value (e.g. "T" for trim,
+// whose help text says Default=True) - confirm against the CommandParameter declaration.
+// NOTE(review): `parameters` is appended to and never cleared here, and both the default
+// constructor and getHelpString call this function, so a second call on the same object
+// presumably returns every name twice - confirm.
+vector<string> SffMultipleCommand::setParameters(){    
+       try {           
+               CommandParameter pfile("file", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfile);
+        
+        //sffinfo
+               CommandParameter ptrim("trim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(ptrim);
+        
+        //trim.flows
+               CommandParameter pmaxhomop("maxhomop", "Number", "", "9", "", "", "",false,false); parameters.push_back(pmaxhomop);
+               CommandParameter pmaxflows("maxflows", "Number", "", "450", "", "", "",false,false); parameters.push_back(pmaxflows);
+               CommandParameter pminflows("minflows", "Number", "", "450", "", "", "",false,false); parameters.push_back(pminflows);
+               CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
+               CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
+        CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs);
+               CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs);
+        CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
+               CommandParameter psignal("signal", "Number", "", "0.50", "", "", "",false,false); parameters.push_back(psignal);
+               CommandParameter pnoise("noise", "Number", "", "0.70", "", "", "",false,false); parameters.push_back(pnoise);
+               CommandParameter porder("order", "String", "", "TACG", "", "", "",false,false); parameters.push_back(porder);
+
+        //shhh.flows
+        CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plookup);
+               CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff);
+               CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter);
+        CommandParameter plarge("large", "Number", "", "-1", "", "", "",false,false); parameters.push_back(plarge);
+               CommandParameter psigma("sigma", "Number", "", "60", "", "", "",false,false); parameters.push_back(psigma);
+               CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "",false,false); parameters.push_back(pmindelta);
+        
+        //trim.seqs parameters
+        // note: allfiles is the only Boolean registered with a lower-case default ("t")
+        CommandParameter pallfiles("allfiles", "Boolean", "", "t", "", "", "",false,false); parameters.push_back(pallfiles);
+        CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip);
+               CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
+               CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength);
+               CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
+               CommandParameter pkeepforward("keepforward", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepforward);
+        CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "",false,false); parameters.push_back(pkeepfirst);
+               CommandParameter premovelast("removelast", "Number", "", "0", "", "", "",false,false); parameters.push_back(premovelast);
+
+        
+        //general parameters
+        CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
+               CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
+               CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
+               
+               // hand back just the names, in the order they were registered
+               vector<string> myArray;
+               for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
+               return myArray;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SffMultipleCommand", "setParameters");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+string SffMultipleCommand::getHelpString(){    
+       try {
+               string helpString = "";
+               helpString += "The sff.multiple command reads a file containing sff filenames and optional oligos filenames. It runs the files through sffinfo, trim.flows, shhh.flows and trim.seqs combining the results.\n";
+               helpString += "The sff.multiple command parameters are: ";
+        vector<string> parameters = setParameters();
+        for (int i = 0; i < parameters.size()-1; i++) {
+            helpString += parameters[i] + ", ";
+        }
+        helpString += parameters[parameters.size()-1] + ".\n";
+               helpString += "The file parameter allows you to enter the a file containing the list of sff files and optional oligos files.\n";
+        helpString += "The trim parameter allows you to indicate if you would like a sequences and quality scores generated by sffinfo trimmed to the clipQualLeft and clipQualRight values.  Default=True. \n";
+        helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
+               helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
+               helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
+               helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
+               helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n";
+               helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
+               helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
+        helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n";
+               helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n";
+               helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
+               helpString += "The keepforward parameter allows you to indicate whether you want the forward primer removed or not. The default is F, meaning remove the forward primer.\n";
+               helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
+               helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
+
+               helpString += "Example sff.multiple(file=mySffOligosFile.txt, trim=F).\n";
+               helpString += "Note: No spaces between parameter labels (i.e. file), '=' and parameters (i.e.mySffOligosFile.txt).\n";
+               return helpString;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SffMultipleCommand", "getHelpString");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+string SffMultipleCommand::getOutputFileNameTag(string type, string inputName=""){     
+       try {
+        string outputFileName = "";
+               map<string, vector<string> >::iterator it;
+        
+        //is this a type this command creates
+        it = outputTypes.find(type);
+        if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
+        else {
+            if (type == "fasta")            {   outputFileName =  "fasta";   }
+            else if (type == "name")    {   outputFileName =  "names";   }
+            else if (type == "group")        {   outputFileName =  "groups";   }
+            else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
+        }
+        return outputFileName;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SffMultipleCommand", "getOutputFileNameTag");
+               exit(1);
+       }
+}
+
+
+//**********************************************************************************************************************
+SffMultipleCommand::SffMultipleCommand(){      
+       try {
+               abort = true; calledHelp = true; 
+               setParameters();
+               vector<string> tempOutNames;
+               outputTypes["fasta"] = tempOutNames;
+        outputTypes["name"] = tempOutNames;
+        outputTypes["group"] = tempOutNames;
+               outputTypes["flow"] = tempOutNames;
+               outputTypes["qfile"] = tempOutNames;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SffMultipleCommand", "SffMultipleCommand");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+
+SffMultipleCommand::SffMultipleCommand(string option)  {
+       try {
+               abort = false; calledHelp = false;  append=false; makeGroup=false;
+               
+               //allow user to run help
+               if(option == "help") { help(); abort = true; calledHelp = true; }
+               else if(option == "citation") { citation(); abort = true; calledHelp = true;}
+               
+               else {
+                       //valid paramters for this command
+                       vector<string> myArray = setParameters();
+                       
+                       OptionParser parser(option);
+                       map<string, string> parameters = parser.getParameters();
+                       
+                       ValidParameters validParameter;
+            map<string,string>::iterator it;
+            
+                       //check to make sure all parameters are valid for command
+                       for (it = parameters.begin(); it != parameters.end(); it++) { 
+                               if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
+                       }
+                       
+                       //initialize outputTypes
+                       vector<string> tempOutNames;
+                       outputTypes["fasta"] = tempOutNames;
+                       outputTypes["flow"] = tempOutNames;
+                       outputTypes["qfile"] = tempOutNames;
+            outputTypes["name"] = tempOutNames;
+            outputTypes["group"] = tempOutNames;
+
+                       
+                       //if the user changes the output directory command factory will send this info to us in the output parameter 
+                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
+                       
+                       //if the user changes the input directory command factory will send this info to us in the output parameter 
+                       string inputDir = validParameter.validFile(parameters, "inputdir", false);              
+                       if (inputDir == "not found"){   inputDir = "";          }
+                       else {
+                               string path;
+                it = parameters.find("file");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["file"] = inputDir + it->second;             }
+                               }
+                
+                it = parameters.find("lookup");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
+                               }
+                       }
+            
+                       filename = validParameter.validFile(parameters, "file", true);
+            if (filename == "not open") { filename = ""; abort = true; }
+            else if (filename == "not found") { filename = "";  }
+                       
+                       string temp;
+                       temp = validParameter.validFile(parameters, "trim", false);                                     if (temp == "not found"){       temp = "T";                             }
+                       trim = m->isTrue(temp); 
+            
+            temp = validParameter.validFile(parameters, "minflows", false);    if (temp == "not found") { temp = "450"; }
+                       m->mothurConvert(temp, minFlows);  
+            
+                       temp = validParameter.validFile(parameters, "maxflows", false); if (temp == "not found") { temp = "450"; }
+                       m->mothurConvert(temp, maxFlows);  
+            
+            temp = validParameter.validFile(parameters, "maxhomop", false);            if (temp == "not found"){       temp = "9";             }
+                       m->mothurConvert(temp, maxHomoP);  
+            
+                       temp = validParameter.validFile(parameters, "signal", false);           if (temp == "not found"){       temp = "0.50";  }
+                       m->mothurConvert(temp, signal);  
+            
+                       temp = validParameter.validFile(parameters, "noise", false);            if (temp == "not found"){       temp = "0.70";  }
+                       m->mothurConvert(temp, noise);  
+            
+                       temp = validParameter.validFile(parameters, "bdiffs", false);           if (temp == "not found"){       temp = "0";             }
+                       m->mothurConvert(temp, bdiffs);
+                       
+                       temp = validParameter.validFile(parameters, "pdiffs", false);           if (temp == "not found"){       temp = "0";             }
+                       m->mothurConvert(temp, pdiffs);
+                       
+            temp = validParameter.validFile(parameters, "ldiffs", false);              if (temp == "not found") { temp = "0"; }
+                       m->mothurConvert(temp, ldiffs);
+            
+            temp = validParameter.validFile(parameters, "sdiffs", false);              if (temp == "not found") { temp = "0"; }
+                       m->mothurConvert(temp, sdiffs);
+                       
+                       temp = validParameter.validFile(parameters, "tdiffs", false);           if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs;  temp = toString(tempTotal); }
+                       m->mothurConvert(temp, tdiffs);
+                       
+                       if(tdiffs == 0){        tdiffs = bdiffs + pdiffs + ldiffs + sdiffs;     }
+            
+                       
+                       temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
+                       m->setProcessors(temp);
+                       m->mothurConvert(temp, processors);
+            
+                       flowOrder = validParameter.validFile(parameters, "order", false);
+                       if (flowOrder == "not found"){ flowOrder = "TACG";              }
+                       else if(flowOrder.length() != 4){
+                               m->mothurOut("The value of the order option must be four bases long\n");
+                       }
+            
+            temp = validParameter.validFile(parameters, "cutoff", false);      if (temp == "not found"){       temp = "0.01";          }
+                       m->mothurConvert(temp, cutoff); 
+                       
+                       temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
+                       minDelta = temp; 
+            
+                       temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
+                       m->mothurConvert(temp, maxIters); 
+            
+            temp = validParameter.validFile(parameters, "large", false);       if (temp == "not found"){       temp = "0";             }
+                       m->mothurConvert(temp, largeSize); 
+            if (largeSize != 0) { large = true; }
+            else { large = false;  }
+            if (largeSize < 0) {  m->mothurOut("The value of the large cannot be negative.\n"); }
+            
+                       temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
+                       m->mothurConvert(temp, sigma); 
+            
+            temp = validParameter.validFile(parameters, "flip", false);
+                       if (temp == "not found")    {   flip = 0;       }
+                       else {  flip = m->isTrue(temp);         }
+                       
+                       temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
+                       m->mothurConvert(temp, maxAmbig);  
+                       
+                       temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "0"; }
+                       m->mothurConvert(temp, minLength); 
+                       
+                       temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "0"; }
+                       m->mothurConvert(temp, maxLength);
+                                               
+                       temp = validParameter.validFile(parameters, "keepfirst", false);        if (temp == "not found") { temp = "0"; }
+                       convert(temp, keepFirst);
+            
+                       temp = validParameter.validFile(parameters, "removelast", false);       if (temp == "not found") { temp = "0"; }
+                       convert(temp, removeLast);
+                       
+                       temp = validParameter.validFile(parameters, "allfiles", false);         if (temp == "not found") { temp = "F"; }
+                       allFiles = m->isTrue(temp);
+            
+            temp = validParameter.validFile(parameters, "keepforward", false);         if (temp == "not found") { temp = "F"; }
+                       keepforward = m->isTrue(temp);
+            
+            temp = validParameter.validFile(parameters, "lookup", true);
+                       if (temp == "not found")        {       
+                               lookupFileName = "LookUp_Titanium.pat"; 
+                               
+                               int ableToOpen;
+                               ifstream in;
+                               ableToOpen = m->openInputFile(lookupFileName, in, "noerror");
+                               in.close();     
+                               
+                               //if you can't open it, try input location
+                               if (ableToOpen == 1) {
+                                       if (inputDir != "") { //default path is set
+                                               string tryPath = inputDir + lookupFileName;
+                                               m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
+                                               ifstream in2;
+                                               ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+                                               in2.close();
+                                               lookupFileName = tryPath;
+                                       }
+                               }
+                               
+                               //if you can't open it, try default location
+                               if (ableToOpen == 1) {
+                                       if (m->getDefaultPath() != "") { //default path is set
+                                               string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName);
+                                               m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine();
+                                               ifstream in2;
+                                               ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+                                               in2.close();
+                                               lookupFileName = tryPath;
+                                       }
+                               }
+                               
+                               //if you can't open it its not in current working directory or inputDir, try mothur excutable location
+                               if (ableToOpen == 1) {
+                                       string exepath = m->argv;
+                                       string tempPath = exepath;
+                                       for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
+                                       exepath = exepath.substr(0, (tempPath.find_last_of('m')));
+                                       
+                                       string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
+                                       m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
+                                       ifstream in2;
+                                       ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+                                       in2.close();
+                                       lookupFileName = tryPath;
+                               }
+                               
+                               if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
+                       }
+                       else if(temp == "not open")     {       
+                               
+                               lookupFileName = validParameter.validFile(parameters, "lookup", false);
+                               
+                               //if you can't open it its not inputDir, try mothur excutable location
+                               string exepath = m->argv;
+                               string tempPath = exepath;
+                               for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
+                               exepath = exepath.substr(0, (tempPath.find_last_of('m')));
+                
+                               string tryPath = m->getFullPathName(exepath) + lookupFileName;
+                               m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
+                               ifstream in2;
+                               int ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+                               in2.close();
+                               lookupFileName = tryPath;
+                               
+                               if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
+                       }else                                           {       lookupFileName = temp;  }
+               }
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SffMultipleCommand", "SffMultipleCommand");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+int SffMultipleCommand::execute(){
+       try {
+               if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
+               
+               vector<string> sffFiles, oligosFiles;
+        readFile(sffFiles, oligosFiles);
+        
+        outputDir = m->hasPath(filename);
+        string fileroot = outputDir + m->getRootName(m->getSimpleName(filename));
+        string fasta = fileroot + getOutputFileNameTag("fasta");
+        string name = fileroot + getOutputFileNameTag("name");
+        string group = fileroot + getOutputFileNameTag("group");
+        
+        if (m->control_pressed) { return 0; }
+        
+        if (sffFiles.size() < processors) { processors = sffFiles.size(); }
+        
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+#else
+        //trim.flows, shhh.flows cannot handle multiple processors for windows.
+        processors = 1; m->mothurOut("This command can only use 1 processor on Windows platforms, using 1 processors.\n\n");
+#endif
+        if (processors == 1) { driver(sffFiles, oligosFiles, 0, sffFiles.size(), fasta, name, group); }
+        else { createProcesses(sffFiles, oligosFiles, fasta, name, group); } 
+               
+               if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {       m->mothurRemove(outputNames[i]);        } return 0; }
+               
+        if (append) { 
+            outputNames.push_back(fasta); outputTypes["fasta"].push_back(fasta);
+            m->setFastaFile(fasta);
+            outputNames.push_back(name); outputTypes["name"].push_back(name);
+            m->setNameFile(name);
+            if (makeGroup) { outputNames.push_back(group); outputTypes["group"].push_back(group); m->setGroupFile(group); }
+        }
+        
+               //report output filenames
+               m->mothurOutEndLine();
+               m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+               for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
+               m->mothurOutEndLine();
+        
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SffMultipleCommand", "execute");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+int SffMultipleCommand::readFile(vector<string>& sffFiles, vector<string>& oligosFiles){
+       try {
+        
+        ifstream in;
+        m->openInputFile(filename, in);
+        bool allBlank = true;
+        bool allFull = true;
+        
+        string oligos, sff;
+        while (!in.eof()) {
+            
+            if (m->control_pressed) { break; }
+            
+            in >> sff;
+            
+            sff = m->getFullPathName(sff);
+            
+            //ignore file pairing
+            if(sff[0] == '#'){ while (!in.eof())       {       char c = in.get();  if (c == 10 || c == 13){    break;  }       } m->gobble(in); }
+            else { //check for oligos file
+                oligos = "";
+            
+                // get rest of line in case there is a oligos filename
+                while (!in.eof())      {       
+                    char c = in.get(); 
+                    if (c == 10 || c == 13){   break;  }
+                    else if (c == 32 || c == 9){;} //space or tab
+                    else {     oligos += c;  }
+                } 
+                sffFiles.push_back(sff);
+                if (oligos != "") { oligos = m->getFullPathName(oligos); allBlank = false;  }
+                if (oligos == "") { allFull = false;  }
+                oligosFiles.push_back(oligos); //will push a blank if there is not an oligos for this sff file
+            }
+            m->gobble(in);
+        }
+        in.close();
+        
+        if (allBlank || allFull) { append = true; }
+        if (allFull) { makeGroup = true; }
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "SffMultipleCommand", "readFile");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+//runs sffinfo, summary.seqs, trim.flows, shhh.flows, trim.seqs, summary.seqs for each sff file.
+int SffMultipleCommand::driver(vector<string> sffFiles, vector<string> oligosFiles, int start, int end, string fasta, string name, string group){
+    try {
+        m->mothurRemove(fasta); m->mothurRemove(name); m->mothurRemove(group);
+        int count = 0;
+        for (int s = start; s < end; s++) {
+            
+            string sff = sffFiles[s];
+            string oligos = oligosFiles[s];
+            
+            m->mothurOut("\n>>>>>\tProcessing " + sff + " (file " + toString(s+1) + " of " + toString(sffFiles.size()) + ")\t<<<<<\n");
+            
+            //run sff.info
+            string inputString = "sff=" + sff + ", flow=T";
+            if (trim) { inputString += ", trim=T"; }
+            m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
+            m->mothurOut("Running command: sffinfo(" + inputString + ")"); m->mothurOutEndLine(); 
+            m->mothurCalling = true;
+            
+            Command* sffCommand = new SffInfoCommand(inputString);
+            sffCommand->execute();
+            
+            if (m->control_pressed){ break; }
+            
+            map<string, vector<string> > filenames = sffCommand->getOutputFiles();
+            
+            delete sffCommand;
+            m->mothurCalling = false;
+            m->mothurOutEndLine(); 
+            
+            //run summary.seqs on the fasta file
+            string fastaFile = "";
+            map<string, vector<string> >::iterator it = filenames.find("fasta");
+            if (it != filenames.end()) {  if ((it->second).size() != 0) { fastaFile = (it->second)[0];  } }
+            else {  m->mothurOut("[ERROR]: sffinfo did not create a fasta file, quitting.\n"); m->control_pressed = true; break;  }
+            
+            inputString = "fasta=" + fastaFile + ", processors=1";
+            m->mothurOutEndLine(); 
+            m->mothurOut("Running command: summary.seqs(" + inputString + ")"); m->mothurOutEndLine(); 
+            m->mothurCalling = true;
+            
+            Command* summarySeqsCommand = new SeqSummaryCommand(inputString);
+            summarySeqsCommand->execute();
+            
+            if (m->control_pressed){ break; }
+            
+            map<string, vector<string> > temp = summarySeqsCommand->getOutputFiles();
+            mergeOutputFileList(filenames, temp);
+            
+            delete summarySeqsCommand;
+            m->mothurCalling = false;
+            
+            m->mothurOutEndLine(); 
+            
+            //run trim.flows on the fasta file
+            string flowFile = "";
+            it = filenames.find("flow");
+            if (it != filenames.end()) {  if ((it->second).size() != 0) { flowFile = (it->second)[0];  } }
+            else {  m->mothurOut("[ERROR]: sffinfo did not create a flow file, quitting.\n"); m->control_pressed = true; break;  }
+            
+            inputString = "flow=" + flowFile;
+            if (oligos != "") { inputString += ", oligos=" + oligos; }
+            inputString += ", maxhomop=" + toString(maxHomoP) + ", maxflows=" + toString(maxFlows) + ", minflows=" + toString(minFlows);
+            inputString += ", pdiffs=" + toString(pdiffs) + ", bdiffs=" + toString(bdiffs) + ", ldiffs=" + toString(ldiffs) + ", sdiffs=" + toString(sdiffs);
+            inputString += ", tdiffs=" + toString(tdiffs) + ", signal=" + toString(signal) + ", noise=" + toString(noise) + ", order=" + flowOrder + ", processors=1";
+            
+            m->mothurOutEndLine(); 
+            m->mothurOut("Running command: trim.flows(" + inputString + ")"); m->mothurOutEndLine(); 
+            m->mothurCalling = true;
+            
+            Command* trimFlowCommand = new TrimFlowsCommand(inputString);
+            trimFlowCommand->execute();
+            
+            if (m->control_pressed){ break; }
+            
+            temp = trimFlowCommand->getOutputFiles();
+            mergeOutputFileList(filenames, temp);
+            
+            delete trimFlowCommand;
+            m->mothurCalling = false;
+            
+            
+            string fileFileName = "";
+            flowFile = "";
+            if (oligos != "") { 
+                it = temp.find("file");
+                if (it != temp.end()) {  if ((it->second).size() != 0) { fileFileName = (it->second)[0];  } }
+                else {  m->mothurOut("[ERROR]: trim.flows did not create a file file, quitting.\n"); m->control_pressed = true; break;  }
+            }else {
+                vector<string> flowFiles;
+                it = temp.find("flow");
+                if (it != temp.end()) {  if ((it->second).size() != 0) { flowFiles = (it->second);  } }
+                else {  m->mothurOut("[ERROR]: trim.flows did not create a flow file, quitting.\n"); m->control_pressed = true; break;  }
+                
+                for (int i = 0; i < flowFiles.size(); i++) {
+                    string end = flowFiles[i].substr(flowFiles[i].length()-9);
+                    if (end == "trim.flow") {
+                        flowFile = flowFiles[i]; i+=flowFiles.size(); //if we found the trim.flow file stop looking
+                    }
+                }
+            }
+            
+            if ((fileFileName == "") && (flowFile == "")) { m->mothurOut("[ERROR]: trim.flows did not create a file file or a trim.flow file, quitting.\n"); m->control_pressed = true; break;  }
+            
+            if (fileFileName != "") { inputString = "file=" + fileFileName; }
+            else { inputString = "flow=" + flowFile; }
+            
+            inputString += ", lookup=" + lookupFileName + ", cutoff=" + toString(cutoff); + ", maxiters=" + toString(maxIters);
+            if (large) { inputString += ", large=" + toString(largeSize); }
+            inputString += ", sigma=" +toString(sigma);
+            inputString += ", mindelta=" + toString(minDelta);  
+            inputString += ", order=" + flowOrder + ", processors=1";
+            
+            //run shhh.flows
+            m->mothurOutEndLine(); 
+            m->mothurOut("Running command: shhh.flows(" + inputString + ")"); m->mothurOutEndLine(); 
+            m->mothurCalling = true;
+            
+            Command* shhhFlowCommand = new ShhherCommand(inputString);
+            shhhFlowCommand->execute();
+            
+            if (m->control_pressed){ break; }
+            
+            temp = shhhFlowCommand->getOutputFiles();
+            mergeOutputFileList(filenames, temp);
+            
+            delete shhhFlowCommand;
+            m->mothurCalling = false;
+            
+            vector<string> fastaFiles;
+            vector<string> nameFiles;
+            it = temp.find("fasta");
+            if (it != temp.end()) {  if ((it->second).size() != 0) { fastaFiles = (it->second);  } }
+            else {  m->mothurOut("[ERROR]: shhh.flows did not create a fasta file, quitting.\n"); m->control_pressed = true; break;  }
+           
+            it = temp.find("name");
+            if (it != temp.end()) {  if ((it->second).size() != 0) { nameFiles = (it->second);  } }
+            else {  m->mothurOut("[ERROR]: shhh.flows did not create a name file, quitting.\n"); m->control_pressed = true; break;  }
+            
+            //find fasta and name files with the shortest name.  This is because if there is a composite name it will be the shortest.
+            fastaFile = fastaFiles[0];
+            for (int i = 1; i < fastaFiles.size(); i++) { if (fastaFiles[i].length() < fastaFile.length()) { fastaFile = fastaFiles[i]; } }
+            string nameFile = nameFiles[0];
+            for (int i = 1; i < nameFiles.size(); i++) { if (nameFiles[i].length() < nameFile.length()) { nameFile = nameFiles[i]; } }
+            
+            inputString = "fasta=" + fastaFile + ", name=" + nameFile;
+            if (oligos != "") { inputString += ", oligos=" + oligos; }
+            if (allFiles) { inputString += ", allfiles=t"; }
+            else { inputString += ", allfiles=f";  }
+            if (flip) { inputString += ", flip=t"; }
+            else { inputString += ", flip=f";  }
+            if (keepforward) { inputString += ", keepforward=t"; }
+            else { inputString += ", keepforward=f";  }
+            
+            
+            inputString += ", pdiffs=" + toString(pdiffs) + ", bdiffs=" + toString(bdiffs) + ", ldiffs=" + toString(ldiffs) + ", sdiffs=" + toString(sdiffs);
+            inputString += ", tdiffs=" + toString(tdiffs) + ", maxambig=" + toString(maxAmbig) + ", minlength=" + toString(minLength) + ", maxlength=" + toString(maxLength);
+            if (keepFirst != 0) { inputString += ", keepfirst=" + toString(keepFirst); }
+            if (removeLast != 0) { inputString += ", removelast=" + toString(removeLast); }
+            inputString += ", processors=1";
+            
+            //run trim.seqs
+            m->mothurOutEndLine(); 
+            m->mothurOut("Running command: trim.seqs(" + inputString + ")"); m->mothurOutEndLine(); 
+            m->mothurCalling = true;
+            
+            Command* trimseqsCommand = new TrimSeqsCommand(inputString);
+            trimseqsCommand->execute();
+            
+            if (m->control_pressed){ break; }
+            
+            temp = trimseqsCommand->getOutputFiles();
+            mergeOutputFileList(filenames, temp);
+            
+            delete trimseqsCommand;
+            m->mothurCalling = false;
+            
+            it = temp.find("fasta");
+            if (it != temp.end()) {  if ((it->second).size() != 0) { fastaFiles = (it->second);  } }
+            else {  m->mothurOut("[ERROR]: trim.seqs did not create a fasta file, quitting.\n"); m->control_pressed = true; break;  }
+            
+            for (int i = 0; i < fastaFiles.size(); i++) {
+                string end = fastaFiles[i].substr(fastaFiles[i].length()-10);
+                if (end == "trim.fasta") {
+                    fastaFile = fastaFiles[i]; i+=fastaFiles.size(); //if we found the trim.fasta file stop looking
+                }
+            }
+            
+            it = temp.find("name");
+            if (it != temp.end()) {  if ((it->second).size() != 0) { nameFiles = (it->second);  } }
+            else {  m->mothurOut("[ERROR]: trim.seqs did not create a name file, quitting.\n"); m->control_pressed = true; break;  }
+            
+            for (int i = 0; i < nameFiles.size(); i++) {
+                string end = nameFiles[i].substr(nameFiles[i].length()-10);
+                if (end == "trim.names") {
+                    nameFile = nameFiles[i]; i+=nameFiles.size(); //if we found the trim.names file stop looking
+                }
+            }
+            
+            vector<string> groupFiles;
+            string groupFile = "";
+            if (makeGroup) {
+                it = temp.find("group");
+                if (it != temp.end()) {  if ((it->second).size() != 0) { groupFiles = (it->second);  } }
+            
+                //find group file with the shortest name.  This is because if there is a composite group file it will be the shortest.
+                groupFile = groupFiles[0];
+                for (int i = 1; i < groupFiles.size(); i++) { if (groupFiles[i].length() < groupFile.length()) { groupFile = groupFiles[i]; } }
+            }
+            
+            inputString = "fasta=" + fastaFile + ", processors=1, name=" + nameFile;
+            m->mothurOutEndLine(); 
+            m->mothurOut("Running command: summary.seqs(" + inputString + ")"); m->mothurOutEndLine(); 
+            m->mothurCalling = true;
+            
+            summarySeqsCommand = new SeqSummaryCommand(inputString);
+            summarySeqsCommand->execute();
+            
+            if (m->control_pressed){ break; }
+            
+            temp = summarySeqsCommand->getOutputFiles();
+            mergeOutputFileList(filenames, temp);
+            
+            delete summarySeqsCommand;
+            m->mothurCalling = false;
+            
+            m->mothurOutEndLine(); 
+            m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
+            
+            if (append) {
+                m->appendFiles(fastaFile, fasta);
+                m->appendFiles(nameFile, name);
+                if (makeGroup) { m->appendFiles(groupFile, group);  }
+            }
+            count++;
+            
+            for (it = filenames.begin(); it != filenames.end(); it++) {
+                for (int i = 0; i < (it->second).size(); i++) {
+                    outputNames.push_back((it->second)[i]); outputTypes[it->first].push_back((it->second)[i]);
+                }
+            }
+        }
+        
+        return count;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "SffMultipleCommand", "driver");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+int SffMultipleCommand::mergeOutputFileList(map<string, vector<string> >& files, map<string, vector<string> >& temp){
+    try {
+        map<string, vector<string> >::iterator it;
+        for (it = temp.begin(); it != temp.end(); it++) {
+            map<string, vector<string> >::iterator it2 = files.find(it->first);
+            if (it2 == files.end()) { //we do not already have this type so just add it
+                files[it->first] = it->second;
+            }else { //merge them
+                for (int i = 0; i < (it->second).size(); i++) {
+                    files[it->first].push_back((it->second)[i]);
+                }
+            }
+        }
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "SffMultipleCommand", "mergeOutputFileList");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+int SffMultipleCommand::createProcesses(vector<string> sffFiles, vector<string> oligosFiles, string fasta, string name, string group){
+    try {
+        vector<int> processIDS;
+               int process = 1;
+               int num = 0;
+                               
+               //divide the groups between the processors
+               vector<linePair> lines;
+        vector<int> numFilesToComplete;
+               int numFilesPerProcessor = sffFiles.size() / processors;
+               for (int i = 0; i < processors; i++) {
+                       int startIndex =  i * numFilesPerProcessor;
+                       int endIndex = (i+1) * numFilesPerProcessor;
+                       if(i == (processors - 1)){      endIndex = sffFiles.size();     }
+                       lines.push_back(linePair(startIndex, endIndex));
+            numFilesToComplete.push_back((endIndex-startIndex));
+               }
+               
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)         
+               
+               //loop through and create all the processes you want
+               while (process != processors) {
+                       int pid = fork();
+                       
+                       if (pid > 0) {
+                               processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
+                               process++;
+                       }else if (pid == 0){
+                               num = driver(sffFiles, oligosFiles, lines[process].start, lines[process].end, fasta + toString(getpid()) + ".temp", name  + toString(getpid()) + ".temp", group  + toString(getpid()) + ".temp");
+                
+                //pass numSeqs to parent
+                               ofstream out;
+                               string tempFile = toString(getpid()) + ".num.temp";
+                               m->openOutputFile(tempFile, out);
+                               out << num << '\t' << outputNames.size() << endl;
+                for (int i = 0; i < outputNames.size(); i++) {  out << outputNames[i] << endl;  }
+                               out.close();
+                
+                               exit(0);
+                       }else { 
+                               m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
+                               for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
+                               exit(0);
+                       }
+               }
+               
+               //do my part
+               num = driver(sffFiles, oligosFiles, lines[0].start, lines[0].end, fasta, name, group);
+               
+               //force parent to wait until all the processes are done
+               for (int i=0;i<processIDS.size();i++) { 
+                       int temp = processIDS[i];
+                       wait(&temp);
+               }
+        
+        for (int i=0;i<processIDS.size();i++) { 
+            ifstream in;
+                       string tempFile = toString(processIDS[i]) + ".num.temp";
+                       m->openInputFile(tempFile, in);
+                       if (!in.eof()) { 
+                int tempNum = 0; int outputNamesSize = 0; 
+                in >> tempNum >> outputNamesSize; m->gobble(in);
+                for (int j = 0; j < outputNamesSize; j++) {
+                    string tempName;
+                    in >> tempName; m->gobble(in);
+                    outputNames.push_back(tempName);
+                }
+                if (tempNum != numFilesToComplete[i+1]) {
+                    m->mothurOut("[ERROR]: main process expected " + toString(processIDS[i]) + " to complete " + toString(numFilesToComplete[i+1]) + " files, and it only reported completing " + toString(tempNum) + ". This will cause file mismatches.  The flow files may be too large to process with multiple processors. \n");
+                }
+            }
+                       in.close(); m->mothurRemove(tempFile);
+            
+            if (append) {
+                m->appendFiles(fasta+toString(processIDS[i])+".temp", fasta);   m->mothurRemove(fasta+toString(processIDS[i])+".temp");
+                m->appendFiles(name+toString(processIDS[i])+".temp", name);     m->mothurRemove(name+toString(processIDS[i])+".temp");
+                if (makeGroup) { m->appendFiles(group+toString(processIDS[i])+".temp", group);  m->mothurRemove(group+toString(processIDS[i])+".temp"); }
+            }
+        }
+#endif
+        return 0;
+        
+    }
+       catch(exception& e) {
+               m->errorOut(e, "ShhherCommand", "createProcesses");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+
+
+
+
diff --git a/sffmultiplecommand.h b/sffmultiplecommand.h

new file mode 100644 (file)

index 0000000..4ab2c97
--- /dev/null
+++ b/sffmultiplecommand.h
@@ -0,0 +1,62 @@
+#ifndef Mothur_sffmultiplecommand_h
+#define Mothur_sffmultiplecommand_h
+
+//
+//  sffmultiplecommand.h
+//  Mothur
+//
+//  Created by Sarah Westcott on 8/14/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "command.hpp"
+#include "sffinfocommand.h"
+#include "seqsummarycommand.h"
+#include "trimflowscommand.h"
+#include "shhhercommand.h"
+#include "trimseqscommand.h"
+
+class SffMultipleCommand : public Command {
+       // Implements the "sff.multiple" command; getDescription() below states which commands it runs and that it combines their results.
+public:
+       SffMultipleCommand(string);
+       SffMultipleCommand();
+       ~SffMultipleCommand(){}
+       
+       vector<string> setParameters();
+       string getCommandName()                 { return "sff.multiple";                        }
+       string getCommandCategory()             { return "Sequence Processing";         }
+       string getOutputFileNameTag(string, string);
+       string getHelpString(); 
+       string getCitation() { return "http://www.mothur.org/wiki/Sff.multiple"; }
+       string getDescription()         { return "run multiple sff files through, sffinfo, trim.flow, shhh.flows and trim.seqs combining the results"; }
+    
+       int execute(); 
+       void help() { m->mothurOut(getHelpString()); }  
+       
+private:
+    //range of sffFiles indices handed to one process; createProcesses counts the files in a range as end - start
+    struct linePair {
+               int start;
+               int end;
+               linePair(int i, int j) : start(i), end(j) {}
+       };
+       //settings for the run; NOTE(review): presumably filled from the user's options by the constructor - confirm in the implementation
+       string filename, outputDir, flowOrder, lookupFileName, minDelta;
+       vector<string> outputNames;
+       bool abort, trim, large, flip, allFiles, keepforward, append, makeGroup;
+       int maxFlows, minFlows, minLength, maxLength, maxHomoP, tdiffs, bdiffs, pdiffs, sdiffs, ldiffs;
+       int processors, maxIters, largeSize;
+       float signal, noise, cutoff, sigma;
+    int keepFirst, removeLast, maxAmbig;
+    //helpers; for createProcesses and driver the three unnamed strings are the fasta, name and group file names
+    int readFile(vector<string>& sffFiles, vector<string>& oligosFiles); //NOTE(review): presumably fills both lists from the input file - confirm
+    int createProcesses(vector<string> sffFiles, vector<string> oligosFiles, string, string, string); //runs driver on slices of the files in forked processes (Unix-like builds only); returns 0
+    int driver(vector<string> sffFiles, vector<string> oligosFiles, int start, int end, string, string, string); //returns the number of files it completed
+    int mergeOutputFileList(map<string, vector<string> >& files, map<string, vector<string> >& temp); //adds temp's file names to files, type by type; returns 0
+
+
+    
+};
+
+#endif
diff --git a/sharedcommand.cpp b/sharedcommand.cpp

index 1150e53735481aea3377ec81f78c0d2544f324e6..542f8d346447322aea63491e092b97632449bd62 100644 (file)
--- a/sharedcommand.cpp
+++ b/sharedcommand.cpp
@@ -188,7 +188,11 @@ SharedCommand::SharedCommand(string option)  {
               countfile = validParameter.validFile(parameters, "count", true);
               if (countfile == "not open") { countfile = ""; abort = true; }    
               else if (countfile == "not found") { countfile = ""; }
-             else {  m->setCountTableFile(countfile); }
+             else {  
+                 m->setCountTableFile(countfile); 
+                 CountTable temp;
+                 if (!temp.testGroups(countfile)) { m->mothurOut("[ERROR]: Your count file does not have group info, aborting."); m->mothurOutEndLine(); abort=true; }
+             }
                          
              if ((biomfile == "") && (listfile == "")) { 
                                 //is there are current file available for either of these?
@@ -825,7 +829,7 @@ int SharedCommand::createSharedFromListGroup(string filename) {
          int error = ListGroupSameSeqs(namesSeqs, SharedList);
          
          if ((!pickedGroups) && (SharedList->getNumSeqs() != numGroupNames)) {  //if the user has not specified any groups and their files don't match exit with error
-            m->mothurOut("Your group file contains " + toString(numGroupNames) + " sequences and list file contains " + toString(SharedList->getNumSeqs()) + " sequences. Please correct."); m->mothurOutEndLine(); 
+            m->mothurOut("Your group file contains " + toString(numGroupNames) + " sequences and list file contains " + toString(SharedList->getNumSeqs()) + " sequences. Please correct."); m->mothurOutEndLine(); m->control_pressed = true;
              
              out.close(); m->mothurRemove(filename); //remove blank shared file you made
              
@@ -1083,8 +1087,12 @@ int SharedCommand::ListGroupSameSeqs(vector<string>& groupMapsSeqs, SharedListVe
                         for (int j = 0; j < listNames.size(); j++) {
                                 int num = groupNamesSeqs.count(listNames[j]);
                                 
-                               if (num == 0) { error = 1; m->mothurOut("[ERROR]: " + listNames[j] + " is in your listfile and not in your groupfile. Please correct."); m->mothurOutEndLine(); }
-                               else { groupNamesSeqs.erase(listNames[j]); }
+                               if (num == 0) { 
+                    error = 1; 
+                    if (groupfile != "") { 
+                        m->mothurOut("[ERROR]: " + listNames[j] + " is in your listfile and not in your groupfile. Please correct."); m->mothurOutEndLine();   } 
+                    else{ m->mothurOut("[ERROR]: " + listNames[j] + " is in your listfile and not in your count file. Please correct."); m->mothurOutEndLine();        }
+                }else { groupNamesSeqs.erase(listNames[j]); }
                         }
                 }
                 
diff --git a/sharedrabundvector.h b/sharedrabundvector.h

index 792543e5abe87e9ac346f98004f33af1c8a39e46..419d15a15af7b73872c2678675aff1a4f564c759 100644 (file)
--- a/sharedrabundvector.h
+++ b/sharedrabundvector.h
@@ -24,7 +24,6 @@
         An individual which knows the OTU from which it came, 
         the group it is in and its abundance.  */
  
-//class GlobalData;
  
  class SharedRAbundVector : public DataVector {
         
diff --git a/sharedutilities.cpp b/sharedutilities.cpp

index 151b254c7ca983abe1568a962aba5eef0e2f5ead..71d7782cad87c7914c2d14071ec1cc657b2803d1 100644 (file)
--- a/sharedutilities.cpp
+++ b/sharedutilities.cpp
@@ -120,7 +120,7 @@ void SharedUtil::setGroups(vector<string>& userGroups, vector<string>& allGroups
                                 
                                 //if the user only entered invalid groups
                                 if (userGroups.size() == 0) { 
-                                       m->mothurOut("You provided no valid groups. I will run the command using all the groups in your groupfile."); m->mothurOutEndLine();
+                                       m->mothurOut("You provided no valid groups. I will run the command using all the groups in your file."); m->mothurOutEndLine();
                                         for (int i = 0; i < allGroups.size(); i++) {
                                                 userGroups.push_back(allGroups[i]);
                                         }
diff --git a/shhhercommand.cpp b/shhhercommand.cpp

index c34f25de509c78b9e1b97b7c303142864e3c7e01..19ffc899d60ac778ac4eedbeb2034a62b5e04bd7 100644 (file)
--- a/shhhercommand.cpp
+++ b/shhhercommand.cpp
@@ -776,8 +776,8 @@ int ShhherCommand::execute(){
  
                 
                 if(compositeFASTAFileName != ""){
-                       outputNames.push_back(compositeFASTAFileName);
-                       outputNames.push_back(compositeNamesFileName);
+                       outputNames.push_back(compositeFASTAFileName); outputTypes["fasta"].push_back(compositeFASTAFileName);
+                       outputNames.push_back(compositeNamesFileName); outputTypes["name"].push_back(compositeNamesFileName); 
                 }
  
                 m->mothurOutEndLine();
@@ -1039,7 +1039,12 @@ void ShhherCommand::getFlowData(){
          
          float intensity;
          
-        flowFile >> numFlowCells;
+        string numFlowTest;
+        flowFile >> numFlowTest;
+        
+        if (!m->isContainingOnlyDigits(numFlowTest)) { m->mothurOut("[ERROR]: expected a number and got " + numFlowTest + ", quitting. Did you use the flow parameter instead of the file parameter?"); m->mothurOutEndLine(); exit(1); }
+        else { convert(numFlowTest, numFlowCells); }
+        
          int index = 0;//pcluster
          while(!flowFile.eof()){
              
@@ -1376,17 +1381,17 @@ string ShhherCommand::cluster(string distFileName, string namesFileName){
      try {
          
          ReadMatrix* read = new ReadColumnMatrix(distFileName);         
-        read->setCutoff(cutoff);
-        
-        NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
-        clusterNameMap->readMap();
-        read->read(clusterNameMap);
-        
-        ListVector* list = read->getListVector();
-        SparseMatrix* matrix = read->getMatrix();
+               read->setCutoff(cutoff);
+               
+               NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
+               clusterNameMap->readMap();
+               read->read(clusterNameMap);
          
-        delete read; 
-        delete clusterNameMap; 
+               ListVector* list = read->getListVector();
+               SparseDistanceMatrix* matrix = read->getDMatrix();
+               
+               delete read; 
+               delete clusterNameMap; 
          
          RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
          
@@ -1738,7 +1743,7 @@ void ShhherCommand::writeQualities(vector<int> otuCounts){
              }
          }
          qualityFile.close();
-        outputNames.push_back(qualityFileName);
+        outputNames.push_back(qualityFileName); outputTypes["qfile"].push_back(qualityFileName);
          
      }
      catch(exception& e) {
@@ -1783,7 +1788,7 @@ void ShhherCommand::writeSequences(vector<int> otuCounts){
          }
          fastaFile.close();
          
-        outputNames.push_back(fastaFileName);
+        outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
          
          if(compositeFASTAFileName != ""){
              m->appendFiles(fastaFileName, compositeFASTAFileName);
@@ -1820,7 +1825,7 @@ void ShhherCommand::writeNames(vector<int> otuCounts){
              }
          }
          nameFile.close();
-        outputNames.push_back(nameFileName);
+        outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName);
          
          
          if(compositeNamesFileName != ""){
@@ -1852,7 +1857,7 @@ void ShhherCommand::writeGroups(){
              groupFile << seqNameVector[i] << '\t' << fileGroup << endl;
          }
          groupFile.close();
-        outputNames.push_back(groupFileName);
+        outputNames.push_back(groupFileName); outputTypes["group"].push_back(groupFileName);
          
      }
      catch(exception& e) {
@@ -1912,7 +1917,7 @@ void ShhherCommand::writeClusters(vector<int> otuCounts){
              }
          }
          otuCountsFile.close();
-        outputNames.push_back(otuCountsFileName);
+        outputNames.push_back(otuCountsFileName); outputTypes["counts"].push_back(otuCountsFileName);
          
      }
      catch(exception& e) {
@@ -1926,7 +1931,7 @@ void ShhherCommand::writeClusters(vector<int> otuCounts){
  
  int ShhherCommand::execute(){
         try {
-               if (abort == true) { return 0; }
+               if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
                 
                 getSingleLookUp();      if (m->control_pressed) { return 0; }
                 getJointLookUp();       if (m->control_pressed) { return 0; }
@@ -1943,8 +1948,8 @@ int ShhherCommand::execute(){
  #endif
          
                 if(compositeFASTAFileName != ""){
-                       outputNames.push_back(compositeFASTAFileName);
-                       outputNames.push_back(compositeNamesFileName);
+                       outputNames.push_back(compositeFASTAFileName); outputTypes["fasta"].push_back(compositeFASTAFileName);
+                       outputNames.push_back(compositeNamesFileName); outputTypes["name"].push_back(compositeNamesFileName);
                 }
  
                 m->mothurOutEndLine();
@@ -2029,7 +2034,7 @@ int ShhherCommand::createProcesses(vector<string> filenames){
                 //Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct. 
                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
                 //////////////////////////////////////////////////////////////////////////////////////////////////////
-               
+               /*
                 vector<shhhFlowsData*> pDataArray; 
                 DWORD   dwThreadIdArray[processors-1];
                 HANDLE  hThreadArray[processors-1]; 
@@ -2060,7 +2065,7 @@ int ShhherCommand::createProcesses(vector<string> filenames){
                         CloseHandle(hThreadArray[i]);
                         delete pDataArray[i];
                 }
-               
+               */
          #endif
          
          for (int i=0;i<processIDS.size();i++) { 
@@ -2382,7 +2387,12 @@ int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVecto
                 thisFlowDataIntI.clear();
                 thisNameMap.clear();
                 
-               flowFile >> numFlowCells;
+               string numFlowTest;
+        flowFile >> numFlowTest;
+        
+        if (!m->isContainingOnlyDigits(numFlowTest)) { m->mothurOut("[ERROR]: expected a number and got " + numFlowTest + ", quitting. Did you use the flow parameter instead of the file parameter?"); m->mothurOutEndLine(); exit(1); }
+        else { convert(numFlowTest, numFlowCells); }
+        
          if (m->debug) { m->mothurOut("[DEBUG]: numFlowCells = " + toString(numFlowCells) + ".\n"); }
                 int index = 0;//pcluster
                 while(!flowFile.eof()){
@@ -3256,7 +3266,7 @@ void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string quality
                         }
                 }
                 qualityFile.close();
-               outputNames.push_back(qualityFileName);
+               outputNames.push_back(qualityFileName); outputTypes["qfile"].push_back(qualityFileName);
          
         }
         catch(exception& e) {
@@ -3300,7 +3310,7 @@ void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTU
                 }
                 fastaFile.close();
          
-               outputNames.push_back(fastaFileName);
+               outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
          
                 if(thisCompositeFASTAFileName != ""){
                         m->appendFiles(fastaFileName, thisCompositeFASTAFileName);
@@ -3335,7 +3345,7 @@ void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, s
                         }
                 }
                 nameFile.close();
-               outputNames.push_back(nameFileName);
+               outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName);
                 
                 
                 if(thisCompositeNamesFileName != ""){
@@ -3360,7 +3370,7 @@ void ShhherCommand::writeGroups(string groupFileName, string fileRoot, int numSe
                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
                 }
                 groupFile.close();
-               outputNames.push_back(groupFileName);
+               outputNames.push_back(groupFileName); outputTypes["group"].push_back(groupFileName);
          
         }
         catch(exception& e) {
@@ -3419,7 +3429,7 @@ void ShhherCommand::writeClusters(string otuCountsFileName, int numOTUs, int num
                         }
                 }
                 otuCountsFile.close();
-               outputNames.push_back(otuCountsFileName);
+               outputNames.push_back(otuCountsFileName); outputTypes["counts"].push_back(otuCountsFileName);
          
         }
         catch(exception& e) {
diff --git a/shhhercommand.h b/shhhercommand.h

index 8446444bcc23d1f0548cfe2c2d2cfc9868467434..ef52dcd1d761cb26d67bb3c8556e7358c3d57440 100644 (file)
--- a/shhhercommand.h
+++ b/shhhercommand.h
@@ -18,7 +18,6 @@
  #include "sabundvector.hpp"
  #include "listvector.hpp"
  #include "cluster.hpp"
-#include "sparsematrix.hpp"
  #include <cfloat>
  
  //**********************************************************************************************************************
@@ -167,7 +166,7 @@ private:
         
  };
  
-/**************************************************************************************************/
+/**************************************************************************************************
  //custom data structure for threads to use.
  // This is passed by void pointer so it can be any data type
  // that can be passed using a single void pointer (LPVOID).
@@ -203,7 +202,7 @@ struct shhhFlowsData {
         }
  };
  
-/**************************************************************************************************/
+/**************************************************************************************************
  #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
  #else
  static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){ 
@@ -234,7 +233,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
              int numFlowCells;
              
              //int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
-            /*****************************************************************************************************/
+            /*****************************************************************************************************
              
              ifstream flowFile;
             // cout << "herethread " << flowFileName << '\t' << &flowFile << endl;
@@ -279,13 +278,13 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
                  }
              }
            //  cout << "here" << endl; 
-            /*****************************************************************************************************/
+            /*****************************************************************************************************
         
                         if (pDataArray->m->control_pressed) { return 0; }
                         
                         pDataArray->m->mothurOut("Identifying unique flowgrams...\n");
                         //int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
-            /*****************************************************************************************************/
+            /*****************************************************************************************************
              int numUniques = 0;
              uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
              uniqueCount.assign(numSeqs, 0);                                                    //      anWeights
@@ -364,7 +363,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
                  }
              }            
              
-            /*****************************************************************************************************/
+            /*****************************************************************************************************
                         
                         if (pDataArray->m->control_pressed) { return 0; }
                         
@@ -374,7 +373,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
              double begClock = clock();
              
              //flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);  
-            /*****************************************************************************************************/
+            /*****************************************************************************************************
              ostringstream outStream;
              outStream.setf(ios::fixed, ios::floatfield);
              outStream.setf(ios::dec, ios::basefield);
@@ -390,7 +389,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
                  
                  for(int j=0;j<i;j++){
                      //float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
-                    /*****************************************************************************************************/
+                    /*****************************************************************************************************
                      int seqA = mapUniqueToSeq[i]; int seqB = mapUniqueToSeq[j];
                      int minLength = lengths[mapSeqToUnique[seqA]];
                      if(lengths[seqB] < minLength){     minLength = lengths[mapSeqToUnique[seqB]];      }
@@ -413,7 +412,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
                      }
                      
                      flowDistance /= (float) minLength;
-                    /*****************************************************************************************************/
+                    /*****************************************************************************************************
  
                      if(flowDistance < 1e-6){
                          outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
@@ -439,14 +438,14 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
                  pDataArray->m->mothurOut("\t" + toString((clock()-thisbegClock)/CLOCKS_PER_SEC));
                  pDataArray->m->mothurOutEndLine();
              }
-            /*****************************************************************************************************/
+            /*****************************************************************************************************
  
              pDataArray->m->mothurOutEndLine();
              pDataArray->m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
              
                         string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
                         //createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
-            /*****************************************************************************************************/
+            /*****************************************************************************************************
              vector<string> duplicateNames(numUniques, "");
              for(int i=0;i<numSeqs;i++){
                  duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
@@ -460,14 +459,14 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
                  nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
              }
              nameFile.close();
-            /*****************************************************************************************************/
+            /*****************************************************************************************************
  
                         if (pDataArray->m->control_pressed) { return 0; }
                         
                         pDataArray->m->mothurOut("\nClustering flowgrams...\n");
              string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
                         //cluster(listFileName, distFileName, namesFileName);
-            /*****************************************************************************************************/
+            /*****************************************************************************************************
              ReadMatrix* read = new ReadColumnMatrix(distFileName);     
              read->setCutoff(pDataArray->cutoff);
              
@@ -502,7 +501,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
              listFileOut.close();
              
              delete matrix;     delete cluster; delete rabund; delete list;
-            /*****************************************************************************************************/
+            /*****************************************************************************************************
  
                         if (pDataArray->m->control_pressed) { return 0; }
              
@@ -516,7 +515,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
              
                         
                         //int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
-                       /*****************************************************************************************************/
+                       /*****************************************************************************************************
              ifstream listFile;
              pDataArray->m->openInputFile(listFileName, listFile);
              string label;
@@ -596,7 +595,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
              seqIndex = seqNumber;
              
              listFile.close();      
-            /*****************************************************************************************************/
+            /*****************************************************************************************************
  
                         if (pDataArray->m->control_pressed) { return 0; }
                         
@@ -643,7 +642,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
                                 double cycClock = clock();
                                 unsigned long long cycTime = time(NULL);
                                 //fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
-                /*****************************************************************************************************/
+                /*****************************************************************************************************
                  int indexFill = 0;
                  for(int i=0;i<numOTUs;i++){
                      
@@ -657,13 +656,13 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
                          indexFill++;
                      }
                  }
-                /*****************************************************************************************************/
+                /*****************************************************************************************************
  
                                 
                                 if (pDataArray->m->control_pressed) { break; }
                  
                                 //calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
-                /*****************************************************************************************************/
+                /*****************************************************************************************************
                  for(int i=0;i<numOTUs;i++){
                      
                      if (pDataArray->m->control_pressed) { break; }
@@ -708,7 +707,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
                              
                              for(int k=0;k<position;k++){
                                 // double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
-                                /*****************************************************************************************************/
+                                /*****************************************************************************************************
                                  int flowAValue = anL[k] * numFlowCells;
                                  int flowBValue = nI * numFlowCells;
                                  
@@ -721,7 +720,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
                                  }
                                  
                                  dist = dist / (double)lengths[nI];
-                                /*****************************************************************************************************/
+                                /*****************************************************************************************************
                                  adF[k] += dist * tauValue;
                              }
                          }
@@ -743,12 +742,12 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
                          centroids[i] = -1;                     
                      }
                  }
-                /*****************************************************************************************************/
+                /*****************************************************************************************************
  
                                 if (pDataArray->m->control_pressed) { break; }
                  
                                 //maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);  
-                /*****************************************************************************************************/
+                /*****************************************************************************************************
                  double maxChange = 0;
                  
                  for(int i=0;i<numOTUs;i++){
@@ -768,12 +767,12 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
                      if(difference > maxChange){        maxChange = difference; }
                  }
                  maxDelta = maxChange;
-                /*****************************************************************************************************/
+                /*****************************************************************************************************
  
                  if (pDataArray->m->control_pressed) { break; }
                  
                                 //double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight); 
-                /*****************************************************************************************************/
+                /*****************************************************************************************************
                  vector<long double> P(numSeqs, 0);
                  int effNumOTUs = 0;
                  
@@ -804,12 +803,12 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
                  }
                  
                  nLL = nLL -(double)numSeqs * log(pDataArray->sigma);
-                /*****************************************************************************************************/
+                /*****************************************************************************************************
  
                  if (pDataArray->m->control_pressed) { break; }
                  
                                 //checkCentroids(numOTUs, centroids, weight);
-                /*****************************************************************************************************/
+                /*****************************************************************************************************
                  vector<int> unique(numOTUs, 1);
                  
                  for(int i=0;i<numOTUs;i++){
@@ -837,12 +836,12 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
                          }
                      }
                  }
-                /*****************************************************************************************************/
+                /*****************************************************************************************************
  
                                 if (pDataArray->m->control_pressed) { break; }
                                 
                                 //calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU,  dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
-                /*****************************************************************************************************/
+                /*****************************************************************************************************
                  int total = 0;
                  vector<double> newTau(numOTUs,0);
                  vector<double> norms(numSeqs, 0);
@@ -860,7 +859,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
                          
                          if(weight[j] > MIN_WEIGHT && change[j] == 1){
                              //dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
-                            /*****************************************************************************************************/
+                            /*****************************************************************************************************
                              int flowAValue = centroids[j] * numFlowCells;
                              int flowBValue = i * numFlowCells;
                              
@@ -873,7 +872,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
                              }
                              
                              dist[indexOffset + j] = distTemp / (double)lengths[i];
-                            /*****************************************************************************************************/
+                            /*****************************************************************************************************
  
                          }
                          
@@ -917,7 +916,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
                      
                  }
  
-                /*****************************************************************************************************/
+                /*****************************************************************************************************
  
                                 if (pDataArray->m->control_pressed) { break; }
                                 
@@ -931,7 +930,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
                         
                         pDataArray->m->mothurOut("\nFinalizing...\n");
                         //fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
-            /*****************************************************************************************************/
+            /*****************************************************************************************************
              int indexFill = 0;
              for(int i=0;i<numOTUs;i++){
                  
@@ -945,12 +944,12 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
                      indexFill++;
                  }
              }
-            /*****************************************************************************************************/
+            /*****************************************************************************************************
  
                         if (pDataArray->m->control_pressed) { break; }
                         
                         //setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
-            /*****************************************************************************************************/
+            /*****************************************************************************************************
              vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
              
              for(int i=0;i<numOTUs;i++){
@@ -993,7 +992,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
              }
              
              //fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);   
-            /*****************************************************************************************************/
+            /*****************************************************************************************************
              indexFill = 0;
              for(int i=0;i<numOTUs;i++){
                  
@@ -1009,7 +1008,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
              }
              /*****************************************************************************************************/
  
-            /*****************************************************************************************************/
+            /*****************************************************************************************************
  
                         if (pDataArray->m->control_pressed) { break; }
                         
@@ -1017,7 +1016,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
                         for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
                         
                         //calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);       
-            /*****************************************************************************************************/
+            /*****************************************************************************************************
              for(int i=0;i<numOTUs;i++){
                  
                  if (pDataArray->m->control_pressed) { break; }
@@ -1062,7 +1061,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
                          
                          for(int k=0;k<position;k++){
                              // double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
-                            /*****************************************************************************************************/
+                            /*****************************************************************************************************
                              int flowAValue = anL[k] * numFlowCells;
                              int flowBValue = nI * numFlowCells;
                              
@@ -1075,7 +1074,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
                              }
                              
                              dist = dist / (double)lengths[nI];
-                            /*****************************************************************************************************/
+                            /*****************************************************************************************************
                              adF[k] += dist * tauValue;
                          }
                      }
@@ -1098,13 +1097,13 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
                  }
              }
  
-            /*****************************************************************************************************/
+            /*****************************************************************************************************
  
              if (pDataArray->m->control_pressed) { break; }
              
                         //writeQualities(numOTUs, numFlowCells, flowFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); 
              if (pDataArray->m->control_pressed) { break; }
-            /*****************************************************************************************************/
+            /*****************************************************************************************************
              string thisOutputDir = pDataArray->outputDir;
              if (pDataArray->outputDir == "") {  thisOutputDir += pDataArray->m->hasPath(flowFileName);  }
              string qualityFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.qual";
@@ -1200,11 +1199,11 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
              }
              qualityFile.close();
              pDataArray->outputNames.push_back(qualityFileName);
-            /*****************************************************************************************************/
+            /*****************************************************************************************************
  
             // writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, flowFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);
              if (pDataArray->m->control_pressed) { break; }
-            /*****************************************************************************************************/
+            /*****************************************************************************************************
              thisOutputDir = pDataArray->outputDir;
              if (pDataArray->outputDir == "") {  thisOutputDir += pDataArray->m->hasPath(flowFileName);  }
              string fastaFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.fasta";
@@ -1243,11 +1242,11 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
                  pDataArray->m->appendFiles(fastaFileName, pDataArray->thisCompositeFASTAFileName);
              }
  
-            /*****************************************************************************************************/
+            /*****************************************************************************************************
  
              //writeNames(thisCompositeNamesFileName, numOTUs, flowFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);                               
              if (pDataArray->m->control_pressed) { break; }
-            /*****************************************************************************************************/
+            /*****************************************************************************************************
              thisOutputDir = pDataArray->outputDir;
              if (pDataArray->outputDir == "") {  thisOutputDir += pDataArray->m->hasPath(flowFileName);  }
              string nameFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.names";
@@ -1275,11 +1274,11 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
              if(pDataArray->thisCompositeNameFileName != ""){
                  pDataArray->m->appendFiles(nameFileName, pDataArray->thisCompositeNameFileName);
              }          
-            /*****************************************************************************************************/
+            /*****************************************************************************************************
  
              //writeClusters(flowFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);                        
              if (pDataArray->m->control_pressed) { break; }
-            /*****************************************************************************************************/
+            /*****************************************************************************************************
              thisOutputDir = pDataArray->outputDir;
              if (pDataArray->outputDir == "") {  thisOutputDir += pDataArray->m->hasPath(flowFileName);  }
              string otuCountsFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.counts";
@@ -1327,12 +1326,12 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
                  }
              }
              otuCountsFile.close();
-            pDataArray->outputNames.push_back(otuCountsFileName);
-            /*****************************************************************************************************/
+            pDataArray->outputNames.push_back(otuCountsFileName)
+            /*****************************************************************************************************
  
              //writeGroups(flowFileName, numSeqs, seqNameVector);                                               
              if (pDataArray->m->control_pressed) { break; }
-            /*****************************************************************************************************/
+            /*****************************************************************************************************
              thisOutputDir = pDataArray->outputDir;
              if (pDataArray->outputDir == "") {  thisOutputDir += pDataArray->m->hasPath(flowFileName);  }
              string fileRoot = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName));
@@ -1346,7 +1345,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
              }
              groupFile.close();
              pDataArray->outputNames.push_back(groupFileName);
-            /*****************************************************************************************************/
+            /*****************************************************************************************************
  
              pDataArray->m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
                 }
@@ -1362,7 +1361,7 @@ static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
         }
  } 
  #endif
-
+*/
  
  #endif
  
diff --git a/sortseqscommand.cpp b/sortseqscommand.cpp

index ee7bf7304c0a952fb29064aa87e0cf3892b98808..b0af15441aaedcebb325294e1c91b8b7ab5389b1 100644 (file)
--- a/sortseqscommand.cpp
+++ b/sortseqscommand.cpp
@@ -15,8 +15,9 @@ vector<string> SortSeqsCommand::setParameters(){
         try {
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pfasta);
          CommandParameter pflow("flow", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pflow);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pname);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pgroup);
+        CommandParameter pname("name", "InputTypes", "", "", "NameCount", "FNGLT", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "FNGLT", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "FNGLT", "none",false,false); parameters.push_back(pgroup);
                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(ptaxonomy);
                 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pqfile);
                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
@@ -37,8 +38,8 @@ vector<string> SortSeqsCommand::setParameters(){
  string SortSeqsCommand::getHelpString(){       
         try {
                 string helpString = "";
-               helpString += "The sort.seqs command puts the sequences in the same order for the following file types: accnos fasta, name, group, taxonomy, flow or quality file.\n";
-        helpString += "The sort.seqs command parameters are accnos, fasta, name, group, taxonomy, flow, qfile and large.\n";
+               helpString += "The sort.seqs command puts the sequences in the same order for the following file types: accnos fasta, name, group, count, taxonomy, flow or quality file.\n";
+        helpString += "The sort.seqs command parameters are accnos, fasta, name, group, count, taxonomy, flow, qfile and large.\n";
          helpString += "The accnos file allows you to specify the order you want the files in.  If none is provided, mothur will use the order of the first file it reads.\n";
          helpString += "The large parameters is used to indicate your files are too large to fit in RAM.\n";
                 helpString += "The sort.seqs command should be in the following format: sort.seqs(fasta=yourFasta).\n";
@@ -65,6 +66,7 @@ string SortSeqsCommand::getOutputFileNameTag(string type, string inputName=""){
              if (type == "fasta")            {   outputFileName =  "sorted" + m->getExtension(inputName);   }
              else if (type == "taxonomy")    {   outputFileName =  "sorted" + m->getExtension(inputName);   }
              else if (type == "name")        {   outputFileName =  "sorted" + m->getExtension(inputName);   }
+            else if (type == "count")       {   outputFileName =  "sorted" + m->getExtension(inputName);   }
              else if (type == "group")       {   outputFileName =  "sorted" + m->getExtension(inputName);   }
              else if (type == "flow")        {   outputFileName =  "sorted" + m->getExtension(inputName);   }
              else if (type == "qfile")       {   outputFileName =  "sorted" + m->getExtension(inputName);   }
@@ -87,6 +89,7 @@ SortSeqsCommand::SortSeqsCommand(){
                 outputTypes["fasta"] = tempOutNames;
                 outputTypes["taxonomy"] = tempOutNames;
                 outputTypes["name"] = tempOutNames;
+        outputTypes["count"] = tempOutNames;
                 outputTypes["group"] = tempOutNames;
                 outputTypes["qfile"] = tempOutNames;
          outputTypes["flow"] = tempOutNames;
@@ -127,6 +130,7 @@ SortSeqsCommand::SortSeqsCommand(string option)  {
                         outputTypes["group"] = tempOutNames;
                         outputTypes["qfile"] = tempOutNames;
              outputTypes["flow"] = tempOutNames;
+            outputTypes["count"] = tempOutNames;
                         
                         //if the user changes the output directory command factory will send this info to us in the output parameter 
                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
@@ -191,6 +195,14 @@ SortSeqsCommand::SortSeqsCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
              
                         
@@ -229,16 +241,31 @@ SortSeqsCommand::SortSeqsCommand(string option)  {
                         if (qualfile == "not open") { abort = true; }
                         else if (qualfile == "not found") {  qualfile = "";  }                  
                         else { m->setQualFile(qualfile); }
+            
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not open") { countfile = ""; abort = true; }
+                       else if (countfile == "not found") { countfile = "";  } 
+                       else { m->setCountTableFile(countfile); }
+            
+            if ((namefile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
+            }
+                       
+            if ((groupfile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
+            }
                         
              string temp = validParameter.validFile(parameters, "large", false);                if (temp == "not found") { temp = "f"; }
                         large = m->isTrue(temp);
              
-                       if ((fastafile == "") && (namefile == "") && (groupfile == "") && (taxfile == "") && (flowfile == "") && (qualfile == ""))  { m->mothurOut("You must provide at least one of the following: fasta, name, group, taxonomy, flow or quality."); m->mothurOutEndLine(); abort = true; }
+                       if ((fastafile == "") && (namefile == "") && (countfile == "") && (groupfile == "") && (taxfile == "") && (flowfile == "") && (qualfile == ""))  { m->mothurOut("You must provide at least one of the following: fasta, name, group, count, taxonomy, flow or quality."); m->mothurOutEndLine(); abort = true; }
                         
-                       if ((fastafile != "") && (namefile == "")) {
-                               vector<string> files; files.push_back(fastafile);
-                               parser.getNameFile(files);
-                       }
+            if (countfile == "") {
+                if ((fastafile != "") && (namefile == "")) {
+                    vector<string> files; files.push_back(fastafile);
+                    parser.getNameFile(files);
+                }
+            }
                 }
          
         }
@@ -267,6 +294,7 @@ int SortSeqsCommand::execute(){
          if (qualfile != "")                    {               readQual();             }
          if (namefile != "")                    {               readName();             }
                 if (groupfile != "")            {               readGroup();    }
+        if (countfile != "")           {               readCount();    }
          if (taxfile != "")                     {               readTax();              }
                 
                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); } return 0; }
@@ -308,7 +336,12 @@ int SortSeqsCommand::execute(){
              itTypes = outputTypes.find("flow");
                         if (itTypes != outputTypes.end()) {
                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFlowFile(current); }
-                       }       
+                       }
+            
+            itTypes = outputTypes.find("count");
+                       if (itTypes != outputTypes.end()) {
+                               if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
+                       }
                 }
                 
                 return 0;               
@@ -927,7 +960,88 @@ int SortSeqsCommand::readName(){
                 exit(1);
         }
  }
-
+//**********************************************************************************************************************
+/**
+ * Writes a copy of the count file whose rows follow the command's sequence order.
+ *
+ * The header line is copied through unchanged. What happens to the remaining rows depends on
+ * whether an order has already been established in `names`:
+ *   - `names` is empty: the count file defines the order. Rows are copied as they are read and
+ *     each sequence name is stored in `names` with its 0-based row index.
+ *   - `names` is not empty: each row is written at the index stored in `names` for its sequence.
+ *     Sequences missing from `names` are appended to the end, added to `names`, and reported.
+ *
+ * The output file is registered in outputTypes["count"] and outputNames.
+ *
+ * Returns 0. If m->control_pressed is set while reading, the partially written output file is
+ * removed and 0 is returned immediately.
+ */
+int SortSeqsCommand::readCount(){
+    try {
+        string thisOutputDir = outputDir;
+        if (outputDir == "") {  thisOutputDir += m->hasPath(countfile);  }
+        string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile);
+        outputTypes["count"].push_back(outputFileName);  outputNames.push_back(outputFileName);
+
+        ofstream out;
+        m->openOutputFile(outputFileName, out);
+
+        ifstream in;
+        m->openInputFile(countfile, in);
+        string firstCol, rest;
+
+        //the first line holds the column headers and always stays on top of the output
+        string headers = m->getline(in); m->gobble(in);
+        out << headers << endl;
+
+        if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
+
+            //one slot per known sequence; slots left empty belong to sequences absent from the count file
+            vector<string> seqs; seqs.resize(names.size(), "");
+
+            while(!in.eof()){
+                if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
+
+                in >> firstCol;           m->gobble(in);
+                rest = m->getline(in);    m->gobble(in);
+
+                if (firstCol != "") {
+                    map<string, int>::iterator it = names.find(firstCol);
+                    if (it != names.end()) { //we found it, so put it in the vector in the right place.
+                        seqs[it->second] = firstCol + '\t' + rest;
+                    }else { //if we cant find it then add it to the end
+                        names[firstCol] = seqs.size();
+                        seqs.push_back((firstCol + '\t' + rest));
+                        m->mothurOut(firstCol + " was not contained in the file which determined the order, adding it to the end.\n");
+                    }
+                }
+            }
+            in.close();
+
+            int count = 0;
+            for (int i = 0; i < seqs.size(); i++) {
+                if (seqs[i] != "") { out << seqs[i] << endl; count++; }
+            }
+            out.close();
+
+            m->mothurOut("Ordered " + toString(count) + " sequences from " + countfile + ".\n");
+
+        }else { //read in file to fill names
+            int count = 0;
+
+            while(!in.eof()){
+                if (m->control_pressed) { in.close();  out.close();  m->mothurRemove(outputFileName);  return 0; }
+
+                in >> firstCol;           m->gobble(in);
+                rest = m->getline(in);    m->gobble(in);
+
+                if (firstCol != "") {
+                    //remember the position of this sequence so later files can be sorted to match
+                    names[firstCol] = count;
+                    count++;
+                    out << firstCol << '\t' << rest << endl;
+                }
+            }
+            in.close();
+            out.close();
+
+            m->mothurOut("\nUsing " + countfile + " to determine the order. It contains " + toString(count) + " representative sequences.\n");
+        }
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SortSeqsCommand", "readCount");
+        exit(1);
+    }
+}
  //**********************************************************************************************************************
  int SortSeqsCommand::readGroup(){
         try {
diff --git a/sortseqscommand.h b/sortseqscommand.h

index 6d9c5ed8f2c6183ffdce14f52bff60da3f2316a8..4ba8e4294d4b7d01bc8de7ba57cef84817f718ce 100644 (file)
--- a/sortseqscommand.h
+++ b/sortseqscommand.h
@@ -36,7 +36,7 @@ public:
         
  private:
      map<string, int> names;
-    string accnosfile, fastafile, namefile, groupfile, taxfile, qualfile, flowfile, outputDir;
+    string accnosfile, fastafile, namefile, groupfile, countfile, taxfile, qualfile, flowfile, outputDir;
      bool abort, large;
      vector<string> outputNames;
      
@@ -45,6 +45,7 @@ private:
      int readName();
      int readGroup();
      int readTax();
+    int readCount();
      int readQual();
      
  };
diff --git a/sparsedistancematrix.cpp b/sparsedistancematrix.cpp

index 7d505239d88b4d817b3b6c5fd3609c6c18e785b4..b315c487da1e39053307870560ca165d1068c49a 100644 (file)
--- a/sparsedistancematrix.cpp
+++ b/sparsedistancematrix.cpp
@@ -126,7 +126,7 @@ ull SparseDistanceMatrix::getSmallestCell(ull& row){
                 return col;
         }
         catch(exception& e) {
-               m->errorOut(e, "SparseMatrix", "getSmallestCell");
+               m->errorOut(e, "SparseDistanceMatrix", "getSmallestCell");
                 exit(1);
         }
  }
@@ -141,7 +141,7 @@ int SparseDistanceMatrix::sortSeqVec(){
          return 0;
      }
         catch(exception& e) {
-               m->errorOut(e, "SparseMatrix", "getSmallestCell");
+               m->errorOut(e, "SparseDistanceMatrix", "sortSeqVec");
                 exit(1);
         }
  }
diff --git a/splitabundcommand.cpp b/splitabundcommand.cpp

index bc1cdb3ecfaa8ca528b88c2534e40c61263aacef..48fada8eb7fc89d858f24c4d54dcc0a64d8a8225 100644 (file)
--- a/splitabundcommand.cpp
+++ b/splitabundcommand.cpp
@@ -8,13 +8,15 @@
   */
  
  #include "splitabundcommand.h"
+#include "sharedutilities.h"
  
  //**********************************************************************************************************************
  vector<string> SplitAbundCommand::setParameters(){     
         try {           
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pname);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
+        CommandParameter pname("name", "InputTypes", "", "", "NameCount", "FNGLT", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
                 CommandParameter plist("list", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(plist);
                 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
                 CommandParameter pcutoff("cutoff", "Number", "", "0", "", "", "",false,true); parameters.push_back(pcutoff);
@@ -37,8 +39,8 @@ string SplitAbundCommand::getHelpString(){
         try {
                 string helpString = "";
                 helpString += "The split.abund command reads a fasta file and a list or a names file splits the sequences into rare and abundant groups. \n";
-               helpString += "The split.abund command parameters are fasta, list, name, cutoff, group, label, groups, cutoff and accnos.\n";
-               helpString += "The fasta and a list or name parameter are required, and you must provide a cutoff value.\n";
+               helpString += "The split.abund command parameters are fasta, list, name, count, cutoff, group, label, groups, cutoff and accnos.\n";
+               helpString += "The fasta and a list or name or count parameter are required, and you must provide a cutoff value.\n";
                 helpString += "The cutoff parameter is used to qualify what is abundant and rare.\n";
                 helpString += "The group parameter allows you to parse a group file into rare and abundant groups.\n";
                 helpString += "The label parameter is used to read specific labels in your listfile you want to use.\n";
@@ -69,6 +71,7 @@ string SplitAbundCommand::getOutputFileNameTag(string type, string inputName="")
              if (type == "fasta")            {   outputFileName =  "fasta";   }
              else if (type == "list")    {   outputFileName =  "list";   }
              else if (type == "name")        {   outputFileName =  "names";   }
+            else if (type == "count")        {   outputFileName =  "count_table";   }
              else if (type == "group")       {   outputFileName =  "groups";   }
              else if (type == "accnos")        {   outputFileName =  "accnos";   }
              else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
@@ -88,6 +91,7 @@ SplitAbundCommand::SplitAbundCommand(){
                 vector<string> tempOutNames;
                 outputTypes["list"] = tempOutNames;
                 outputTypes["name"] = tempOutNames;
+        outputTypes["count"] = tempOutNames;
                 outputTypes["accnos"] = tempOutNames;
                 outputTypes["group"] = tempOutNames;
                 outputTypes["fasta"] = tempOutNames;
@@ -126,7 +130,8 @@ SplitAbundCommand::SplitAbundCommand(string option)  {
                         outputTypes["name"] = tempOutNames;
                         outputTypes["accnos"] = tempOutNames;
                         outputTypes["group"] = tempOutNames;
-                       outputTypes["fasta"] = tempOutNames;                    
+                       outputTypes["fasta"] = tempOutNames;    
+            outputTypes["count"] = tempOutNames;
                                                                                                 
                         //if the user changes the input directory command factory will send this info to us in the output parameter 
                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
@@ -165,6 +170,13 @@ SplitAbundCommand::SplitAbundCommand(string option)  {
                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
                                 }
  
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
  
                         
@@ -194,35 +206,52 @@ SplitAbundCommand::SplitAbundCommand(string option)  {
                         if (groupfile == "not open") {  groupfile = ""; abort = true; } 
                         else if (groupfile == "not found") { groupfile = ""; }
                         else {  
-                               groupMap = new GroupMap(groupfile);
-                               
-                               int error = groupMap->readMap();
+                               int error = groupMap.readMap(groupfile);
                                 if (error == 1) { abort = true; }
                                 m->setGroupFile(groupfile);
                         }
                         
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not open") { countfile = ""; abort = true; }
+                       else if (countfile == "not found") { countfile = "";  } 
+                       else {
+                m->setCountTableFile(countfile); 
+                ct.readTable(countfile);
+            }
+            
+            if ((namefile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
+            }
+                       
+            if ((groupfile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
+            }
+            
                         groups = validParameter.validFile(parameters, "groups", false);         
                         if (groups == "not found") { groups = ""; }
-                       else if (groups == "all") { 
-                               if (groupfile != "") {  Groups = groupMap->getNamesOfGroups();  } 
-                               else {  m->mothurOut("You cannot select groups without a valid groupfile, I will disregard your groups selection. "); m->mothurOutEndLine(); groups = "";   }
-                       }else { 
-                               m->splitAtDash(groups, Groups);
-                       }
+                       else { m->splitAtDash(groups, Groups); }
                         
-                       if ((groupfile == "") && (groups != "")) {  m->mothurOut("You cannot select groups without a valid groupfile, I will disregard your groups selection. "); m->mothurOutEndLine(); groups = "";  Groups.clear(); }
+                       if (((groupfile == "") && (countfile == ""))&& (groups != "")) {  m->mothurOut("You cannot select groups without a valid group or count file, I will disregard your groups selection. "); m->mothurOutEndLine(); groups = "";  Groups.clear(); }
                         
+            if (countfile != "") {
+                if (!ct.hasGroupInfo()) { m->mothurOut("You cannot pick groups without group info in your count file; I will disregard your groups selection."); m->mothurOutEndLine(); groups = "";  Groups.clear(); }
+            }
+            
                         //do you have all files needed
-                       if ((listfile == "") && (namefile == "")) { 
+                       if ((listfile == "") && (namefile == "") && (countfile == "")) { 
                                 namefile = m->getNameFile(); 
                                 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
                                 else {                          
                                         listfile = m->getListFile(); 
                                         if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
-                                       else {  m->mothurOut("You have no current list or namefile and the list or name parameter is required."); m->mothurOutEndLine(); abort = true; }
+                                       else {  
+                        countfile  = m->getCountTableFile(); 
+                        if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
+                        else {         m->mothurOut("You have no current list, count or namefile and one is required."); m->mothurOutEndLine(); abort = true; }
+                    }
                                 }
                         }
-                       
+            
                         //check for optional parameter and set defaults
                         // ...at some point should added some additional type checking...
                         label = validParameter.validFile(parameters, "label", false);                   
@@ -248,14 +277,20 @@ SplitAbundCommand::SplitAbundCommand(string option)  {
         }
  }
  //**********************************************************************************************************************
-SplitAbundCommand::~SplitAbundCommand(){ 
-       if (groupfile != "") {  delete groupMap;  } 
-}
+SplitAbundCommand::~SplitAbundCommand(){} // nothing to release: groupMap is no longer allocated with new
  //**********************************************************************************************************************
  int SplitAbundCommand::execute(){
         try {
         
                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
+        
+        if (Groups.size() != 0) {
+            vector<string> allGroups;
+            if (countfile != "") { allGroups = ct.getNamesOfGroups(); }
+            else { allGroups = groupMap.getNamesOfGroups(); }
+            SharedUtil util;
+            util.setGroups(Groups, allGroups);
+        }
                 
                 if (listfile != "") { //you are using a listfile to determine abundance
                         if (outputDir == "") { outputDir = m->hasPath(listfile); }
@@ -264,19 +299,19 @@ int SplitAbundCommand::execute(){
                         set<string> processedLabels;
                         set<string> userLabels = labels;        
                         
-                       input = new InputData(listfile, "list");
-                       list = input->getListVector();
+                       InputData input(listfile, "list");
+                       ListVector* list = input.getListVector();
                         string lastLabel = list->getLabel();
                         
                         //do you have a namefile or do we need to similate one?
                         if (namefile != "") {  readNamesFile();         }
                         else                            { createNameMap(list);  }
                         
-                       if (m->control_pressed) { delete input; delete list; for (int i = 0; i < outputNames.size(); i++) {     m->mothurRemove(outputNames[i]); } return 0; }
+                       if (m->control_pressed) { delete list; for (int i = 0; i < outputNames.size(); i++) {   m->mothurRemove(outputNames[i]); } return 0; }
                         
                         while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
                         
-                               if (m->control_pressed) { delete input; delete list; for (int i = 0; i < outputNames.size(); i++) {     m->mothurRemove(outputNames[i]); } return 0; }
+                               if (m->control_pressed) {  delete list; for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } return 0; }
                                 
                                 if(allLines == 1 || labels.count(list->getLabel()) == 1){
                                                 
@@ -291,7 +326,7 @@ int SplitAbundCommand::execute(){
                                                 string saveLabel = list->getLabel();
                                                 
                                                 delete list;
-                                               list = input->getListVector(lastLabel); //get new list vector to process
+                                               list = input.getListVector(lastLabel); //get new list vector to process
                                                 
                                                 m->mothurOut(list->getLabel()); m->mothurOutEndLine();
                                                 splitList(list);
@@ -307,10 +342,10 @@ int SplitAbundCommand::execute(){
                                 lastLabel = list->getLabel();
                                         
                                 delete list;
-                               list = input->getListVector(); //get new list vector to process
+                               list = input.getListVector(); //get new list vector to process
                         }
                         
-                       if (m->control_pressed) { delete input;  for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+                       if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {       m->mothurRemove(outputNames[i]); } return 0; }
                         
                         //output error messages about any remaining user labels
                         set<string>::iterator it;
@@ -326,12 +361,12 @@ int SplitAbundCommand::execute(){
  
                         }
                         
-                       if (m->control_pressed) { delete input;  for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+                       if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {       m->mothurRemove(outputNames[i]); } return 0; }
                         
                         //run last label if you need to
                         if (needToRun == true)  {
                                 if (list != NULL) {     delete list;    }
-                               list = input->getListVector(lastLabel); //get new list vector to process
+                               list = input.getListVector(lastLabel); //get new list vector to process
                                 
                                 m->mothurOut(list->getLabel()); m->mothurOutEndLine();
                                 splitList(list);                
@@ -339,11 +374,9 @@ int SplitAbundCommand::execute(){
                                 delete list;
                         }
                         
-                       delete input;
-                       
                         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); }      return 0;       }
                                                                         
-               }else { //you are using the namefile to determine abundance
+               }else if (namefile != "") { //you are using the namefile to determine abundance
                         if (outputDir == "") { outputDir = m->hasPath(namefile); }
                         
                         splitNames(); 
@@ -353,7 +386,14 @@ int SplitAbundCommand::execute(){
                         if (groupfile != "")                            {  parseGroup(tag);             }
                         if (accnos)                                                     {  writeAccnos(tag);    }
                         if (fastafile != "")                            {  parseFasta(tag);             }
-               }
+               }else {
+            //split by countfile
+            string tag = "";
+            splitCount();
+            
+                       if (accnos)                                                     {  writeAccnos(tag);    }
+                       if (fastafile != "")                            {  parseFasta(tag);             }
+        }
                 
                 //set fasta file as new current fastafile
                 string current = "";
@@ -381,6 +421,11 @@ int SplitAbundCommand::execute(){
                 if (itTypes != outputTypes.end()) {
                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); }
                 }
+        
+        itTypes = outputTypes.find("count");
+               if (itTypes != outputTypes.end()) {
+                       if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
+               }
                 
                 m->mothurOutEndLine();
                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
@@ -401,6 +446,7 @@ int SplitAbundCommand::splitList(ListVector* thisList) {
                 abundNames.clear();
                 
                 //get rareNames and abundNames
+        int numRareBins = 0;
                 for (int i = 0; i < thisList->getNumBins(); i++) {
                         if (m->control_pressed) { return 0; }
                         
@@ -409,8 +455,15 @@ int SplitAbundCommand::splitList(ListVector* thisList) {
                         vector<string> names;
                         m->splitAtComma(bin, names);  //parses bin into individual sequence names
                         int size = names.size();
-                               
+            
+            //if countfile is not blank we assume the list file is unique, otherwise we assume it includes all seqs
+            if (countfile != "") {
+                size = 0;
+                for (int j = 0; j < names.size(); j++) {  size += ct.getNumSeqs(names[j]); }
+            }
+            
                         if (size <= cutoff) {
+                numRareBins++;
                                 for (int j = 0; j < names.size(); j++) {  rareNames.insert(names[j]);  }
                         }else{
                                 for (int j = 0; j < names.size(); j++) {  abundNames.insert(names[j]);  }
@@ -419,13 +472,14 @@ int SplitAbundCommand::splitList(ListVector* thisList) {
  
                 
                 string tag = thisList->getLabel() + ".";
-               
-               writeList(thisList, tag);
-               
+       
+               writeList(thisList, tag, numRareBins);
+    
                 if (groupfile != "")                            {  parseGroup(tag);             }
                 if (accnos)                                                     {  writeAccnos(tag);    }
                 if (fastafile != "")                            {  parseFasta(tag);             }
-               
+        if (countfile != "")                           {  parseCount(tag);             }
+        
                 return 0;
  
         }
@@ -435,24 +489,13 @@ int SplitAbundCommand::splitList(ListVector* thisList) {
         }
  }
  /**********************************************************************************************************************/
-int SplitAbundCommand::writeList(ListVector* thisList, string tag) { 
+int SplitAbundCommand::writeList(ListVector* thisList, string tag, int numRareBins) { 
         try {
                 
                 map<string, ofstream*> filehandles;
                 
                 if (Groups.size() == 0) {
-                       SAbundVector* sabund = new SAbundVector();
-                       *sabund = thisList->getSAbundVector();
-               
-                       //find out how many bins are rare and how many are abundant so you can process the list vector one bin at a time
-                       // and don't have to store the bins until you are done with the whole vector, this save alot of space.
-                       int numRareBins = 0;
-                       for (int i = 0; i <= sabund->getMaxRank(); i++) {
-                               if (i > cutoff) { break; }
-                               numRareBins += sabund->get(i);
-                       }
                         int numAbundBins = thisList->getNumBins() - numRareBins;
-                       delete sabund;
  
                         ofstream aout;
                         ofstream rout;
@@ -471,9 +514,15 @@ int SplitAbundCommand::writeList(ListVector* thisList, string tag) {
                         for (int i = 0; i < thisList->getNumBins(); i++) {
                                 if (m->control_pressed) { break; }
                         
-                               string bin = list->get(i); 
-                       
-                               int size = m->getNumNames(bin);
+                               string bin = thisList->get(i); 
+                vector<string> names;
+                m->splitAtComma(bin, names);
+                
+                               int size = names.size();
+                if (countfile != "") {
+                    size = 0;
+                    for (int j = 0; j < names.size(); j++) {  size += ct.getNumSeqs(names[j]); }
+                }
                         
                                 if (size <= cutoff) {  rout << bin << '\t';  }
                                 else                            {  aout << bin << '\t'; }
@@ -499,8 +548,8 @@ int SplitAbundCommand::writeList(ListVector* thisList, string tag) {
                                 temp2 = new ofstream;
                                 filehandles[Groups[i]+".abund"] = temp2;
                                 
-                string rareGroupFileName = fileroot + Groups[i] + tag + ".rare." + getOutputFileNameTag("list");
-                string abundGroupFileName = fileroot + Groups[i] + tag + ".abund." + getOutputFileNameTag("list");
+                string rareGroupFileName = fileroot + Groups[i] +"."+ tag + "rare." + getOutputFileNameTag("list");
+                string abundGroupFileName = fileroot + Groups[i] +"."+ tag + "abund." + getOutputFileNameTag("list");
                                 m->openOutputFile(rareGroupFileName, *(filehandles[Groups[i]+".rare"]));
                                 m->openOutputFile(abundGroupFileName, *(filehandles[Groups[i]+".abund"]));
                                 outputNames.push_back(rareGroupFileName); outputTypes["list"].push_back(rareGroupFileName);
@@ -520,7 +569,7 @@ int SplitAbundCommand::writeList(ListVector* thisList, string tag) {
                                 if (m->control_pressed) { break; }
                         
                                 map<string, string> groupBins;
-                               string bin = list->get(i); 
+                               string bin = thisList->get(i); 
                         
                                 vector<string> names;
                                 m->splitAtComma(bin, names);  //parses bin into individual sequence names
@@ -534,19 +583,34 @@ int SplitAbundCommand::writeList(ListVector* thisList, string tag) {
                                                 rareAbund = ".abund";
                                         }
                                         
-                                       string group = groupMap->getGroup(names[j]);
-                               
-                                       if (m->inUsersGroups(group, Groups)) { //only add if this is in a group we want
-                                               itGroup = groupBins.find(group+rareAbund);
-                                               if(itGroup == groupBins.end()) {
-                                                       groupBins[group+rareAbund] = names[j];  //add first name
-                                                       groupNumBins[group+rareAbund]++;
-                                               }else{ //add another name
-                                                       groupBins[group+rareAbund] +=  "," + names[j];
-                                               }
-                                       }else if(group == "not found") {
-                                               m->mothurOut(names[j] + " is not in your groupfile. Ignoring."); m->mothurOutEndLine();
-                                       }
+                    if (countfile == "") {
+                        string group = groupMap.getGroup(names[j]);
+                        
+                        if (m->inUsersGroups(group, Groups)) { //only add if this is in a group we want
+                            itGroup = groupBins.find(group+rareAbund);
+                            if(itGroup == groupBins.end()) {
+                                groupBins[group+rareAbund] = names[j];  //add first name
+                                groupNumBins[group+rareAbund]++;
+                            }else{ //add another name
+                                groupBins[group+rareAbund] +=  "," + names[j];
+                            }
+                        }else if(group == "not found") {
+                            m->mothurOut(names[j] + " is not in your groupfile. Ignoring."); m->mothurOutEndLine();
+                        }
+                    }else {
+                        vector<string> thisSeqsGroups = ct.getGroups(names[j]);
+                        for (int k = 0; k < thisSeqsGroups.size(); k++) {
+                            if (m->inUsersGroups(thisSeqsGroups[k], Groups)) { //only add if this is in a group we want
+                                itGroup = groupBins.find(thisSeqsGroups[k]+rareAbund);
+                                if(itGroup == groupBins.end()) {
+                                    groupBins[thisSeqsGroups[k]+rareAbund] = names[j];  //add first name
+                                    groupNumBins[thisSeqsGroups[k]+rareAbund]++;
+                                }else{ //add another name
+                                    groupBins[thisSeqsGroups[k]+rareAbund] +=  "," + names[j];
+                                }
+                            }
+                        }
+                    }
                                 }
                         
                         
@@ -572,6 +636,37 @@ int SplitAbundCommand::writeList(ListVector* thisList, string tag) {
         }
  }
  /**********************************************************************************************************************/
+int SplitAbundCommand::splitCount() { //countfile: sorts every sequence name in ct into rareNames or abundNames by comparing its count to cutoff, then writes the split count files; returns 0
+       try {
+               rareNames.clear();
+               abundNames.clear();     
+        
+               vector<string> allNames = ct.getNamesOfSeqs();
+        for (int i = 0; i < allNames.size(); i++) {
+            
+            if (m->control_pressed) { return 0; } //stop early when control_pressed is set; nothing is written and 0 is still returned
+            
+            int size = ct.getNumSeqs(allNames[i]); //presumably the total abundance of this sequence in the count table - confirm against CountTable
+            nameMap[allNames[i]] = allNames[i]; //each name maps to itself in nameMap
+            
+                       if (size <= cutoff) { //a count at or below cutoff is rare, anything larger is abundant
+                               rareNames.insert(allNames[i]); 
+                       }else{
+                               abundNames.insert(allNames[i]); 
+                       }
+               }
+        
+        //write out split count files
+        parseCount(""); //called with an empty tag
+               
+               return 0;  
+       }
+       catch(exception& e) { //report the exception through errorOut, then terminate the process
+               m->errorOut(e, "SplitAbundCommand", "splitCount");
+               exit(1);
+       }
+}
+/**********************************************************************************************************************/
  int SplitAbundCommand::splitNames() { //namefile
         try {
                 
@@ -658,6 +753,115 @@ int SplitAbundCommand::createNameMap(ListVector* thisList) {
         }
  }
  /**********************************************************************************************************************/
+int SplitAbundCommand::parseCount(string tag) { //countfile: prints rare and abund count tables built from ct, rareNames and abundNames - one pair overall when Groups is empty, otherwise one pair per entry of Groups; returns 0
+       try {
+               
+               map<string, ofstream*> filehandles; //NOTE(review): not used anywhere in this function
+        
+               if (Groups.size() == 0) { //no groups selected: build a single rare table and a single abund table
+                       string rare = outputDir + m->getRootName(m->getSimpleName(countfile))  + tag + "rare." + getOutputFileNameTag("count");
+            outputNames.push_back(rare); outputTypes["count"].push_back(rare); //NOTE(review): recorded as output even when rareNames is empty and printTable(rare) is never reached
+                       
+                       string abund = outputDir + m->getRootName(m->getSimpleName(countfile))  + tag + "abund." + getOutputFileNameTag("count");
+                       outputNames.push_back(abund); outputTypes["count"].push_back(abund); //NOTE(review): likewise recorded even when abundNames is empty
+                       
+            CountTable rareTable;
+            CountTable abundTable;
+            if (ct.hasGroupInfo()) {  //copy every group name from ct into both output tables
+                vector<string> ctGroups = ct.getNamesOfGroups();
+                for (int i = 0; i < ctGroups.size(); i++) {  rareTable.addGroup(ctGroups[i]);  abundTable.addGroup(ctGroups[i]); }
+            }
+            
+                       if (rareNames.size() != 0) {
+                               for (set<string>::iterator itRare = rareNames.begin(); itRare != rareNames.end(); itRare++) {
+                    if (ct.hasGroupInfo()) { //with group info, add the sequence with its getGroupCounts vector
+                        vector<int> groupCounts = ct.getGroupCounts(*itRare);
+                        rareTable.push_back(*itRare, groupCounts);
+                    }else { //without group info, add the sequence with its single getNumSeqs value
+                        int groupCounts = ct.getNumSeqs(*itRare);
+                        rareTable.push_back(*itRare, groupCounts);
+                    }
+                               }
+                if (rareTable.hasGroupInfo()) { //remove groups whose getGroupCount is 0 in the rare table before printing
+                    vector<string> ctGroups = rareTable.getNamesOfGroups();
+                    for (int i = 0; i < ctGroups.size(); i++) { 
+                        if (rareTable.getGroupCount(ctGroups[i]) == 0) { rareTable.removeGroup(ctGroups[i]); }
+                    }
+                }
+                rareTable.printTable(rare);
+                       }
+                       
+            
+                       if (abundNames.size() != 0) { //same steps as the rare branch above, applied to abundNames and abundTable
+                               for (set<string>::iterator itAbund = abundNames.begin(); itAbund != abundNames.end(); itAbund++) {
+                                       if (ct.hasGroupInfo()) {
+                        vector<int> groupCounts = ct.getGroupCounts(*itAbund);
+                        abundTable.push_back(*itAbund, groupCounts);
+                    }else {
+                        int groupCounts = ct.getNumSeqs(*itAbund);
+                        abundTable.push_back(*itAbund, groupCounts);
+                    }
+                               }
+                if (abundTable.hasGroupInfo()) {
+                    vector<string> ctGroups = abundTable.getNamesOfGroups();
+                    for (int i = 0; i < ctGroups.size(); i++) { 
+                        if (abundTable.getGroupCount(ctGroups[i]) == 0) { abundTable.removeGroup(ctGroups[i]); }
+                    }
+                }
+                abundTable.printTable(abund);
+                       }
+                       
+               }else{ //parse names by abundance and group
+                       map<string, CountTable*> countTableMap; //keyed by group name plus ".rare" or ".abund"; the tables are heap-allocated and deleted in the print loop below
+                       map<string, CountTable*>::iterator it3;
+            
+                       for (int i=0; i<Groups.size(); i++) { //create one rare and one abund table per entry of Groups, each holding only that group
+                               CountTable* rareCt = new CountTable();
+                rareCt->addGroup(Groups[i]);
+                               countTableMap[Groups[i]+".rare"] = rareCt;
+                               CountTable* abundCt = new CountTable();
+                abundCt->addGroup(Groups[i]);
+                               countTableMap[Groups[i]+".abund"] = abundCt;
+                       }
+                       
+            vector<string> allNames = ct.getNamesOfSeqs();
+                       for (int i = 0; i < allNames.size(); i++) {                             
+                               string rareAbund;
+                               if (rareNames.count(allNames[i]) != 0) { //you are a rare name
+                    rareAbund = ".rare";
+                               }else{ //you are an abund name
+                    rareAbund = ".abund";
+                               }
+                               
+                vector<string> thisSeqsGroups = ct.getGroups(allNames[i]);
+                for (int j = 0; j < thisSeqsGroups.size(); j++) {
+                    if (m->inUsersGroups(thisSeqsGroups[j], Groups)) { //only add if this is in a group we want
+                        int num = ct.getGroupCount(allNames[i], thisSeqsGroups[j]);
+                        vector<int> nums; nums.push_back(num); //one-element vector, because each per-group table was given exactly one group above
+                        countTableMap[thisSeqsGroups[j]+rareAbund]->push_back(allNames[i], nums); 
+                    }
+                }
+                       }
+                       
+                       
+                       for (it3 = countTableMap.begin(); it3 != countTableMap.end(); it3++) { 
+                string fileroot =  outputDir + m->getRootName(m->getSimpleName(countfile));
+                string filename = fileroot + it3->first + "." + getOutputFileNameTag("count"); //NOTE(review): tag is not part of these per-group file names, unlike the Groups-empty branch
+                outputNames.push_back(filename);  outputTypes["count"].push_back(filename);
+                (it3->second)->printTable(filename);
+                               delete it3->second; //free the table allocated with new above
+                       }
+               }
+        
+               return 0;
+        
+       }
+       catch(exception& e) { //report the exception through errorOut, then terminate the process
+               m->errorOut(e, "SplitAbundCommand", "parseCount");
+               exit(1);
+       }
+}
+/**********************************************************************************************************************/
  int SplitAbundCommand::writeNames() { //namefile
         try {
                 
@@ -723,7 +927,7 @@ int SplitAbundCommand::writeNames() { //namefile
                                 map<string, string>::iterator itout;
                                 for (int i = 0; i < names.size(); i++) {
                                         
-                                       string group = groupMap->getGroup(names[i]);
+                                       string group = groupMap.getGroup(names[i]);
                                         
                                         if (m->inUsersGroups(group, Groups)) { //only add if this is in a group we want
                                                 itout = outputStrings.find(group+rareAbund);
@@ -803,7 +1007,7 @@ int SplitAbundCommand::writeAccnos(string tag) {
                         
                         //write rare
                         for (set<string>::iterator itRare = rareNames.begin(); itRare != rareNames.end(); itRare++) {
-                                       string group = groupMap->getGroup(*itRare);
+                                       string group = groupMap.getGroup(*itRare);
                                         
                                         if (m->inUsersGroups(group, Groups)) { //only add if this is in a group we want
                                                 *(filehandles[group+".rare"]) << *itRare << endl;
@@ -812,7 +1016,7 @@ int SplitAbundCommand::writeAccnos(string tag) {
                                 
                         //write abund   
                         for (set<string>::iterator itAbund = abundNames.begin(); itAbund != abundNames.end(); itAbund++) {
-                                       string group = groupMap->getGroup(*itAbund);
+                                       string group = groupMap.getGroup(*itAbund);
                                         
                                         if (m->inUsersGroups(group, Groups)) { //only add if this is in a group we want
                                                 *(filehandles[group+".abund"]) << *itAbund << endl;
@@ -860,7 +1064,7 @@ int SplitAbundCommand::parseGroup(string tag) { //namefile
                                 
                                 for (int i = 0; i < names.size(); i++) {
                                 
-                                       string group = groupMap->getGroup(names[i]);
+                                       string group = groupMap.getGroup(names[i]);
                                 
                                         if (group == "not found") { 
                                                 m->mothurOut(names[i] + " is not in your groupfile, ignoring, please correct."); m->mothurOutEndLine();
@@ -907,7 +1111,7 @@ int SplitAbundCommand::parseGroup(string tag) { //namefile
                                 
                                 for (int i = 0; i < names.size(); i++) {
                                 
-                                       string group = groupMap->getGroup(names[i]);
+                                       string group = groupMap.getGroup(names[i]);
                                                                         
                                         if (m->inUsersGroups(group, Groups)) { //only add if this is in a group we want
                                                 *(filehandles[group+rareAbund]) << names[i] << '\t' << group << endl;
@@ -964,7 +1168,7 @@ int SplitAbundCommand::parseFasta(string tag) { //namefile
                                         itNames = nameMap.find(seq.getName());
                                         
                                         if (itNames == nameMap.end()) {
-                                               m->mothurOut(seq.getName() + " is not in your namesfile, ignoring."); m->mothurOutEndLine();
+                                               m->mothurOut(seq.getName() + " is not in your names or list file, ignoring."); m->mothurOutEndLine();
                                         }else{
                                                 if (rareNames.count(seq.getName()) != 0) { //you are a rare name
                                                         seq.printSequence(rout);
@@ -1008,7 +1212,7 @@ int SplitAbundCommand::parseFasta(string tag) { //namefile
                                         map<string, string>::iterator itNames = nameMap.find(seq.getName());
                                         
                                         if (itNames == nameMap.end()) {
-                                               m->mothurOut(seq.getName() + " is not in your namesfile, ignoring."); m->mothurOutEndLine();
+                                               m->mothurOut(seq.getName() + " is not in your names or list file, ignoring."); m->mothurOutEndLine();
                                         }else{
                                                 vector<string> names;
                                                 m->splitAtComma(itNames->second, names);  //parses bin into individual sequence names
@@ -1019,17 +1223,25 @@ int SplitAbundCommand::parseFasta(string tag) { //namefile
                                                 }else{ //you are a abund name
                                                         rareAbund = ".abund";
                                                 }
-                               
-                                               for (int i = 0; i < names.size(); i++) {
-                               
-                                                       string group = groupMap->getGroup(seq.getName());
-                                       
-                                                       if (m->inUsersGroups(group, Groups)) { //only add if this is in a group we want
-                                                               seq.printSequence(*(filehandles[group+rareAbund]));
-                                                       }else if(group == "not found") {
-                                                               m->mothurOut(seq.getName() + " is not in your groupfile. Ignoring."); m->mothurOutEndLine();
-                                                       }
-                                               }
+                        
+                        if (countfile == "") {
+                            for (int i = 0; i < names.size(); i++) {
+                                string group = groupMap.getGroup(seq.getName());
+                                
+                                if (m->inUsersGroups(group, Groups)) { //only add if this is in a group we want
+                                    seq.printSequence(*(filehandles[group+rareAbund]));
+                                }else if(group == "not found") {
+                                    m->mothurOut(seq.getName() + " is not in your groupfile. Ignoring."); m->mothurOutEndLine();
+                                }
+                            }
+                        }else {
+                            vector<string> thisSeqsGroups = ct.getGroups(names[0]); //we only need names[0], because there is no namefile
+                            for (int i = 0; i < thisSeqsGroups.size(); i++) {
+                                if (m->inUsersGroups(thisSeqsGroups[i], Groups)) { //only add if this is in a group we want
+                                    seq.printSequence(*(filehandles[thisSeqsGroups[i]+rareAbund]));
+                                }
+                            }
+                        }
                                         }
                                 }
                         }
diff --git a/splitabundcommand.h b/splitabundcommand.h

index 232c36ba36763dcdb325fb9f8404d537e92450a6..d0542642a85d1efe0ea19280cceaaebc584911df 100644 (file)
--- a/splitabundcommand.h
+++ b/splitabundcommand.h
@@ -22,6 +22,7 @@ also allow an option where a user can give a group file with the list or names f
  #include "inputdata.h"
  #include "listvector.hpp"
  #include "sequence.hpp"
+#include "counttable.h"
  
  /***************************************************************************************/
  
@@ -47,24 +48,24 @@ private:
         int splitList(ListVector*);
         int splitNames(); //namefile
         int writeNames(); 
-       int writeList(ListVector*, string); 
+       int writeList(ListVector*, string, int); 
         int writeAccnos(string); 
         int parseGroup(string); 
         int parseFasta(string); 
+    int parseCount(string);
+    int splitCount();
         int readNamesFile(); //namefile
         int createNameMap(ListVector*);
         
         vector<string> outputNames;
-       ListVector* list;
-       GroupMap* groupMap;
-       InputData* input;
+    GroupMap groupMap;
+    CountTable ct;
         
-       string outputDir, listfile, namefile, groupfile, label, groups, fastafile, inputFile;
+       string outputDir, listfile, namefile, groupfile, countfile, label, groups, fastafile, inputFile;
         set<string> labels, rareNames, abundNames;
         vector<string> Groups;
         bool abort, allLines, accnos;
         int cutoff;
-       //map<string, bool> wroteListFile;
         map<string, string> nameMap;
         
         
diff --git a/splitgroupscommand.cpp b/splitgroupscommand.cpp

index af3ca665b5791d0001141f27192b78aab171d71a..f3c6cd9e2b7a5e5308bc6624bb4489062f71b2a8 100644 (file)
--- a/splitgroupscommand.cpp
+++ b/splitgroupscommand.cpp
@@ -10,13 +10,15 @@
  #include "splitgroupscommand.h"
  #include "sharedutilities.h"
  #include "sequenceparser.h"
+#include "counttable.h"
  
  //**********************************************************************************************************************
  vector<string> SplitGroupCommand::setParameters(){     
         try {           
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pgroup);
+        CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "CountGroup", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "CountGroup", "none",false,false); parameters.push_back(pgroup);
                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
@@ -34,9 +36,9 @@ vector<string> SplitGroupCommand::setParameters(){
  string SplitGroupCommand::getHelpString(){     
         try {
                 string helpString = "";
-               helpString += "The split.group command reads a group file, and parses your fasta and names files by groups. \n";
-               helpString += "The split.group command parameters are fasta, name, group and groups.\n";
-               helpString += "The fasta and group parameters are required.\n";
+               helpString += "The split.group command reads a group or count file, and parses your fasta and names or count files by groups. \n";
+               helpString += "The split.group command parameters are fasta, name, group, count and groups.\n";
+               helpString += "The fasta and group or count parameters are required.\n";
                 helpString += "The groups parameter allows you to select groups to create files for.  \n";
                 helpString += "For example if you set groups=A-B-C, you will get a .A.fasta, .A.names, .B.fasta, .B.names, .C.fasta, .C.names files.  \n";
                 helpString += "If you want .fasta and .names files for all groups, set groups=all.  \n";
@@ -62,6 +64,7 @@ string SplitGroupCommand::getOutputFileNameTag(string type, string inputName="")
          else {
              if (type == "fasta")            {   outputFileName =  "fasta";   }
              else if (type == "name")        {   outputFileName =  "names";   }
+            else if (type == "count")        {   outputFileName =  "count_table";   }
              else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
          }
          return outputFileName;
@@ -79,6 +82,7 @@ SplitGroupCommand::SplitGroupCommand(){
                 vector<string> tempOutNames;
                 outputTypes["fasta"] = tempOutNames;
                 outputTypes["name"] = tempOutNames;
+        outputTypes["count"] = tempOutNames;
         }
         catch(exception& e) {
                 m->errorOut(e, "SplitGroupCommand", "SplitGroupCommand");
@@ -112,6 +116,7 @@ SplitGroupCommand::SplitGroupCommand(string option)  {
                         vector<string> tempOutNames;
                         outputTypes["fasta"] = tempOutNames;
                         outputTypes["name"] = tempOutNames;
+            outputTypes["count"] = tempOutNames;
                 
                         //if the user changes the input directory command factory will send this info to us in the output parameter 
                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
@@ -141,7 +146,14 @@ SplitGroupCommand::SplitGroupCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
                                 }
-
+                
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
  
                         
@@ -160,23 +172,56 @@ SplitGroupCommand::SplitGroupCommand(string option)  {
                         
                         groupfile = validParameter.validFile(parameters, "group", true);
                         if (groupfile == "not open") {  groupfile = ""; abort = true; } 
-                       else if (groupfile == "not found") {                    
-                               groupfile = m->getGroupFile(); 
-                               if (groupfile != "") { m->mothurOut("Using " + groupfile + " as input file for the group parameter."); m->mothurOutEndLine(); }
-                               else {  m->mothurOut("You have no current groupfile and the group parameter is required."); m->mothurOutEndLine(); abort = true; }
+                       else if (groupfile == "not found") { groupfile = "";
                         }else {  m->setGroupFile(groupfile); }
+            
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not open") { countfile = ""; abort = true; }
+                       else if (countfile == "not found") { countfile = ""; }  
+                       else { m->setCountTableFile(countfile); }
+            
+            if ((countfile != "") && (namefile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
+            
+            if ((countfile != "") && (groupfile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or group."); m->mothurOutEndLine(); abort = true; }
+            
+            if ((countfile == "") && (groupfile == "")) {
+                if (namefile == "") { //check for count then group
+                    countfile = m->getCountTableFile(); 
+                                       if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
+                                       else { 
+                                               groupfile = m->getGroupFile(); 
+                        if (groupfile != "") {  m->mothurOut("Using " + groupfile + " as input file for the group parameter."); m->mothurOutEndLine(); }
+                        else { 
+                            m->mothurOut("You need to provide a count or group file."); m->mothurOutEndLine(); 
+                            abort = true; 
+                        }      
+                                       }       
+                }else { //check for group
+                    groupfile = m->getGroupFile(); 
+                    if (groupfile != "") {  m->mothurOut("Using " + groupfile + " as input file for the group parameter."); m->mothurOutEndLine(); }
+                    else { 
+                        m->mothurOut("You need to provide a count or group file."); m->mothurOutEndLine(); 
+                        abort = true; 
+                    }  
+                }
+            }
                         
                         groups = validParameter.validFile(parameters, "groups", false);         
                         if (groups == "not found") { groups = ""; }
                         else { m->splitAtDash(groups, Groups);  }
                                                 
                         //if the user changes the output directory command factory will send this info to us in the output parameter 
-                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(groupfile);      }
+                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
+                if (groupfile != "") { outputDir = m->hasPath(groupfile); }
+                else { outputDir = m->hasPath(countfile);  }
+            }
                         
-                       if (namefile == "") {
-                               vector<string> files; files.push_back(fastafile);
-                               parser.getNameFile(files);
-                       }
+            if (countfile == "") {
+                if (namefile == "") {
+                    vector<string> files; files.push_back(fastafile);
+                    parser.getNameFile(files);
+                }
+            }
                 }
  
         }
@@ -191,13 +236,48 @@ int SplitGroupCommand::execute(){
         
                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
                 
-               SequenceParser* parser;
+        if (countfile == "" ) {  runNameGroup();  }
+        else { runCount();  }
+                               
+               if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        } return 0; }
+               
+               string current = "";
+               itTypes = outputTypes.find("fasta");
+               if (itTypes != outputTypes.end()) {
+                       if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
+               }
+               
+               itTypes = outputTypes.find("name");
+               if (itTypes != outputTypes.end()) {
+                       if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
+               }
+        
+        itTypes = outputTypes.find("count");
+               if (itTypes != outputTypes.end()) {
+                       if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
+               }
+               
+               m->mothurOutEndLine();
+               m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+               for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
+               m->mothurOutEndLine();
                 
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SplitGroupCommand", "execute");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+int SplitGroupCommand::runNameGroup(){
+       try {
+        SequenceParser* parser;
                 if (namefile == "") {   parser = new SequenceParser(groupfile, fastafile);                              }
                 else                            {       parser = new SequenceParser(groupfile, fastafile, namefile);    }
                 
                 if (m->control_pressed) { delete parser; return 0; }
-
+        
                 vector<string> namesGroups = parser->getNamesOfGroups();
                 SharedUtil util;  util.setGroups(Groups, namesGroups);  
                 
@@ -215,7 +295,7 @@ int SplitGroupCommand::execute(){
                         parser->getSeqs(Groups[i], newFasta, false);
                         outputNames.push_back(newFasta); outputTypes["fasta"].push_back(newFasta);
                         if (m->control_pressed) { delete parser; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]);        } return 0; }
-
+            
                         if (namefile != "") { 
                                 parser->getNameMap(Groups[i], newName); 
                                 outputNames.push_back(newName); outputTypes["name"].push_back(newName);
@@ -225,29 +305,77 @@ int SplitGroupCommand::execute(){
                 }
                 
                 delete parser;
-               
-               if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        } return 0; }
-               
-               string current = "";
-               itTypes = outputTypes.find("fasta");
-               if (itTypes != outputTypes.end()) {
-                       if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
-               }
-               
-               itTypes = outputTypes.find("name");
-               if (itTypes != outputTypes.end()) {
-                       if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
-               }
-               
-               m->mothurOutEndLine();
-               m->mothurOut("Output File Names: "); m->mothurOutEndLine();
-               for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
-               m->mothurOutEndLine();
-               
-               return 0;
+        
+        return 0;
+
+    }
+       catch(exception& e) {
+               m->errorOut(e, "SplitGroupCommand", "runNameGroup");
+               exit(1);
         }
+}
+//**********************************************************************************************************************
+int SplitGroupCommand::runCount(){
+       try {
+        
+        CountTable ct;
+        ct.readTable(countfile);
+        if (!ct.hasGroupInfo()) { m->mothurOut("[ERROR]: your count file does not contain group info, cannot split by group.\n"); m->control_pressed = true; }
+        
+        if (m->control_pressed) { return 0; }
+        
+        vector<string> namesGroups = ct.getNamesOfGroups();
+        SharedUtil util;  util.setGroups(Groups, namesGroups); 
+        
+        //fill filehandles with neccessary ofstreams
+        map<string, ofstream*> ffiles;
+        map<string, ofstream*> cfiles;
+        ofstream* temp;
+        for (int i=0; i<Groups.size(); i++) {
+            temp = new ofstream;
+            ffiles[Groups[i]] = temp;
+            string newFasta = outputDir + m->getRootName(m->getSimpleName(fastafile)) + Groups[i] + "." + getOutputFileNameTag("fasta");
+            outputNames.push_back(newFasta); outputTypes["fasta"].push_back(newFasta);
+            m->openOutputFile(newFasta, (*temp));
+            temp = new ofstream;
+            cfiles[Groups[i]] = temp;
+            string newCount = outputDir + m->getRootName(m->getSimpleName(countfile)) + Groups[i] + "." + getOutputFileNameTag("count");
+            m->openOutputFile(newCount, (*temp));
+            outputNames.push_back(newCount); outputTypes["count"].push_back(newCount);
+            (*temp) << "Representative_Sequence\ttotal\t" << Groups[i] << endl;
+        }
+        
+        ifstream in; 
+        m->openInputFile(fastafile, in);
+        
+        while (!in.eof()) {
+            Sequence seq(in); m->gobble(in);
+            
+            if (m->control_pressed) { break; }
+            if (seq.getName() != "") {
+                vector<string> thisSeqsGroups = ct.getGroups(seq.getName());
+                for (int i = 0; i < thisSeqsGroups.size(); i++) {
+                    if (m->inUsersGroups(thisSeqsGroups[i], Groups)) { //if this sequence belongs to a group we want them print
+                        seq.printSequence(*(ffiles[thisSeqsGroups[i]]));
+                        int numSeqs = ct.getGroupCount(seq.getName(), Groups[i]);
+                        (*(cfiles[thisSeqsGroups[i]])) << seq.getName() << '\t' << numSeqs << '\t' << numSeqs << endl;
+                    }
+                }
+            }
+        }
+        in.close();
+        
+        //close and delete ofstreams
+        for (int i=0; i<Groups.size(); i++) {  
+            (*ffiles[Groups[i]]).close(); delete ffiles[Groups[i]];
+            (*cfiles[Groups[i]]).close(); delete cfiles[Groups[i]];
+        }
+        
+        return 0;
+
+    }
         catch(exception& e) {
-               m->errorOut(e, "SplitGroupCommand", "execute");
+               m->errorOut(e, "SplitGroupCommand", "runCount");
                 exit(1);
         }
  }
diff --git a/splitgroupscommand.h b/splitgroupscommand.h

index a8dc9a19e656651be58416c99e802e1b98b72a56..62e063d8a0fe93c3942959527cde646996e3430d 100644 (file)
--- a/splitgroupscommand.h
+++ b/splitgroupscommand.h
@@ -42,9 +42,12 @@ public:
  private:
         vector<string> outputNames;
                 
-       string outputDir, namefile, groupfile, groups, fastafile;
+       string outputDir, namefile, groupfile, countfile, groups, fastafile;
         vector<string> Groups;
         bool abort;
+    
+    int runNameGroup();
+    int runCount();
  };
  
  /***************************************************************************************/
diff --git a/splitmatrix.cpp b/splitmatrix.cpp

index 384b09af1bb94be09c5607c8b863b28b16215731..28bc5d4cec4fc4ec409bca26359ed103951c8701 100644 (file)
--- a/splitmatrix.cpp
+++ b/splitmatrix.cpp
@@ -14,21 +14,23 @@
  
  /***********************************************************************/
  
-SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t, bool l){
+SplitMatrix::SplitMatrix(string distfile, string name, string count, string tax, float c, string t, bool l){ //distance-file constructor: records each argument in the matching member (distFile, namefile, countfile, taxFile, cutoff, method, large)
         m = MothurOut::getInstance();
         distFile = distfile;
         cutoff = c;
         namefile = name;
         method = t;
         taxFile = tax;
+    countfile = count; //count table path; splitNames() reads this file instead of namefile whenever it is non-empty
         large = l;
  }
  /***********************************************************************/
  
-SplitMatrix::SplitMatrix(string ffile, string name, string tax, float c, float cu, string t, int p, bool cl, string output){
+SplitMatrix::SplitMatrix(string ffile, string name, string count, string tax, float c, float cu, string t, int p, bool cl, string output){
         m = MothurOut::getInstance();
         fastafile = ffile;
         namefile = name;
+    countfile = count;
         taxFile = tax;
         cutoff = c;  //tax level cutoff
         distCutoff = cu; //for fasta method if you are creating distance matrix you need a cutoff for that
@@ -50,7 +52,8 @@ int SplitMatrix::split(){
                 }else {
                         m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
                         map<string, string> temp;
-                       temp[distFile] = namefile;
+                       if (namefile != "") {  temp[distFile] = namefile; }
+            else { temp[distFile] = countfile; }
                         dists.push_back(temp);
                 }
                 
@@ -159,7 +162,7 @@ int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numG
                                 it = seqGroup.find(query.getName());
                                 
                                 //save names in case no namefile is given
-                               if (namefile == "") {  names.insert(query.getName()); }
+                               if ((namefile == "") && (countfile == "")) {  names.insert(query.getName()); }
                         
                                 if (it != seqGroup.end()) { //not singleton 
                                         m->openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile);
@@ -196,74 +199,21 @@ int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numG
                         m->mothurRemove((fastafile + "." + toString(i) + ".temp"));
                         
                         //remove old names files just in case
-                       m->mothurRemove((namefile + "." + toString(i) + ".temp"));
+                       if (namefile != "") { m->mothurRemove((namefile + "." + toString(i) + ".temp")); }
+            else { m->mothurRemove((countfile + "." + toString(i) + ".temp")); }
                 }
-                       
-               singleton = namefile + ".extra.temp";
-               ofstream remainingNames;
-               m->openOutputFile(singleton, remainingNames);
-               
-               bool wroteExtra = false;
-
-               ifstream bigNameFile;
-               m->openInputFile(namefile, bigNameFile);
-               
-               string name, nameList;
-               while(!bigNameFile.eof()){
-                       bigNameFile >> name >> nameList;  m->gobble(bigNameFile);
-                       
-                       //did this sequence get assigned a group
-                       it = seqGroup.find(name);
-                       
-                       if (it != seqGroup.end()) {  
-                               m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
-                               outFile << name << '\t' << nameList << endl;
-                               outFile.close();
-                       }else{
-                               wroteExtra = true;
-                               remainingNames << name << '\t' << nameList << endl;
-                       }
-               }
-               bigNameFile.close();
-               
-               for(int i=0;i<numGroups;i++){
-                       string tempNameFile = namefile + "." + toString(i) + ".temp";
-                       if (outputDir == "") { outputDir = m->hasPath(fastafile); }
-                       string tempDistFile = "";
+        
+        vector<string> tempDistFiles;    
+        for(int i=0;i<numGroups;i++){
+            if (outputDir == "") { outputDir = m->hasPath(fastafile); }
+            string tempDistFile = "";
              if (classic) { tempDistFile =  outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "phylip.dist";}
              else { tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist"; }
-
-                       //if there are valid distances
-                       ifstream fileHandle;
-                       fileHandle.open(tempDistFile.c_str());
-                       if(fileHandle)  {       
-                               m->gobble(fileHandle);
-                               if (!fileHandle.eof()) {  //check for blank file - this could occur if all dists in group are above cutoff
-                                       map<string, string> temp;
-                                       temp[tempDistFile] = tempNameFile;
-                                       dists.push_back(temp);
-                               }else {
-                                       ifstream in;
-                                       m->openInputFile(tempNameFile, in);
-                               
-                                       while(!in.eof()) { 
-                                               in >> name >> nameList;  m->gobble(in);
-                                               wroteExtra = true;
-                                               remainingNames << name << '\t' << nameList << endl;
-                                       }
-                                       in.close();
-                                       m->mothurRemove(tempNameFile);
-                               }
-                       }
-                       fileHandle.close();
-               }
-               
-               remainingNames.close();
-               if (!wroteExtra) { 
-                       m->mothurRemove(singleton);
-                       singleton = "none";
-               }
-
+            tempDistFiles.push_back(tempDistFile);
+        }
+        
+        splitNames(seqGroup, numGroups, tempDistFiles);
+        
                 if (m->control_pressed)  {  for (int i = 0; i < dists.size(); i++) { m->mothurRemove((dists[i].begin()->first)); m->mothurRemove((dists[i].begin()->second)); } dists.clear(); }
                 
                 return 0;
@@ -279,9 +229,10 @@ int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroup
                 map<string, int>::iterator it;
                 map<string, int>::iterator it2;
                 
+        ofstream outFile;
                 ifstream dFile;
                 m->openInputFile(distFile, dFile);
-               ofstream outFile;
+               
                 
                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
                         m->mothurRemove((distFile + "." + toString(i) + ".temp"));
@@ -326,9 +277,15 @@ int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroup
                         }
                 }
                 dFile.close();
-       
+        
+        string inputFile = namefile;
+        if (countfile != "") { inputFile = countfile; }
+        
+        vector<string> tempDistFiles;
                 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
-                       m->mothurRemove((namefile + "." + toString(i) + ".temp"));
+            string tempDistFile = distFile + "." + toString(i) + ".temp";
+            tempDistFiles.push_back(tempDistFile);
+                       m->mothurRemove((inputFile + "." + toString(i) + ".temp"));
                         
                         //write out any remaining buffers
                         if (numOutputs[i] > 0) {
@@ -341,63 +298,8 @@ int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroup
                         }
                 }
                 
-               ifstream bigNameFile;
-               m->openInputFile(namefile, bigNameFile);
-               
-               singleton = namefile + ".extra.temp";
-               ofstream remainingNames;
-               m->openOutputFile(singleton, remainingNames);
-               
-               bool wroteExtra = false;
-                                               
-               string name, nameList;
-               while(!bigNameFile.eof()){
-                       bigNameFile >> name >> nameList;  m->gobble(bigNameFile);
-                       
-                       //did this sequence get assigned a group
-                       it = seqGroup.find(name);
-                       
-                       if (it != seqGroup.end()) {  
-                               m->openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
-                               outFile << name << '\t' << nameList << endl;
-                               outFile.close();
-                       }else{
-                               wroteExtra = true;
-                               remainingNames << name << '\t' << nameList << endl;
-                       }
-               }
-               bigNameFile.close();
-                               
-               for(int i=0;i<numGroups;i++){
-                       string tempNameFile = namefile + "." + toString(i) + ".temp";
-                       string tempDistFile = distFile + "." + toString(i) + ".temp";
-
-                       //if there are valid distances
-                       if (validDistances[i]) {
-                               map<string, string> temp;
-                               temp[tempDistFile] = tempNameFile;
-                               dists.push_back(temp);
-                       }else{
-                               ifstream in;
-                               m->openInputFile(tempNameFile, in);
-                               
-                               while(!in.eof()) { 
-                                       in >> name >> nameList;  m->gobble(in);
-                                       wroteExtra = true;
-                                       remainingNames << name << '\t' << nameList << endl;
-                               }
-                               in.close();
-                               m->mothurRemove(tempNameFile);
-                       }
-               }
-               
-               remainingNames.close();
-               
-               if (!wroteExtra) { 
-                       m->mothurRemove(singleton);
-                       singleton = "none";
-               }
-
+        splitNames(seqGroup, numGroups, tempDistFiles);
+        
                 if (m->control_pressed)  {  
                         for (int i = 0; i < dists.size(); i++) { 
                                 m->mothurRemove((dists[i].begin()->first));
@@ -645,17 +547,29 @@ int SplitMatrix::splitDistanceLarge(){
                         m->gobble(dFile);
                 }
                 dFile.close();
-               
+        
+               vector<string> tempDistFiles;
                 for (int i = 0; i < numGroups; i++) {
+            string fileName = distFile + "." + toString(i) + ".temp";
+            tempDistFiles.push_back(fileName);
+            //remove old names files just in case
+                       
                         if (numOutputs[i] > 0) {
-                               string fileName = distFile + "." + toString(i) + ".temp";
                                 outFile.open(fileName.c_str(), ios::app);
                                 outFile << outputs[i];
                                 outFile.close();
                         }
                 }
-
-               splitNames(groups);
+        
+        map<string, int> seqGroup;
+        for (int i = 0; i < groups.size(); i++) {
+            for (set<string>::iterator itNames = groups[i].begin(); itNames != groups[i].end();) {
+                seqGroup[*itNames] = i;
+                groups[i].erase(itNames++);
+            }
+        }
+        
+               splitNames(seqGroup, numGroups, tempDistFiles);
                                 
                 return 0;                       
         }
@@ -665,73 +579,104 @@ int SplitMatrix::splitDistanceLarge(){
         }
  }
  //********************************************************************************************************************
-int SplitMatrix::splitNames(vector<set<string> >& groups){
+int SplitMatrix::splitNames(map<string, int>& seqGroup, int numGroups, vector<string>& tempDistFiles){
         try {
-               int numGroups = groups.size();
-       
-               ifstream bigNameFile(namefile.c_str());
-               if(!bigNameFile){
-                       cerr << "Error: We can't open the name file\n";
-                       exit(1);
-               }
-               
-               map<string, string> nameMap;
-               string name, nameList;
-               while(bigNameFile){
-                       bigNameFile >> name >> nameList;
-                       nameMap[name] = nameList;
-                       m->gobble(bigNameFile);
-               }
-               bigNameFile.close();
-                       
-               for(int i=0;i<numGroups;i++){  //parse names file to match distance files
-                       int numSeqsInGroup = groups[i].size();
-                       
-                       if(numSeqsInGroup > 0){
-                               string fileName = namefile + "." + toString(i) + ".temp";
-                               ofstream smallNameFile(fileName.c_str(), ios::ate);
-                               
-                               for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
-                                       map<string,string>::iterator nIt = nameMap.find(*gIt);
-                                       if (nIt != nameMap.end()) {
-                                               smallNameFile << nIt->first << '\t' << nIt->second << endl;
-                                               nameMap.erase(nIt);
-                                       }else{
-                                               m->mothurOut((*gIt) + " is in your distance file and not in your namefile.  Please correct."); m->mothurOutEndLine(); exit(1);
-                                       }
-                               }
-                               smallNameFile.close();
-                       }
-               }
-               
-               //names of singletons
-               if (nameMap.size() != 0) {
-                       singleton = namefile + ".extra.temp";
-                       ofstream remainingNames(singleton.c_str(), ios::ate);
-                       for(map<string,string>::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){
-                               remainingNames << nIt->first << '\t' << nIt->second << endl;
-                       }
-                       remainingNames.close();
-               }else { singleton = "none"; }
-                       
+        ofstream outFile;
+        map<string, int>::iterator it;
+        
+        string inputFile = namefile;
+        if (countfile != "") { inputFile = countfile; }
+        
+        for(int i=0;i<numGroups;i++){  m->mothurRemove((inputFile + "." + toString(i) + ".temp")); }
+
+        singleton = inputFile + ".extra.temp";
+        ofstream remainingNames;
+        m->openOutputFile(singleton, remainingNames);
+        
+        bool wroteExtra = false;
+        
+        ifstream bigNameFile;
+        m->openInputFile(inputFile, bigNameFile);
+        
+        //grab header line 
+        string headers = "";
+        if (countfile != "") { headers = m->getline(bigNameFile); m->gobble(bigNameFile); }
+        
+        string name, nameList;
+        while(!bigNameFile.eof()){
+            bigNameFile >> name >> nameList;  
+            m->getline(bigNameFile); m->gobble(bigNameFile); //extra getline is for rest of countfile line if groups are given.
+            
+            //did this sequence get assigned a group
+            it = seqGroup.find(name);
+            
+            if (it != seqGroup.end()) {  
+                m->openOutputFileAppend((inputFile + "." + toString(it->second) + ".temp"), outFile);
+                outFile << name << '\t' << nameList << endl;
+                outFile.close();
+            }else{
+                wroteExtra = true;
+                remainingNames << name << '\t' << nameList << endl;
+            }
+        }
+        bigNameFile.close();
+        
                 for(int i=0;i<numGroups;i++){
-                       if(groups[i].size() > 0){
-                               string tempNameFile = namefile + "." + toString(i) + ".temp";
-                               string tempDistFile = distFile + "." + toString(i) + ".temp";
-                               
+                       string tempNameFile = inputFile + "." + toString(i) + ".temp";
+                       string tempDistFile = tempDistFiles[i];
+            
+            //if there are valid distances
+            ifstream fileHandle;
+            fileHandle.open(tempDistFile.c_str());
+            if(fileHandle)     {       
+                m->gobble(fileHandle);
+                if (!fileHandle.eof()) {  //check
                                 map<string, string> temp;
+                if (countfile != "") {
+                    //add header
+                    ofstream out;
+                    string newtempNameFile = tempNameFile + "2";
+                    m->openOutputFile(newtempNameFile, out);
+                    out << headers << endl;
+                    out.close();
+                    m->appendFiles(tempNameFile, newtempNameFile);
+                    m->mothurRemove(tempNameFile);
+                    m->renameFile(newtempNameFile, tempNameFile);
+                }
                                 temp[tempDistFile] = tempNameFile;
                                 dists.push_back(temp);
+                       }else{
+                               ifstream in;
+                               m->openInputFile(tempNameFile, in);
+                               
+                               while(!in.eof()) { 
+                                       in >> name >> nameList;  m->gobble(in);
+                                       wroteExtra = true;
+                                       remainingNames << name << '\t' << nameList << endl;
+                               }
+                               in.close();
+                               m->mothurRemove(tempNameFile);
                         }
+            }
+            fileHandle.close();
                 }
                 
-               if (m->control_pressed)  {  
-                       for (int i = 0; i < dists.size(); i++) { 
-                               m->mothurRemove((dists[i].begin()->first));
-                               m->mothurRemove((dists[i].begin()->second));
-                       }
-                       dists.clear();
-               }
+               remainingNames.close();
+               
+               if (!wroteExtra) { 
+                       m->mothurRemove(singleton);
+                       singleton = "none";
+               }else if (countfile != "") {
+            //add header
+            ofstream out;
+            string newtempNameFile = singleton + "2";
+            m->openOutputFile(newtempNameFile, out);
+            out << headers << endl;
+            out.close();
+            m->appendFiles(singleton, newtempNameFile);
+            m->mothurRemove(singleton);
+            m->renameFile(newtempNameFile, singleton);
+        }
                 
                 return 0;
         }
@@ -836,17 +781,27 @@ int SplitMatrix::splitDistanceRAM(){
                 }
                 dFile.close();
                 
+        vector<string> tempDistFiles;
                 for (int i = 0; i < numGroups; i++) {
+            string fileName = distFile + "." + toString(i) + ".temp";
+            tempDistFiles.push_back(fileName);
                         if (outputs[i] != "") {
                                 ofstream outFile;
-                               string fileName = distFile + "." + toString(i) + ".temp";
                                 outFile.open(fileName.c_str(), ios::ate);
                                 outFile << outputs[i];
                                 outFile.close();
                         }
                 }
-
-               splitNames(groups);
+        
+        map<string, int> seqGroup;
+        for (int i = 0; i < groups.size(); i++) {
+            for (set<string>::iterator itNames = groups[i].begin(); itNames != groups[i].end();) {
+                seqGroup[*itNames] = i;
+                groups[i].erase(itNames++);
+            }
+        }
+        
+               splitNames(seqGroup, numGroups, tempDistFiles);
                                 
                 return 0;                       
         }
diff --git a/splitmatrix.h b/splitmatrix.h

index b8aa55119c810871988e76ca156b1746bd4eeb4a..7b468e998d967cde992a74b2cc0745840f60d454 100644 (file)
--- a/splitmatrix.h
+++ b/splitmatrix.h
@@ -19,8 +19,8 @@ class SplitMatrix  {
         
         public:
  
-               SplitMatrix(string, string, string, float, string, bool); //column formatted distance file, namesfile, cutoff, method, large
-               SplitMatrix(string, string, string, float, float, string, int, bool, string); //fastafile, namefile, taxFile, taxcutoff, cutoff, method, processors, classic, outputDir
+               SplitMatrix(string, string, string, string, float, string, bool); //column formatted distance file, namesfile, countfile, cutoff, method, large
+               SplitMatrix(string, string, string, string, float, float, string, int, bool, string); //fastafile, namefile, countfile, taxFile, taxcutoff, cutoff, method, processors, classic, outputDir
                 
                 ~SplitMatrix();
                 int split();
@@ -30,7 +30,7 @@ class SplitMatrix  {
         private:
                 MothurOut* m;
  
-               string distFile, namefile, singleton, method, taxFile, fastafile, outputDir;
+               string distFile, namefile, singleton, method, taxFile, fastafile, outputDir, countfile;
                 vector< map< string, string> > dists;
                 float cutoff, distCutoff;
                 bool large, classic;
@@ -40,7 +40,7 @@ class SplitMatrix  {
                 int splitClassify();
                 int splitDistanceLarge();
                 int splitDistanceRAM();
-               int splitNames(vector<set<string> >& groups);
+               int splitNames(map<string, int>& groups, int, vector<string>&);
                 int splitDistanceFileByTax(map<string, int>&, int);
                 int createDistanceFilesFromTax(map<string, int>&, int);
  };
diff --git a/subsample.cpp b/subsample.cpp

index 261297df67cfc1c5a8933a72f8bba307defbef88..392f97bd51f1d2ac67d6d0a810a408ab507fdcee 100644 (file)
--- a/subsample.cpp
+++ b/subsample.cpp
@@ -8,62 +8,54 @@
  
  #include "subsample.h"
  //**********************************************************************************************************************
-Tree* SubSample::getSample(Tree* T, TreeMap* tmap, TreeMap* newTmap, int size, map<string, string> originalNameMap) {
+Tree* SubSample::getSample(Tree* T, CountTable* ct, CountTable* newCt, int size) {
      try {
          Tree* newTree = NULL;
          
-        map<string, vector<string> > newGroups;
-        vector<string> subsampledSeqs = getSample(tmap, size, newGroups);
+        //remove seqs not in sample from counttable
+        vector<string> Groups = ct->getNamesOfGroups();
+        newCt->copy(ct); 
+        newCt->addGroup("doNotIncludeMe");
          
-        //remove seqs not in sample from treemap
-        for (map<string, vector<string> >::iterator it = newGroups.begin(); it != newGroups.end(); it++) {
-            for (int i = 0; i < (it->second).size(); i++) {
-                newTmap->addSeq((it->second)[i], it->first);
-            }
-        }
-        
-        newTree = new Tree(newTmap);
-        newTree->getCopy(T, originalNameMap);
-        
-        return newTree;
-    }
-    catch(exception& e) {
-        m->errorOut(e, "SubSample", "getSample-Tree");
-        exit(1);
-    }
-}
-/**********************************************************************************************************************
-Tree* SubSample::getSample(Tree* T, TreeMap* tmap, map<string, string> whole, int size) {
-    try {
-        Tree* newTree = NULL;
-        
-        vector<string> subsampledSeqs = getSample(tmap, size);
-        map<string, string> sampledNameMap = deconvolute(whole, subsampledSeqs); 
+        map<string, int> doNotIncludeTotals; 
+        vector<string> namesSeqs = ct->getNamesOfSeqs();
+        for (int i = 0; i < namesSeqs.size(); i++) {  doNotIncludeTotals[namesSeqs[i]] = 0; }
+    
+        for (int i = 0; i < Groups.size(); i++) {
+            if (m->inUsersGroups(Groups[i], m->getGroups())) {
+                if (m->control_pressed) { break; }
          
-        //remove seqs not in sample from treemap
-        for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
-            //is that name in the subsample?
-            int count = 0;
-            for (int j = 0; j < subsampledSeqs.size(); j++) {
-                if (tmap->namesOfSeqs[i] == subsampledSeqs[j]) { break; } //found it
-                count++;
+                int thisSize = ct->getGroupCount(Groups[i]);
+                
+                if (thisSize >= size) {        
+                    
+                    vector<string> names = ct->getNamesOfSeqs(Groups[i]);
+                    vector<int> random;
+                    for (int j = 0; j < names.size(); j++) {
+                        int num = ct->getGroupCount(names[j], Groups[i]);
+                        for (int k = 0; k < num; k++) { random.push_back(j); }
+                    }
+                    random_shuffle(random.begin(), random.end());
+                    
+                    vector<int> sampleRandoms; sampleRandoms.resize(names.size(), 0);
+                    for (int j = 0; j < size; j++) { sampleRandoms[random[j]]++; }
+                    for (int j = 0; j < sampleRandoms.size(); j++) {
+                        newCt->setAbund(names[j], Groups[i], sampleRandoms[j]);
+                    }
+                    sampleRandoms.clear(); sampleRandoms.resize(names.size(), 0);
+                    for (int j = size; j < thisSize; j++) { sampleRandoms[random[j]]++; }
+                    for (int j = 0; j < sampleRandoms.size(); j++) {  doNotIncludeTotals[names[j]] += sampleRandoms[j]; }
+                }else {  m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; }
              }
  
-            if (m->control_pressed) { return newTree; }
-            
-            //if you didnt find it, remove it 
-            if (count == subsampledSeqs.size()) { 
-                tmap->removeSeq(tmap->namesOfSeqs[i]);
-                i--; //need this because removeSeq removes name from namesOfSeqs
-            }
          }
          
-        //create new tree
-        int numUniques = sampledNameMap.size();
-        if (sampledNameMap.size() == 0) { numUniques = subsampledSeqs.size(); }
+        for (map<string, int>::iterator it = doNotIncludeTotals.begin(); it != doNotIncludeTotals.end(); it++) {  
+            newCt->setAbund(it->first, "doNotIncludeMe", it->second);
+        } 
          
-        newTree = new Tree(numUniques, tmap); //numNodes, treemap
-        newTree->getSubTree(T, subsampledSeqs, sampledNameMap);
+        newTree = new Tree(newCt);
+        newTree->getCopy(T, true);
          
          return newTree;
      }
@@ -71,7 +63,7 @@ Tree* SubSample::getSample(Tree* T, TreeMap* tmap, map<string, string> whole, in
          m->errorOut(e, "SubSample", "getSample-Tree");
          exit(1);
      }
-}*/
+}
  //**********************************************************************************************************************
  //assumes whole maps dupName -> uniqueName
  map<string, string> SubSample::deconvolute(map<string, string> whole, vector<string>& wanted) {
@@ -112,100 +104,6 @@ map<string, string> SubSample::deconvolute(map<string, string> whole, vector<str
         }
  }
  //**********************************************************************************************************************
-vector<string> SubSample::getSample(TreeMap* tMap, int size, map<string, vector<string> >& sample) {
-    try {
-        vector<string> temp2;
-        sample["doNotIncludeMe"] = temp2;
-        
-        vector<string> namesInSample;
-        
-        vector<string> Groups = tMap->getNamesOfGroups();    
-        for (int i = 0; i < Groups.size(); i++) {
-            
-            if (m->inUsersGroups(Groups[i], m->getGroups())) {
-                if (m->control_pressed) { break; }
-                
-                vector<string> thisGroup; thisGroup.push_back(Groups[i]);
-                vector<string> thisGroupsSeqs = tMap->getNamesSeqs(thisGroup);
-                int thisSize = thisGroupsSeqs.size();
-                vector<string> temp;
-                sample[Groups[i]] = temp;
-                
-                if (thisSize >= size) {        
-                    
-                    random_shuffle(thisGroupsSeqs.begin(), thisGroupsSeqs.end());
-                    
-                    for (int j = 0; j < size; j++) { sample[Groups[i]].push_back(thisGroupsSeqs[j]); namesInSample.push_back(thisGroupsSeqs[j]); }
-                    for (int j = size; j < thisSize; j++) { sample["doNotIncludeMe"].push_back(thisGroupsSeqs[j]); }
-                    
-                }else {  m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; }
-            }
-        } 
-        
-        return namesInSample;
-    }
-       catch(exception& e) {
-               m->errorOut(e, "SubSample", "getSample-TreeMap");
-               exit(1);
-       }
-}      
-
-//**********************************************************************************************************************
-vector<string> SubSample::getSample(TreeMap* tMap, int size) {
-    try {
-        vector<string> sample;
-        
-        vector<string> Groups = tMap->getNamesOfGroups();    
-        for (int i = 0; i < Groups.size(); i++) {
-            
-            if (m->inUsersGroups(Groups[i], m->getGroups())) {
-                if (m->control_pressed) { break; }
-                
-                vector<string> thisGroup; thisGroup.push_back(Groups[i]);
-                vector<string> thisGroupsSeqs = tMap->getNamesSeqs(thisGroup);
-                int thisSize = thisGroupsSeqs.size();
-                
-                if (thisSize >= size) {        
-                    
-                    random_shuffle(thisGroupsSeqs.begin(), thisGroupsSeqs.end());
-                    
-                    for (int j = 0; j < size; j++) { sample.push_back(thisGroupsSeqs[j]); }
-                }else {  m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; }
-            }
-        } 
-        
-        return sample;
-    }
-       catch(exception& e) {
-               m->errorOut(e, "SubSample", "getSample-TreeMap");
-               exit(1);
-       }
-}      
-//**********************************************************************************************************************
-vector<string> SubSample::getSample(TreeMap* tMap, vector<string> Groups) {
-    try {
-        vector<string> sample;
-        
-        //vector<string> Groups = tMap->getNamesOfGroups();    
-        for (int i = 0; i < Groups.size(); i++) {
-            
-            if (m->control_pressed) { break; }
-            
-            vector<string> thisGroup; thisGroup.push_back(Groups[i]);
-            vector<string> thisGroupsSeqs = tMap->getNamesSeqs(thisGroup);
-            int thisSize = thisGroupsSeqs.size();
-                
-            for (int j = 0; j < thisSize; j++) { sample.push_back(thisGroupsSeqs[j]); }
-        } 
-        
-        return sample;
-    }
-       catch(exception& e) {
-               m->errorOut(e, "SubSample", "getSample-TreeMap");
-               exit(1);
-       }
-}      
-//**********************************************************************************************************************
  vector<string> SubSample::getSample(vector<SharedRAbundVector*>& thislookup, int size) {
         try {
                 
@@ -366,7 +264,164 @@ int SubSample::getSample(SAbundVector*& sabund, int size) {
                 m->errorOut(e, "SubSampleCommand", "getSample");
                 exit(1);
         }
-}                      
+}
+//**********************************************************************************************************************
+CountTable SubSample::getSample(CountTable& ct, int size, vector<string> Groups) {
+       try {
+        if (!ct.hasGroupInfo()) { m->mothurOut("[ERROR]: Cannot subsample by group because your count table doesn't have group information.\n"); m->control_pressed = true; }
+            
+        CountTable sampledCt;
+        map<string, vector<int> > tempCount;
+        for (int i = 0; i < Groups.size(); i++) {
+            sampledCt.addGroup(Groups[i]);
+            
+            vector<string> names = ct.getNamesOfSeqs(Groups[i]);
+            vector<string> allNames;
+            for (int j = 0; j < names.size(); j++) {
+                
+                if (m->control_pressed) { return sampledCt; }
+                
+                int num = ct. getGroupCount(names[j], Groups[i]);
+                for (int k = 0; k < num; k++) { allNames.push_back(names[j]); }
+            }
+            
+            random_shuffle(allNames.begin(), allNames.end());
+            
+            if (allNames.size() < size) { m->mothurOut("[ERROR]: You have selected a size that is larger than "+Groups[i]+" number of sequences.\n"); m->control_pressed = true; }
+            else{
+                for (int j = 0; j < size; j++) {
+                    
+                    if (m->control_pressed) { return sampledCt; }
+                    
+                    map<string, vector<int> >::iterator it = tempCount.find(allNames[j]);
+                    
+                    if (it == tempCount.end()) { //we have not seen this sequence at all yet
+                        vector<int> tempGroups; tempGroups.resize(Groups.size(), 0);
+                        tempGroups[i]++;
+                        tempCount[allNames[j]] = tempGroups;
+                    }else{
+                        tempCount[allNames[j]][i]++;
+                    }
+                }
+            }
+        }
+        
+        //build count table
+        for (map<string, vector<int> >::iterator it = tempCount.begin(); it != tempCount.end();) {
+            sampledCt.push_back(it->first, it->second);
+            tempCount.erase(it++);
+        }
+        
+        return sampledCt;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "SubSampleCommand", "getSample");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+CountTable SubSample::getSample(CountTable& ct, int size, vector<string> Groups, bool pickedGroups) {
+       try {
+        CountTable sampledCt;
+        if (!ct.hasGroupInfo() && pickedGroups) { m->mothurOut("[ERROR]: Cannot subsample with groups because your count table doesn't have group information.\n"); m->control_pressed = true; return sampledCt; }
+        
+        if (ct.hasGroupInfo()) {
+            map<string, vector<int> > tempCount;
+            vector<item> allNames;
+            map<string, int> groupMap;
+            
+            vector<string> myGroups;
+            if (pickedGroups) { myGroups = Groups; }
+            else {  myGroups = ct.getNamesOfGroups(); }
+            
+            for (int i = 0; i < myGroups.size(); i++) {
+                sampledCt.addGroup(myGroups[i]);
+                groupMap[myGroups[i]] = i;
+                
+                vector<string> names = ct.getNamesOfSeqs(myGroups[i]);
+                for (int j = 0; j < names.size(); j++) {
+                    
+                    if (m->control_pressed) { return sampledCt; }
+                    
+                    int num = ct. getGroupCount(names[j], myGroups[i]);
+                    for (int k = 0; k < num; k++) { 
+                        item temp(names[j], myGroups[i]);
+                        allNames.push_back(temp); 
+                    }
+                }
+            }
+            
+            random_shuffle(allNames.begin(), allNames.end());
+            
+            if (allNames.size() < size) { 
+                if (pickedGroups) { m->mothurOut("[ERROR]: You have selected a size that is larger than the number of sequences.\n"); } 
+                else { m->mothurOut("[ERROR]: You have selected a size that is larger than the number of sequences in the groups you chose.\n"); }
+                m->control_pressed = true; return sampledCt; }
+            else{
+                for (int j = 0; j < size; j++) {
+                    
+                    if (m->control_pressed) { return sampledCt; }
+                    
+                    map<string, vector<int> >::iterator it = tempCount.find(allNames[j].name);
+                    
+                    if (it == tempCount.end()) { //we have not seen this sequence at all yet
+                        vector<int> tempGroups; tempGroups.resize(myGroups.size(), 0);
+                        tempGroups[groupMap[allNames[j].group]]++;
+                        tempCount[allNames[j].name] = tempGroups;
+                    }else{
+                        tempCount[allNames[j].name][groupMap[allNames[j].group]]++;
+                    }
+                }
+            }
+            
+            //build count table
+            for (map<string, vector<int> >::iterator it = tempCount.begin(); it != tempCount.end();) {
+                sampledCt.push_back(it->first, it->second);
+                tempCount.erase(it++);
+            }
+            
+            //remove empty groups 
+            for (int i = 0; i < myGroups.size(); i++) { if (sampledCt.getGroupCount(myGroups[i]) == 0) { sampledCt.removeGroup(myGroups[i]); } }
+            
+        }else {
+            vector<string> names = ct.getNamesOfSeqs();
+            map<string, int> nameMap;
+            vector<string> allNames;
+            
+            for (int i = 0; i < names.size(); i++) {
+                int num = ct.getNumSeqs(names[i]);
+                for (int j = 0; j < num; j++) { allNames.push_back(names[i]); }
+            }
+            
+            if (allNames.size() < size) { m->mothurOut("[ERROR]: You have selected a size that is larger than the number of sequences.\n"); m->control_pressed = true; return sampledCt; }
+            else {
+                random_shuffle(allNames.begin(), allNames.end());
+                
+                for (int j = 0; j < size; j++) {
+                    if (m->control_pressed) { return sampledCt; }
+                    
+                    map<string, int>::iterator it = nameMap.find(allNames[j]);
+                    
+                    //we have not seen this sequence at all yet
+                    if (it == nameMap.end()) { nameMap[allNames[j]] = 1;  }
+                    else{  nameMap[allNames[j]]++;  }
+                }
+                
+                //build count table
+                for (map<string, int>::iterator it = nameMap.begin(); it != nameMap.end();) {
+                    sampledCt.push_back(it->first, it->second);
+                    nameMap.erase(it++);
+                }
+            }
+        }
+        
+        return sampledCt;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "SubSampleCommand", "getSample");
+               exit(1);
+       }
+}
  //**********************************************************************************************************************
  
  
diff --git a/subsample.h b/subsample.h

index b00f1a7141b49fb99a9683a58b96153c5986a6b7..fdf8576f1950c0a27665a190eee1c3a9ddb09b6b 100644 (file)
--- a/subsample.h
+++ b/subsample.h
@@ -13,6 +13,16 @@
  #include "sharedrabundvector.h"
  #include "treemap.h"
  #include "tree.h"
+#include "counttable.h"
+
+//A single sequence paired with a group name, so a pool of sequences can be shuffled
+//without losing track of which group each entry came from.
+struct item {
+    string name;   //sequence name
+    string group;  //name of the group this entry was counted in
+    
+    item() {}
+    //taken by const reference: one item is built per individual sequence, so avoid extra string copies
+    item(const string& n, const string& g) : name(n), group(g) {}
+    ~item() {}
+};
  
  //subsampling overwrites the sharedRabunds.  If you need to reuse the original use the getSamplePreserve function.
  
@@ -24,20 +34,16 @@ class SubSample {
          ~SubSample() {}
      
          vector<string> getSample(vector<SharedRAbundVector*>&, int); //returns the bin labels for the subsample, mothurOuts binlabels are preserved so you can run this multiple times. Overwrites original vector passed in, if you need to preserve it deep copy first.
-        
-        //Tree* getSample(Tree*, TreeMap*, map<string, string>, int); //creates new subsampled tree, destroys treemap so copy if needed.
-        Tree* getSample(Tree*, TreeMap*, TreeMap*, int, map<string, string>); //creates new subsampled tree. Uses first treemap to fill new treemap with sabsampled seqs. Sets groups of seqs not in subsample to "doNotIncludeMe".
+        Tree* getSample(Tree*, CountTable*, CountTable*, int); //creates new subsampled tree. Uses first counttable to fill new counttable with sabsampled seqs. Sets groups of seqs not in subsample to "doNotIncludeMe".
          int getSample(SAbundVector*&, int); //destroys sabundvector passed in, so copy it if you need it
+        CountTable getSample(CountTable&, int, vector<string>); //subsample a countTable bygroup(same number sampled from each group, returns subsampled countTable 
+        CountTable getSample(CountTable&, int, vector<string>, bool); //subsample a countTable. If you want to only sample from specific groups, pass in groups in the vector and set bool=true, otherwise set bool=false.   
      
      private:
      
          MothurOut* m;
          int eliminateZeroOTUS(vector<SharedRAbundVector*>&);
-    
-        vector<string> getSample(TreeMap*, vector<string>);
-        vector<string> getSample(TreeMap*, int); //names of seqs to include in sample tree 
-        vector<string> getSample(TreeMap* tMap, int size, map<string, vector<string> >& sample); //sample maps group -> seqs in group. seqs not in sample are in doNotIncludeMe group
-        map<string, string> deconvolute(map<string, string> wholeSet, vector<string>& subsampleWanted); //returns new nameMap containing only subsampled names, and removes redundants from subsampled wanted because it makes the new nameMap.
+         map<string, string> deconvolute(map<string, string> wholeSet, vector<string>& subsampleWanted); //returns new nameMap containing only subsampled names, and removes redundants from subsampled wanted because it makes the new nameMap.
  
  
  };
diff --git a/subsamplecommand.cpp b/subsamplecommand.cpp

index f9cb1e60e43d59d9d1a6ce760b3031de0e92c301..e1793f415ff98daa180e7ace0b8bff3d52e5f3b9 100644 (file)
--- a/subsamplecommand.cpp
+++ b/subsamplecommand.cpp
@@ -16,8 +16,9 @@
  vector<string> SubSampleCommand::setParameters(){      
         try {           
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FLSSR", "none",false,false); parameters.push_back(pfasta);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
+        CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
                 CommandParameter plist("list", "InputTypes", "", "", "none", "FLSSR", "none",false,false); parameters.push_back(plist);
                 CommandParameter pshared("shared", "InputTypes", "", "", "none", "FLSSR", "none",false,false); parameters.push_back(pshared);
                 CommandParameter prabund("rabund", "InputTypes", "", "", "none", "FLSSR", "none",false,false); parameters.push_back(prabund);
@@ -43,7 +44,7 @@ string SubSampleCommand::getHelpString(){
         try {
                 string helpString = "";
                 helpString += "The sub.sample command is designed to be used as a way to normalize your data, or create a smaller set from your original set.\n";
-               helpString += "The sub.sample command parameters are fasta, name, list, group, rabund, sabund, shared, groups, size, persample and label.  You must provide a fasta, list, sabund, rabund or shared file as an input file.\n";
+               helpString += "The sub.sample command parameters are fasta, name, list, group, count, rabund, sabund, shared, groups, size, persample and label.  You must provide a fasta, list, sabund, rabund or shared file as an input file.\n";
                 helpString += "The namefile is only used with the fasta file, not with the listfile, because the list file should contain all sequences.\n";
                 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like included. The group names are separated by dashes.\n";
                 helpString += "The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n";
@@ -76,6 +77,7 @@ string SubSampleCommand::getOutputFileNameTag(string type, string inputName=""){
              if (type == "fasta")            {   outputFileName =  "subsample" + m->getExtension(inputName);   }
              else if (type == "sabund")    {   outputFileName =  "subsample" + m->getExtension(inputName);   }
              else if (type == "name")        {   outputFileName =  "subsample" + m->getExtension(inputName);   }
+            else if (type == "count")        {   outputFileName =  "subsample" + m->getExtension(inputName);   }
              else if (type == "group")       {   outputFileName =  "subsample" + m->getExtension(inputName);   }
              else if (type == "list")        {   outputFileName =  "subsample" + m->getExtension(inputName);   }
              else if (type == "rabund")       {   outputFileName =  "subsample" + m->getExtension(inputName);   }
@@ -103,6 +105,7 @@ SubSampleCommand::SubSampleCommand(){
                 outputTypes["fasta"] = tempOutNames;
                 outputTypes["name"] = tempOutNames;
                 outputTypes["group"] = tempOutNames;
+        outputTypes["count"] = tempOutNames;
         }
         catch(exception& e) {
                 m->errorOut(e, "SubSampleCommand", "GetRelAbundCommand");
@@ -142,6 +145,7 @@ SubSampleCommand::SubSampleCommand(string option) {
                         outputTypes["fasta"] = tempOutNames;
                         outputTypes["name"] = tempOutNames;
                         outputTypes["group"] = tempOutNames;
+            outputTypes["count"] = tempOutNames;
                                         
                         //if the user changes the output directory command factory will send this info to us in the output parameter 
                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
@@ -206,6 +210,14 @@ SubSampleCommand::SubSampleCommand(string option) {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
                         
                         //check for required parameters
@@ -244,6 +256,22 @@ SubSampleCommand::SubSampleCommand(string option) {
                         else if (groupfile == "not found") { groupfile = ""; }
                         else { m->setGroupFile(groupfile); }
                         
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not open") { countfile = ""; abort = true; }
+                       else if (countfile == "not found") { countfile = "";  } 
+                       else {
+                m->setCountTableFile(countfile); 
+                ct.readTable(countfile);
+            }
+            
+            if ((namefile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
+            }
+                       
+            if ((groupfile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
+            }
+            
                         //check for optional parameter and set defaults
                         // ...at some point should added some additional type checking...
                         label = validParameter.validFile(parameters, "label", false);                   
@@ -267,26 +295,34 @@ SubSampleCommand::SubSampleCommand(string option) {
                         temp = validParameter.validFile(parameters, "persample", false);                if (temp == "not found"){       temp = "f";             }
                         persample = m->isTrue(temp);
                         
-                       if (groupfile == "") { persample = false; }
+                       if ((groupfile == "") && (countfile == "")) { persample = false; }
+            if (countfile != "") {
+                if (!ct.hasGroupInfo()) { 
+                    persample = false; 
+                    if (pickedGroups) { m->mothurOut("You cannot pick groups without group info in your count file."); m->mothurOutEndLine(); abort = true; }
+                }
+            }
                         
                         if ((namefile != "") && (fastafile == "")) { m->mothurOut("You may only use a namefile with a fastafile."); m->mothurOutEndLine(); abort = true; }
                         
                         if ((fastafile == "") && (listfile == "") && (sabundfile == "") && (rabundfile == "") && (sharedfile == "")) {
                                 m->mothurOut("You must provide a fasta, list, sabund, rabund or shared file as an input file."); m->mothurOutEndLine(); abort = true; }
                         
-                       if (pickedGroups && ((groupfile == "") && (sharedfile == ""))) { 
-                               m->mothurOut("You cannot pick groups without a valid group file or shared file."); m->mothurOutEndLine(); abort = true; }
+                       if (pickedGroups && ((groupfile == "") && (sharedfile == "") && (countfile == ""))) { 
+                               m->mothurOut("You cannot pick groups without a valid group, count or shared file."); m->mothurOutEndLine(); abort = true; }
                         
-                       if ((groupfile != "") && ((fastafile == "") && (listfile == ""))) { 
-                               m->mothurOut("Group file only valid with listfile or fastafile."); m->mothurOutEndLine(); abort = true; }
+                       if (((groupfile != "") || (countfile != "")) && ((fastafile == "") && (listfile == ""))) { 
+                               m->mothurOut("Group or count files are only valid with listfile or fastafile."); m->mothurOutEndLine(); abort = true; }
                         
-                       if ((groupfile != "") && ((fastafile != "") && (listfile != ""))) { 
-                               m->mothurOut("A new group file can only be made from the subsample of a listfile or fastafile, not both. Please correct."); m->mothurOutEndLine(); abort = true; }
+                       if (((groupfile != "") || (countfile != "")) && ((fastafile != "") && (listfile != ""))) { 
+                               m->mothurOut("A new group or count file can only be made from the subsample of a listfile or fastafile, not both. Please correct."); m->mothurOutEndLine(); abort = true; }
                         
-                       if ((fastafile != "") && (namefile == "")) {
-                               vector<string> files; files.push_back(fastafile);
-                               parser.getNameFile(files);
-                       }
+            if (countfile == "") {
+                if ((fastafile != "") && (namefile == "")) {
+                    vector<string> files; files.push_back(fastafile);
+                    parser.getNameFile(files);
+                }
+            }
                 }
  
         }
@@ -353,6 +389,11 @@ int SubSampleCommand::execute(){
                 if (itTypes != outputTypes.end()) {
                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
                 }
+        
+        itTypes = outputTypes.find("count");
+               if (itTypes != outputTypes.end()) {
+                       if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
+               }
                 
                 
                 m->mothurOutEndLine();
@@ -374,49 +415,67 @@ int SubSampleCommand::getSubSampleFasta() {
                 if (namefile != "") { readNames(); }    //fills names with all names in namefile.
                 else { getNames(); }//no name file, so get list of names to pick from
                 
-               GroupMap* groupMap;
+               GroupMap groupMap;
                 if (groupfile != "") {
-                       
-                       groupMap = new GroupMap(groupfile);
-                       groupMap->readMap();
+                       groupMap.readMap(groupfile);
                         
                         //takes care of user setting groupNames that are invalid or setting groups=all
-                       SharedUtil* util = new SharedUtil();
-                       vector<string> namesGroups = groupMap->getNamesOfGroups();
-                       util->setGroups(Groups, namesGroups);
-                       delete util;
+                       SharedUtil util;
+                       vector<string> namesGroups = groupMap.getNamesOfGroups();
+                       util.setGroups(Groups, namesGroups);
                         
                         //file mismatch quit
-                       if (names.size() != groupMap->getNumSeqs()) { 
-                               m->mothurOut("[ERROR]: your fasta file contains " + toString(names.size()) + " sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct."); 
+                       if (names.size() != groupMap.getNumSeqs()) { 
+                               m->mothurOut("[ERROR]: your fasta file contains " + toString(names.size()) + " sequences, and your groupfile contains " + toString(groupMap.getNumSeqs()) + ", please correct."); 
                                 m->mothurOutEndLine();
-                               delete groupMap;
                                 return 0;
                         }                       
-               }       
+               }else if (countfile != "") {
+            if (ct.hasGroupInfo()) {
+                SharedUtil util;
+                vector<string> namesGroups = ct.getNamesOfGroups();
+                util.setGroups(Groups, namesGroups);
+            }
+            
+            //file mismatch quit
+                       if (names.size() != ct.getNumUniqueSeqs()) { 
+                               m->mothurOut("[ERROR]: your fasta file contains " + toString(names.size()) + " sequences, and your count file contains " + toString(ct.getNumUniqueSeqs()) + " unique sequences, please correct."); 
+                               m->mothurOutEndLine();
+                               return 0;
+                       }       
+        }
                 
                 if (m->control_pressed) { return 0; }
                 
-               
                 //make sure that if your picked groups size is not too big
-               int thisSize = names.size();
+               int thisSize = 0;
+        if (countfile == "") { thisSize = names.size();  }
+        else {  thisSize = ct. getNumSeqs();  }  //all seqs not just unique
+        
                 if (persample) { 
                         if (size == 0) { //user has not set size, set size = smallest samples size
-                               size = groupMap->getNumSeqs(Groups[0]);
+                               if (countfile == "") { size = groupMap.getNumSeqs(Groups[0]); }
+                else {  size = ct.getGroupCount(Groups[0]);  }
+                
                                 for (int i = 1; i < Groups.size(); i++) {
-                                       int thisSize = groupMap->getNumSeqs(Groups[i]);
+                                       int thisSize = 0;
+                    if (countfile == "") { thisSize = groupMap.getNumSeqs(Groups[i]); }
+                    else {  thisSize = ct.getGroupCount(Groups[i]);  }
                                         
                                         if (thisSize < size) {  size = thisSize;        }
                                 }
                         }else { //make sure size is not too large
                                 vector<string> newGroups;
                                 for (int i = 0; i < Groups.size(); i++) {
-                                       int thisSize = groupMap->getNumSeqs(Groups[i]);
+                                       int thisSize = 0;
+                    if (countfile == "") { thisSize = groupMap.getNumSeqs(Groups[i]); }
+                    else {  thisSize = ct.getGroupCount(Groups[i]);  }
                                         
                                         if (thisSize >= size) { newGroups.push_back(Groups[i]); }
                                         else {  m->mothurOut("You have selected a size that is larger than " + Groups[i] + " number of sequences, removing " + Groups[i] + "."); m->mothurOutEndLine(); }
                                 }
                                 Groups = newGroups;
+                if (newGroups.size() == 0) {  m->mothurOut("[ERROR]: all groups removed."); m->mothurOutEndLine(); m->control_pressed = true; }
                         }
                         
                         m->mothurOut("Sampling " + toString(size) + " from each group."); m->mothurOutEndLine();                        
@@ -424,7 +483,8 @@ int SubSampleCommand::getSubSampleFasta() {
                         if (pickedGroups) {
                                 int total = 0;
                                 for(int i = 0; i < Groups.size(); i++) {
-                                       total += groupMap->getNumSeqs(Groups[i]);
+                    if (countfile == "") { total += groupMap.getNumSeqs(Groups[i]); }
+                    else {  total += ct.getGroupCount(Groups[i]);  }
                                 }
                                 
                                 if (size == 0) { //user has not set size, set size = 10% samples size
@@ -442,64 +502,87 @@ int SubSampleCommand::getSubSampleFasta() {
                         }
                         
                         if (size == 0) { //user has not set size, set size = 10% samples size
-                               size = int (names.size() * 0.10);
-                       }
-                       
-                       if (size > thisSize) { m->mothurOut("Your fasta file only contains " + toString(thisSize) + " sequences. Setting size to " + toString(thisSize) + "."); m->mothurOutEndLine();
-                               size = thisSize;
+                               if (countfile == "") {  size = int (names.size() * 0.10); }
+                else {  size = int (ct.getNumSeqs() * 0.10); }
                         }
                         
-                       if (!pickedGroups) { m->mothurOut("Sampling " + toString(size) + " from " + toString(thisSize) + "."); m->mothurOutEndLine(); }
+            
+            if (size > thisSize) { m->mothurOut("Your fasta file only contains " + toString(thisSize) + " sequences. Setting size to " + toString(thisSize) + "."); m->mothurOutEndLine();
+                    size = thisSize;
+            }
+            
+            if (!pickedGroups) { m->mothurOut("Sampling " + toString(size) + " from " + toString(thisSize) + "."); m->mothurOutEndLine(); }
  
                 }
                 random_shuffle(names.begin(), names.end());
                 
                 set<string> subset; //dont want repeat sequence names added
                 if (persample) {
-                       //initialize counts
-                       map<string, int> groupCounts;
-                       map<string, int>::iterator itGroupCounts;
-                       for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; }
+            if (countfile == "") {
+                //initialize counts
+                map<string, int> groupCounts;
+                map<string, int>::iterator itGroupCounts;
+                for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; }
                         
-                       for (int j = 0; j < names.size(); j++) {
+                for (int j = 0; j < names.size(); j++) {
                                         
-                               if (m->control_pressed) { return 0; }
+                    if (m->control_pressed) { return 0; }
                                                                                                 
-                               string group = groupMap->getGroup(names[j]);
-                               if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; }
-                               else{
-                                       itGroupCounts = groupCounts.find(group);
-                                       if (itGroupCounts != groupCounts.end()) {
-                                               if (groupCounts[group] < size) {        subset.insert(names[j]);        groupCounts[group]++; }
-                                       }
-                               }                               
-                       }
+                    string group = groupMap.getGroup(names[j]);
+                    if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; }
+                    else{
+                        itGroupCounts = groupCounts.find(group);
+                        if (itGroupCounts != groupCounts.end()) {
+                            if (groupCounts[group] < size) {   subset.insert(names[j]);        groupCounts[group]++; }
+                        }
+                    }                          
+                }
+            }else {
+                SubSample sample;
+                CountTable sampledCt = sample.getSample(ct, size, Groups);
+                vector<string> sampledSeqs = sampledCt.getNamesOfSeqs();
+                for (int i = 0; i < sampledSeqs.size(); i++) { subset.insert(sampledSeqs[i]); }
+                
+                string countOutputDir = outputDir;
+                if (outputDir == "") {  countOutputDir += m->hasPath(countfile);  }
+                string countOutputFileName = countOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile);
+                outputTypes["count"].push_back(countOutputFileName);  outputNames.push_back(countOutputFileName);
+                sampledCt.printTable(countOutputFileName);
+            }
                 }else {
-                       
-                       //randomly select a subset of those names to include in the subsample
-                       //since names was randomly shuffled just grab the next one
-                       for (int j = 0; j < names.size(); j++) {
-                               
-                               if (m->control_pressed) { return 0; }
-                               
-                               if (groupfile != "") { //if there is a groupfile given fill in group info
-                                       string group = groupMap->getGroup(names[j]);
-                                       if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; }
-                                       
-                                       if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups
-                                               if (m->inUsersGroups(group, Groups)) {
-                                                       subset.insert(names[j]); 
-                                               }
-                                       }else{
-                                               subset.insert(names[j]); 
-                                       }
-                               }else{ //save everyone, group
-                                       subset.insert(names[j]); 
-                               }                                       
-                       
-                               //do we have enough??
-                               if (subset.size() == size) { break; }
-                       }       
+                       if (countfile == "") {
+                //randomly select a subset of those names to include in the subsample
+                //since names was randomly shuffled just grab the next one
+                for (int j = 0; j < names.size(); j++) {
+                    
+                    if (m->control_pressed) { return 0; }
+                    
+                    if (groupfile != "") { //if there is a groupfile given fill in group info
+                        string group = groupMap.getGroup(names[j]);
+                        if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; }
+                        
+                        if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups
+                            if (m->inUsersGroups(group, Groups)) {  subset.insert(names[j]); }
+                        }else{  subset.insert(names[j]); }
+                    }else{ //save everyone, group
+                        subset.insert(names[j]); 
+                    }                                  
+                    
+                    //do we have enough??
+                    if (subset.size() == size) { break; }
+                }
+            }else {
+                SubSample sample;
+                CountTable sampledCt = sample.getSample(ct, size, Groups, pickedGroups);
+                vector<string> sampledSeqs = sampledCt.getNamesOfSeqs();
+                for (int i = 0; i < sampledSeqs.size(); i++) { subset.insert(sampledSeqs[i]); }
+                
+                string countOutputDir = outputDir;
+                if (outputDir == "") {  countOutputDir += m->hasPath(countfile);  }
+                string countOutputFileName = countOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile);
+                outputTypes["count"].push_back(countOutputFileName);  outputNames.push_back(countOutputFileName);
+                sampledCt.printTable(countOutputFileName);
+            }
                 }
                 
                 if (subset.size() == 0) {  m->mothurOut("The size you selected is too large, skipping fasta file."); m->mothurOutEndLine();  return 0; }
@@ -808,7 +891,7 @@ int SubSampleCommand::processShared(vector<SharedRAbundVector*>& thislookup) {
                 
                 string thisOutputDir = outputDir;
                 if (outputDir == "") {  thisOutputDir += m->hasPath(sharedfile);  }
-               string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile)) + thislookup[0]->getLabel() + getOutputFileNameTag("shared", sharedfile);        
+               string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile)) + thislookup[0]->getLabel() + "." +getOutputFileNameTag("shared", sharedfile);        
          SubSample sample;
          vector<string> subsampledLabels = sample.getSample(thislookup, size);
          
@@ -858,67 +941,76 @@ int SubSampleCommand::getSubSampleList() {
                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
                 set<string> processedLabels;
                 set<string> userLabels = labels;
-               
+
                 ofstream outGroup;
-               GroupMap* groupMap;
+               GroupMap groupMap;
                 if (groupfile != "") {
-                       
-                       groupMap = new GroupMap(groupfile);
-                       groupMap->readMap();
+                       groupMap.readMap(groupfile);
                         
                         //takes care of user setting groupNames that are invalid or setting groups=all
-                       SharedUtil* util = new SharedUtil();
-                       vector<string> namesGroups = groupMap->getNamesOfGroups();
-                       util->setGroups(Groups, namesGroups);
-                       delete util;
+                       SharedUtil util; vector<string> namesGroups = groupMap.getNamesOfGroups(); util.setGroups(Groups, namesGroups);
                         
                         //create outputfiles
                         string groupOutputDir = outputDir;
                         if (outputDir == "") {  groupOutputDir += m->hasPath(groupfile);  }
                         string groupOutputFileName = groupOutputDir + m->getRootName(m->getSimpleName(groupfile)) + "subsample" + m->getExtension(groupfile);
-                       
                         m->openOutputFile(groupOutputFileName, outGroup);
                         outputTypes["group"].push_back(groupOutputFileName);  outputNames.push_back(groupOutputFileName);
                         
                         //file mismatch quit
-                       if (list->getNumSeqs() != groupMap->getNumSeqs()) { 
-                               m->mothurOut("[ERROR]: your list file contains " + toString(list->getNumSeqs()) + " sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct."); 
+                       if (list->getNumSeqs() != groupMap.getNumSeqs()) { 
+                               m->mothurOut("[ERROR]: your list file contains " + toString(list->getNumSeqs()) + " sequences, and your groupfile contains " + toString(groupMap.getNumSeqs()) + ", please correct."); 
+                               m->mothurOutEndLine(); delete list; delete input; out.close(); outGroup.close(); return 0;
+                       }                       
+               }else if (countfile != "") {
+            if (ct.hasGroupInfo()) {
+                SharedUtil util;
+                vector<string> namesGroups = ct.getNamesOfGroups();
+                util.setGroups(Groups, namesGroups);
+            }
+            
+            //file mismatch quit
+                       if (list->getNumSeqs() != ct.getNumUniqueSeqs()) { 
+                               m->mothurOut("[ERROR]: your list file contains " + toString(list->getNumSeqs()) + " sequences, and your count file contains " + toString(ct.getNumUniqueSeqs()) + " unique sequences, please correct."); 
                                 m->mothurOutEndLine();
-                               delete groupMap;
-                               delete list;
-                               delete input;
-                               out.close();
-                               outGroup.close();
                                 return 0;
-                       }                       
-               }
-               
+                       }       
+        }
+
                 //make sure that if your picked groups size is not too big
                 if (persample) {
                         if (size == 0) { //user has not set size, set size = smallest samples size
-                               size = groupMap->getNumSeqs(Groups[0]);
+                               if (countfile == "") { size = groupMap.getNumSeqs(Groups[0]); }
+                else {  size = ct.getGroupCount(Groups[0]);  }
+                
                                 for (int i = 1; i < Groups.size(); i++) {
-                                       int thisSize = groupMap->getNumSeqs(Groups[i]);
+                                       int thisSize = 0;
+                    if (countfile == "") { thisSize = groupMap.getNumSeqs(Groups[i]); }
+                    else {  thisSize = ct.getGroupCount(Groups[i]);  }
                                         
                                         if (thisSize < size) {  size = thisSize;        }
                                 }
                         }else { //make sure size is not too large
                                 vector<string> newGroups;
                                 for (int i = 0; i < Groups.size(); i++) {
-                                       int thisSize = groupMap->getNumSeqs(Groups[i]);
+                                       int thisSize = 0;
+                    if (countfile == "") { thisSize = groupMap.getNumSeqs(Groups[i]); }
+                    else {  thisSize = ct.getGroupCount(Groups[i]);  }
                                         
                                         if (thisSize >= size) { newGroups.push_back(Groups[i]); }
                                         else {  m->mothurOut("You have selected a size that is larger than " + Groups[i] + " number of sequences, removing " + Groups[i] + "."); m->mothurOutEndLine(); }
                                 }
                                 Groups = newGroups;
+                if (newGroups.size() == 0) {  m->mothurOut("[ERROR]: all groups removed."); m->mothurOutEndLine(); m->control_pressed = true; }
                         }
                         
-                       m->mothurOut("Sampling " + toString(size) + " from each group."); m->mothurOutEndLine();        
+                       m->mothurOut("Sampling " + toString(size) + " from each group."); m->mothurOutEndLine();                
                 }else{
-                       if (pickedGroups) {
+            if (pickedGroups) {
                                 int total = 0;
                                 for(int i = 0; i < Groups.size(); i++) {
-                                       total += groupMap->getNumSeqs(Groups[i]);
+                    if (countfile == "") { total += groupMap.getNumSeqs(Groups[i]); }
+                    else {  total += ct.getGroupCount(Groups[i]);  }
                                 }
                                 
                                 if (size == 0) { //user has not set size, set size = 10% samples size
@@ -926,122 +1018,110 @@ int SubSampleCommand::getSubSampleList() {
                                 }
                                 
                                 if (total < size) { 
-                                       m->mothurOut("Your size is too large for the number of groups you selected. Adjusting to " + toString(int (total * 0.10)) + "."); m->mothurOutEndLine();
+                                       if (size != 0) { 
+                                               m->mothurOut("Your size is too large for the number of groups you selected. Adjusting to " + toString(int (total * 0.10)) + "."); m->mothurOutEndLine();
+                                       }
                                         size = int (total * 0.10);
                                 }
                                 
                                 m->mothurOut("Sampling " + toString(size) + " from " + toString(total) + "."); m->mothurOutEndLine();
-                       }else{
-                               
-                               if (size == 0) { //user has not set size, set size = 10% samples size
-                                       size = int (list->getNumSeqs() * 0.10);
+                       }else {
+                if (size == 0) { //user has not set size, set size = 10% samples size
+                                       if (countfile == "") {  size = int (list->getNumSeqs() * 0.10);  }
+                    else { size = int (ct.getNumSeqs() * 0.10);  }
                                 }
                                 
-                               int thisSize = list->getNumSeqs();
+                               int thisSize = 0;
+                if (countfile == "") { thisSize = list->getNumSeqs();  }
+                else { thisSize = ct.getNumSeqs(); }
+                
                                 if (size > thisSize) { m->mothurOut("Your list file only contains " + toString(thisSize) + " sequences. Setting size to " + toString(thisSize) + "."); m->mothurOutEndLine();
                                         size = thisSize;
                                 }
                                 
-                               m->mothurOut("Sampling " + toString(size) + " from " + toString(list->getNumSeqs()) + "."); m->mothurOutEndLine();
-                       }
-               }
-               
-               
-               //fill names
-               for (int i = 0; i < list->getNumBins(); i++) {
-                       string binnames = list->get(i);
-                       
-                       //parse names
-                       string individual = "";
-                       int length = binnames.length();
-                       for(int j=0;j<length;j++){
-                               if(binnames[j] == ','){
-                                       
-                                       if (groupfile != "") { //if there is a groupfile given fill in group info
-                                               string group = groupMap->getGroup(individual);
-                                               if (group == "not found") { m->mothurOut("[ERROR]: " + individual + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; }
-                                               
-                                               if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups
-                                                       if (m->inUsersGroups(group, Groups)) {
-                                                               names.push_back(individual);
-                                                       }
-                                               }else{
-                                                       names.push_back(individual);
-                                               }
-                                       }else{ //save everyone, group
-                                               names.push_back(individual);
-                                       }
-                                       individual = "";                                
-                               }
-                               else{
-                                       individual += binnames[j];
-                               }
-                       }
-                       //save last name
-                       if (groupfile != "") { //if there is a groupfile given fill in group info
-                               string group = groupMap->getGroup(individual);
-                               if (group == "not found") { m->mothurOut("[ERROR]: " + individual + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; }
-                               
-                               if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups
-                                       if (m->inUsersGroups(group, Groups)) {
-                                               names.push_back(individual);
-                                       }
-                               }else{
-                                       names.push_back(individual);
-                               }
-                       }else{ //save everyone, group
-                               names.push_back(individual);
-                       }
-               }
-               
-               random_shuffle(names.begin(), names.end());
-                       
-               //randomly select a subset of those names to include in the subsample
-               set<string> subset; //dont want repeat sequence names added
-               if (persample) {
-                       //initialize counts
-                       map<string, int> groupCounts;
-                       map<string, int>::iterator itGroupCounts;
-                       for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; }
-                       
-                       for (int j = 0; j < names.size(); j++) {
-                               
-                               if (m->control_pressed) { return 0; }
-                               
-                               string group = groupMap->getGroup(names[j]);
-                               if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; }
-                               else{
-                                       itGroupCounts = groupCounts.find(group);
-                                       if (itGroupCounts != groupCounts.end()) {
-                                               if (groupCounts[group] < size) {        subset.insert(names[j]);        groupCounts[group]++; }
-                                       }
-                               }                               
-                       }
-               }else{
-                       for (int j = 0; j < size; j++) {
-                               
-                               if (m->control_pressed) { break; }
-                               
-                               subset.insert(names[j]); 
-                       }       
-               }
-               
-               if (groupfile != "") { 
-                       //write out new groupfile
-                       for (set<string>::iterator it = subset.begin(); it != subset.end(); it++) {
-                               string group = groupMap->getGroup(*it);
-                               if (group == "not found") { group = "NOTFOUND"; }
-                               
-                               outGroup << *it << '\t' << group << endl;
-                       }
-                       outGroup.close(); delete groupMap; 
-               }
+                               m->mothurOut("Sampling " + toString(size) + " from " + toString(thisSize) + "."); m->mothurOutEndLine();
+            }
+        }
                 
+        set<string> subset; //dont want repeat sequence names added
+               if (countfile == "") {
+            //fill names
+            for (int i = 0; i < list->getNumBins(); i++) {
+                string binnames = list->get(i);
+                vector<string> thisBin;
+                m->splitAtComma(binnames, thisBin);
+                
+                for(int j=0;j<thisBin.size();j++){
+                    if (groupfile != "") { //if there is a groupfile given fill in group info
+                        string group = groupMap.getGroup(thisBin[j]);
+                        if (group == "not found") { m->mothurOut("[ERROR]: " + thisBin[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; }
+                        
+                                               //if hte user picked groups, we only want to keep the names of sequences from those groups
+                                               if (pickedGroups) { if (m->inUsersGroups(group, Groups)) { names.push_back(thisBin[j]); }  }
+                                               else{ names.push_back(thisBin[j]); } 
+                    }//save everyone, group
+                    else{ names.push_back(thisBin[j]); }
+                }
+            }
+            
+            random_shuffle(names.begin(), names.end());
+                       
+            //randomly select a subset of those names to include in the subsample
+            if (persample) {
+                //initialize counts
+                map<string, int> groupCounts;
+                map<string, int>::iterator itGroupCounts;
+                for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; }
+                
+                for (int j = 0; j < names.size(); j++) {
+                    
+                    if (m->control_pressed) { delete list; delete input;  return 0; }
+                    
+                    string group = groupMap.getGroup(names[j]);
+                    if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; }
+                    else{
+                        itGroupCounts = groupCounts.find(group);
+                        if (itGroupCounts != groupCounts.end()) {
+                            if (groupCounts[group] < size) {   subset.insert(names[j]);        groupCounts[group]++; }
+                        }
+                    }                          
+                }
+            }else{
+                for (int j = 0; j < size; j++) {
+                    if (m->control_pressed) { break; }
+                    subset.insert(names[j]); 
+                }      
+            }
+            
+            if (groupfile != "") { 
+                //write out new groupfile
+                for (set<string>::iterator it = subset.begin(); it != subset.end(); it++) {
+                    string group = groupMap.getGroup(*it);
+                    if (group == "not found") { group = "NOTFOUND"; }
+                    outGroup << *it << '\t' << group << endl;
+                }
+                outGroup.close(); 
+            }
+               }else {
+            SubSample sample; CountTable sampledCt;
+            
+            if (persample)  { sampledCt = sample.getSample(ct, size, Groups);               }
+            else            { sampledCt = sample.getSample(ct, size, Groups, pickedGroups); }
+            
+            vector<string> sampledSeqs = sampledCt.getNamesOfSeqs();
+            for (int i = 0; i < sampledSeqs.size(); i++) { subset.insert(sampledSeqs[i]); }
+        
+            string countOutputDir = outputDir;
+            if (outputDir == "") {  countOutputDir += m->hasPath(countfile);  }
+            string countOutputFileName = countOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile);
+            outputTypes["count"].push_back(countOutputFileName);  outputNames.push_back(countOutputFileName);
+            sampledCt.printTable(countOutputFileName);
+        }
                                                 
                 //as long as you are not at the end of the file or done wih the lines you want
                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
                         
-                       if (m->control_pressed) {  delete list; delete input; out.close();  return 0;  }
+                       if (m->control_pressed) {  delete list; delete input;  out.close();  return 0;  }
                         
                         if(allLines == 1 || labels.count(list->getLabel()) == 1){                       
                                 
@@ -1132,22 +1212,12 @@ int SubSampleCommand::processList(ListVector*& list, ofstream& out, set<string>&
                         
                         if (m->control_pressed) { break; }
                         
-                       string binnames = list->get(i);
-                       
-                       //parse names
-                       string individual = "";
-                       string newNames = "";
-                       int length = binnames.length();
-                       for(int j=0;j<length;j++){
-                               if(binnames[j] == ','){
-                                       if (subset.count(individual) != 0) {  newNames += individual + ",";  }
-                                       individual = "";                                
-                               }else{
-                                       individual += binnames[j];
-                               }
-                       }
-                       if (subset.count(individual) != 0) {  newNames += individual + ",";  }
+                       string bin = list->get(i);
+            vector<string> binnames;
+            m->splitAtComma(bin, binnames);
                         
+            string newNames = "";
+                       for(int j=0;j<binnames.size();j++){ if (subset.count(binnames[j]) != 0) {  newNames += binnames[j] + ",";  } }
                         
                         //if there are names in this bin add to new list
                         if (newNames != "") { 
diff --git a/subsamplecommand.h b/subsamplecommand.h

index c746e144119dc4d4c0d45ef10872b1f7c9951129..d1fb1226236dc3842b9ace53536cf95a266f1d5e 100644 (file)
--- a/subsamplecommand.h
+++ b/subsamplecommand.h
@@ -16,6 +16,7 @@
  #include "rabundvector.hpp"
  #include "inputdata.h"
  #include "sequence.hpp"
+#include "counttable.h"
  
  
  class SubSampleCommand : public Command {
@@ -38,13 +39,14 @@ public:
         
  private:       
         bool abort, pickedGroups, allLines, persample;
-       string listfile, groupfile, sharedfile, rabundfile, sabundfile, fastafile, namefile;
+       string listfile, groupfile, countfile, sharedfile, rabundfile, sabundfile, fastafile, namefile;
         set<string> labels; //holds labels to be used
         string groups, label, outputDir;
         vector<string> Groups, outputNames;
         int size;
         vector<string> names;
         map<string, vector<string> > nameMap;
+    CountTable ct;
         
         int getSubSampleShared();
         int getSubSampleList();
diff --git a/summaryqualcommand.cpp b/summaryqualcommand.cpp

index 5d7971349656d4400e5d41a24c15e576ccdf57f7..5a073677804ae2e3176f08a3d4ff1e26a1bc5c03 100644 (file)
--- a/summaryqualcommand.cpp
+++ b/summaryqualcommand.cpp
@@ -8,13 +8,14 @@
   */
  
  #include "summaryqualcommand.h"
-
+#include "counttable.h"
  
  //**********************************************************************************************************************
  vector<string> SummaryQualCommand::setParameters(){    
         try {
                 CommandParameter pqual("qfile", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pqual);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
+               CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount);
                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
@@ -32,9 +33,10 @@ vector<string> SummaryQualCommand::setParameters(){
  string SummaryQualCommand::getHelpString(){    
         try {
                 string helpString = "";
-               helpString += "The summary.qual command reads a quality file and an optional name file, and summarizes the quality information.\n";
-               helpString += "The summary.tax command parameters are qfile, name and processors. qfile is required, unless you have a valid current quality file.\n";
+               helpString += "The summary.qual command reads a quality file and an optional name or count file, and summarizes the quality information.\n";
+               helpString += "The summary.tax command parameters are qfile, name, count and processors. qfile is required, unless you have a valid current quality file.\n";
                 helpString += "The name parameter allows you to enter a name file associated with your quality file. \n";
+        helpString += "The count parameter allows you to enter a count file associated with your quality file. \n";
                 helpString += "The summary.qual command should be in the following format: \n";
                 helpString += "summary.qual(qfile=yourQualityFile) \n";
                 helpString += "Note: No spaces between parameter labels (i.e. qfile), '=' and parameters (i.e.yourQualityFile).\n";     
@@ -122,6 +124,14 @@ SummaryQualCommand::SummaryQualCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
                         
                         //initialize outputTypes
@@ -141,6 +151,13 @@ SummaryQualCommand::SummaryQualCommand(string option)  {
                         if (namefile == "not open") { namefile = ""; abort = true; }
                         else if (namefile == "not found") { namefile = "";  }   
                         else { m->setNameFile(namefile); }
+            
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not open") { abort = true; countfile = ""; }  
+                       else if (countfile == "not found") { countfile = ""; }
+                       else { m->setCountTableFile(countfile); }
+                       
+            if ((countfile != "") && (namefile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
                         
                         //if the user changes the output directory command factory will send this info to us in the output parameter 
                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
@@ -152,10 +169,13 @@ SummaryQualCommand::SummaryQualCommand(string option)  {
                         m->setProcessors(temp);
                         m->mothurConvert(temp, processors);     
                         
-                       if (namefile == "") {
-                               vector<string> files; files.push_back(qualfile);
-                               parser.getNameFile(files);
-                       }
+            
+                       if (countfile == "") {
+                if (namefile == "") {
+                    vector<string> files; files.push_back(qualfile);
+                    parser.getNameFile(files);
+                }
+            }
                 }
         }
         catch(exception& e) {
@@ -179,7 +199,12 @@ int SummaryQualCommand::execute(){
                 if (m->control_pressed) { return 0; }
                 
                 if (namefile != "") { nameMap = m->readNames(namefile); }
-               
+               else if (countfile != "") {
+            CountTable ct;
+            ct.readTable(countfile);
+            nameMap = ct.getNameMap();
+        }
+        
                 vector<unsigned long long> positions; 
  #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
                 positions = m->divideFile(qualfile, processors);
@@ -257,7 +282,7 @@ int SummaryQualCommand::driverCreateSummary(vector<int>& position, vector<int>&
                         if (current.getName() != "") {
                                 
                                 int num = 1;
-                               if (namefile != "") {
+                               if ((namefile != "") || (countfile != "")) {
                                         //make sure this sequence is in the namefile, else error 
                                         map<string, int>::iterator it = nameMap.find(current.getName());
                                         
@@ -400,11 +425,14 @@ int SummaryQualCommand::createProcessesCreateSummary(vector<int>& position, vect
                 DWORD   dwThreadIdArray[processors];
                 HANDLE  hThreadArray[processors]; 
                 
+        bool hasNameMap = false;
+        if ((namefile !="") || (countfile != "")) { hasNameMap = true; }
+        
                 //Create processor worker threads.
                 for( int i=0; i<processors; i++ ){
                         
                         // Allocate memory for thread data.
-                       seqSumQualData* tempSum = new seqSumQualData(filename, m, lines[i].start, lines[i].end, namefile, nameMap);
+                       seqSumQualData* tempSum = new seqSumQualData(filename, m, lines[i].start, lines[i].end, hasNameMap, nameMap);
                         pDataArray.push_back(tempSum);
                         processIDS.push_back(i);
          
@@ -457,7 +485,7 @@ int SummaryQualCommand::printQual(string sumFile, vector<int>& position, vector<
                         
                         if (m->control_pressed) { out.close(); return 0; }
                         
-                       float average = averageQ[i] / (float) position[i];
+                       double average = averageQ[i] / (float) position[i];
                         out << i << '\t' << position[i] << '\t' << average << '\t';
                         
                         for (int j = 0; j < 41; j++) {
diff --git a/summaryqualcommand.h b/summaryqualcommand.h

index 31390b4296ff7659fdfb2946d3f37ff33b2877b7..ac65938bc23075fc7db7cb40b9ce6d05d0d5d64e 100644 (file)
--- a/summaryqualcommand.h
+++ b/summaryqualcommand.h
@@ -35,7 +35,7 @@ public:
         
  private:
         bool abort;
-       string qualfile, outputDir, namefile;
+       string qualfile, outputDir, namefile, countfile;
         vector<string> outputNames;
         map<string, int> nameMap;
         int processors;
@@ -62,20 +62,21 @@ struct seqSumQualData {
         vector<int> position;
         vector<int> averageQ;
         vector< vector<int> > scores; 
-       string filename, namefile; 
+       string filename; 
         unsigned long long start;
         unsigned long long end;
         int count;
         MothurOut* m;
+    bool hasNameMap;
         map<string, int> nameMap;
         
         ~seqSumQualData(){}
-       seqSumQualData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, string n, map<string, int> nam) {
+       seqSumQualData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, bool n, map<string, int> nam) {
                 filename = f;
                 m = mout;
                 start = st;
                 end = en;
-               namefile = n;
+               hasNameMap = n;
                 nameMap = nam;
                 count = 0;
         }
@@ -109,7 +110,7 @@ static DWORD WINAPI MySeqSumQualThreadFunction(LPVOID lpParam){
                         if (current.getName() != "") {
                         
                                 int num = 1;
-                               if (pDataArray->namefile != "") {
+                               if (pDataArray->hasNameMap) {
                                         //make sure this sequence is in the namefile, else error 
                                         map<string, int>::iterator it = pDataArray->nameMap.find(current.getName());
                                         
diff --git a/summarytaxcommand.cpp b/summarytaxcommand.cpp

index 3e16e74005c5f0b7cfc62d32df7c172be805e511..e932eee49c2477de737caf62d143a5140aea4340 100644 (file)
--- a/summarytaxcommand.cpp
+++ b/summarytaxcommand.cpp
@@ -14,8 +14,9 @@
  vector<string> SummaryTaxCommand::setParameters(){     
         try {
                 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptaxonomy);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
+        CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
                 CommandParameter preftaxonomy("reftaxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(preftaxonomy);
                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
@@ -34,9 +35,10 @@ string SummaryTaxCommand::getHelpString(){
         try {
                 string helpString = "";
                 helpString += "The summary.tax command reads a taxonomy file and an optional name file, and summarizes the taxonomy information.\n";
-               helpString += "The summary.tax command parameters are taxonomy, group and name. taxonomy is required, unless you have a valid current taxonomy file.\n";
+               helpString += "The summary.tax command parameters are taxonomy, count, group and name. taxonomy is required, unless you have a valid current taxonomy file.\n";
                 helpString += "The name parameter allows you to enter a name file associated with your taxonomy file. \n";
                 helpString += "The group parameter allows you add a group file so you can have the summary totals broken up by group.\n";
+        helpString += "The count parameter allows you add a count file so you can have the summary totals broken up by group.\n";
                 helpString += "The reftaxonomy parameter allows you give the name of the reference taxonomy file used when you classified your sequences. It is not required, but providing it will keep the rankIDs in the summary file static.\n";
                 helpString += "The summary.tax command should be in the following format: \n";
                 helpString += "summary.tax(taxonomy=yourTaxonomyFile) \n";
@@ -142,6 +144,14 @@ SummaryTaxCommand::SummaryTaxCommand(string option)  {
                                         if (path == "") {       parameters["reftaxonomy"] = inputDir + it->second;              }
                                 }
                                 
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
+
                         }
                         
                         //initialize outputTypes
@@ -166,7 +176,20 @@ SummaryTaxCommand::SummaryTaxCommand(string option)  {
                         if (groupfile == "not open") { groupfile = ""; abort = true; }
                         else if (groupfile == "not found") { groupfile = ""; }
                         else { m->setGroupFile(groupfile); }
+            
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not open") { countfile = ""; abort = true; }
+                       else if (countfile == "not found") { countfile = "";  } 
+                       else { m->setCountTableFile(countfile); }
+            
+            if ((namefile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
+            }
                         
+            if ((groupfile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
+            }
+            
                         refTaxonomy = validParameter.validFile(parameters, "reftaxonomy", true);
                         if (refTaxonomy == "not found") { refTaxonomy = ""; m->mothurOut("reftaxonomy is not required, but if given will keep the rankIDs in the summary file static."); m->mothurOutEndLine(); }
                         else if (refTaxonomy == "not open") { refTaxonomy = ""; abort = true; }
@@ -177,11 +200,12 @@ SummaryTaxCommand::SummaryTaxCommand(string option)  {
                                 outputDir += m->hasPath(taxfile); //if user entered a file with a path then preserve it 
                         }
                         
-                       if (namefile == "") {
-                               vector<string> files; files.push_back(taxfile);
-                               parser.getNameFile(files);
+            if (countfile == "") {
+                if (namefile == "") {
+                    vector<string> files; files.push_back(taxfile);
+                    parser.getNameFile(files);
+                }
                         }
-                       
                 }
         }
         catch(exception& e) {
@@ -197,23 +221,35 @@ int SummaryTaxCommand::execute(){
                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
                 int start = time(NULL);
                 
-               PhyloSummary* taxaSum;
-               if (refTaxonomy != "") {
-                       taxaSum = new PhyloSummary(refTaxonomy, groupfile);
-               }else {
-                       taxaSum = new PhyloSummary(groupfile);
-               }
+        GroupMap* groupMap = NULL;
+        CountTable* ct = NULL;
+        if (groupfile != "") {
+            groupMap = new GroupMap(groupfile);
+            groupMap->readMap();
+        }else if (countfile != "") {
+            ct = new CountTable();
+            ct->readTable(countfile);
+        }
                 
-               if (m->control_pressed) { delete taxaSum; return 0; }
+        PhyloSummary* taxaSum;
+        if (countfile != "") {
+            if (refTaxonomy != "") { taxaSum = new PhyloSummary(refTaxonomy, ct); }
+            else { taxaSum = new PhyloSummary(ct); }
+        }else {
+            if (refTaxonomy != "") { taxaSum = new PhyloSummary(refTaxonomy, groupMap); }
+            else { taxaSum = new PhyloSummary(groupMap); }
+               }
+        
+               if (m->control_pressed) { if (groupMap != NULL) { delete groupMap; } if (ct != NULL) { delete ct; } delete taxaSum; return 0; }
                 
                 int numSeqs = 0;
-               if (namefile == "") { numSeqs = taxaSum->summarize(taxfile);  }
-               else {
+               if ((namefile == "") || (countfile != "")) { numSeqs = taxaSum->summarize(taxfile);  }
+               else if (namefile != "") {
                         map<string, vector<string> > nameMap;
                         map<string, vector<string> >::iterator itNames;
                         m->readNames(namefile, nameMap);
                         
-                       if (m->control_pressed) { delete taxaSum; return 0; }
+                       if (m->control_pressed) { if (groupMap != NULL) { delete groupMap; } if (ct != NULL) { delete ct; } delete taxaSum; return 0; }
                         
                         ifstream in;
                         m->openInputFile(taxfile, in);
@@ -222,6 +258,9 @@ int SummaryTaxCommand::execute(){
                         string name, taxon;
                         
                         while(!in.eof()){
+                
+                if (m->control_pressed) { break; }
+                
                                 in >> name >> taxon; m->gobble(in);
                                 
                                 itNames = nameMap.find(name);
@@ -240,7 +279,7 @@ int SummaryTaxCommand::execute(){
                         in.close();
                 }
                 
-               if (m->control_pressed) {  delete taxaSum; return 0; }
+               if (m->control_pressed) {  if (groupMap != NULL) { delete groupMap; } if (ct != NULL) { delete ct; } delete taxaSum; return 0; }
                 
                 //print summary file
                 ofstream outTaxTree;
@@ -250,6 +289,7 @@ int SummaryTaxCommand::execute(){
                 outTaxTree.close();
                 
                 delete taxaSum;
+        if (groupMap != NULL) { delete groupMap; } if (ct != NULL) { delete ct; }
                 
                 if (m->control_pressed) {  m->mothurRemove(summaryFile); return 0; }
                 
diff --git a/summarytaxcommand.h b/summarytaxcommand.h

index 5f0630f66a03ba9f08a2dd4d2098808a513f0a3c..e8033e23c76837809aa7828389b9cbae13b9c639 100644 (file)
--- a/summarytaxcommand.h
+++ b/summarytaxcommand.h
@@ -11,6 +11,7 @@
   */
  
  #include "command.hpp"
+#include "counttable.h"
  
  /**************************************************************************************************/
  
@@ -33,7 +34,7 @@ class SummaryTaxCommand : public Command {
                 
         private:
                 bool abort;
-               string taxfile, outputDir, namefile, groupfile, refTaxonomy;
+               string taxfile, outputDir, namefile, groupfile, refTaxonomy, countfile;
                 vector<string> outputNames;
                 map<string, int> nameMap;
  };
diff --git a/taxonomynode.cpp b/taxonomynode.cpp

new file mode 100755 (executable)

index 0000000..b90bda1
--- /dev/null
+++ b/taxonomynode.cpp
@@ -0,0 +1,72 @@
+/*
+ *  taxonomynode.cpp
+ *  
+ *
+ *  Created by Pat Schloss on 7/8/11.
+ *  Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+/**************************************************************************************************/
+
+#include "taxonomynode.h"
+
+/**************************************************************************************************/
+
+TaxonomyNode::TaxonomyNode(string n, int l): name(n), level(l){   //build a node named n at level l with no parent, children or sequences yet
+    m = MothurOut::getInstance();
+       parent = -1;                    //no parent recorded yet; setParent() supplies the real value
+       numChildren = 0;
+       numSeqs = 0;    totalSeqs = 0;  //totalSeqs used to be left indeterminate until setTotalSeqs() ran
+}
+
+/**************************************************************************************************/
+
+void TaxonomyNode::setName(string n) { this->name = n; }       //overwrite the stored name with n
+
+/**************************************************************************************************/
+
+string TaxonomyNode::getName() { return this->name; }  //returns a copy of the stored name
+
+/**************************************************************************************************/
+
+void TaxonomyNode::setParent(int p) { this->parent = p; }      //record p as this node's parent value
+
+/**************************************************************************************************/
+
+int TaxonomyNode::getParent() { return this->parent; } //parent value; still -1 if setParent() was never called
+
+/**************************************************************************************************/
+
+void TaxonomyNode::makeChild(string c, int i) { this->children[c] = i; }       //map child name c to i, replacing any earlier entry for c
+
+
+/**************************************************************************************************/
+
+map<string, int> TaxonomyNode::getChildren() { return this->children; }        //returns a copy of the name -> int child map
+
+/**************************************************************************************************/
+
+int TaxonomyNode::getChildIndex(string c){
+       map<string, int>::iterator it = children.find(c);
+       if(it != children.end())        {       return it->second;                      }
+       else                                            {       return -1;                                      }       
+}
+
+/**************************************************************************************************/
+
+int TaxonomyNode::getNumKids() { return static_cast<int>(children.size()); }   //count comes from the children map, not from numChildren
+
+/**************************************************************************************************/
+
+int TaxonomyNode::getNumSeqs() { return this->numSeqs; }       //reads numSeqs (not totalSeqs)
+
+/**************************************************************************************************/
+
+void TaxonomyNode::setTotalSeqs(int n) { this->totalSeqs = n; }        //stores n in totalSeqs; numSeqs is left untouched
+
+/**************************************************************************************************/
+
+int TaxonomyNode::getLevel() { return this->level; }   //level fixed at construction; this class has no setter for it
+
+/**************************************************************************************************/
diff --git a/taxonomynode.h b/taxonomynode.h

new file mode 100755 (executable)

index 0000000..08bad3e
--- /dev/null
+++ b/taxonomynode.h
@@ -0,0 +1,53 @@
+#ifndef TAXONOMYNODE
+#define TAXONOMYNODE
+
+/*
+ *  taxonomynode.h
+ *  
+ *
+ *  Created by Pat Schloss on 7/8/11.
+ *  Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+/**************************************************************************************************/
+
+#include "mothurout.h"
+/**************************************************************************************************/
+
+class TaxonomyNode {
+       //A named node of a tree: stores its level, one int for its parent and a name -> int map for its children.
+public:
+       TaxonomyNode();                                 //declared only; taxonomynode.cpp does not define it
+       TaxonomyNode(string, int);              //(name, level); parent starts at -1, numChildren and numSeqs at 0
+       void setName(string);                   //replaces the stored name
+       string getName();                               //returns the stored name by value
+
+
+       void setParent(int);                    //stores the parent value (presumably an index into a node list kept elsewhere - TODO confirm)
+       int getParent();                                //-1 until setParent() is called
+       
+       void makeChild(string, int);    //children[name] = value; an existing entry for name is overwritten
+       map<string, int> getChildren(); //returns a copy of the whole child map
+       int getChildIndex(string);              //value stored for that child name, or -1 if the name is not present
+       int     getNumKids();                   //children.size() cast to int
+       int getNumSeqs();                               //returns numSeqs
+       void setTotalSeqs(int);                 //assigns totalSeqs; no matching getter is declared in this class
+       int getLevel();                                 //returns the level given to the constructor
+       
+private:
+       int parent;
+       map<string, int> children;              //child name -> int handed to makeChild()
+       int numChildren;                                //zeroed by the constructor; taxonomynode.cpp never updates it (getNumKids() uses children.size())
+       int level;
+       
+protected:
+    MothurOut* m;                                      //set from MothurOut::getInstance() in the (string, int) constructor
+       int numSeqs;                                    //zeroed by the constructor; no member function in taxonomynode.cpp modifies it afterwards
+       int totalSeqs;                                  //assigned by setTotalSeqs()
+       string name;
+};
+
+/**************************************************************************************************/
+
+#endif
diff --git a/tree.cpp b/tree.cpp

index 44ecadd534b60d60b602d9e052259ac69709f9d3..0bd98e01f1c305d93851e1a5e465c92a22fc9a1f 100644 (file)
--- a/tree.cpp
+++ b/tree.cpp
@@ -10,7 +10,7 @@
  #include "tree.h"
  
  /*****************************************************************/
-Tree::Tree(int num, TreeMap* t) : tmap(t) {
+Tree::Tree(int num, CountTable* t) : ct(t) {
         try {
                 m = MothurOut::getInstance();
                 
@@ -36,21 +36,20 @@ Tree::Tree(string g) { //do not use tree generated by this its just to extract t
         }
  }
  /*****************************************************************/
-Tree::Tree(TreeMap* t) : tmap(t) {
+Tree::Tree(CountTable* t) : ct(t) {
         try {
                 m = MothurOut::getInstance();
                 
                 if (m->runParse == true) {  parseTreeFile();  m->runParse = false;  }
-//for(int i = 0; i <   globaldata->Treenames.size(); i++) { cout << i << '\t' << globaldata->Treenames[i] << endl;  }  
+
                 numLeaves = m->Treenames.size();
                 numNodes = 2*numLeaves - 1;
                 
                 tree.resize(numNodes);
                         
                 //initialize groupNodeInfo
-               for (int i = 0; i < (tmap->getNamesOfGroups()).size(); i++) {
-                       groupNodeInfo[(tmap->getNamesOfGroups())[i]].resize(0);
-               }
+        vector<string> namesOfGroups = ct->getNamesOfGroups();
+               for (int i = 0; i < namesOfGroups.size(); i++) {  groupNodeInfo[namesOfGroups[i]].resize(0);  }
                 
                 //initialize tree with correct number of nodes, name and group info.
                 for (int i = 0; i < numNodes; i++) {
@@ -59,19 +58,35 @@ Tree::Tree(TreeMap* t) : tmap(t) {
                                 tree[i].setName(m->Treenames[i]);
                                 
                                 //save group info
-                               string group = tmap->getGroup(m->Treenames[i]);
-                               
-                               vector<string> tempGroups; tempGroups.push_back(group);
-                               tree[i].setGroup(tempGroups);
-                               groupNodeInfo[group].push_back(i); 
-                               
-                               //set pcount and pGroup for groupname to 1.
-                               tree[i].pcount[group] = 1;
-                               tree[i].pGroups[group] = 1;
-                               
-                               //Treemap knows name, group and index to speed up search
-                               tmap->setIndex(m->Treenames[i], i);
-       
+                int maxPars = 1;
+                               vector<string> group;
+                vector<int> counts = ct->getGroupCounts(m->Treenames[i]);
+                               for (int j = 0; j < namesOfGroups.size(); j++) {  
+                    if (counts[j] != 0) { //you have seqs from this group
+                        groupNodeInfo[namesOfGroups[j]].push_back(i);
+                        group.push_back(namesOfGroups[j]);
+                        tree[i].pGroups[namesOfGroups[j]] = counts[j];
+                        tree[i].pcount[namesOfGroups[j]] = counts[j];
+                        //keep highest group
+                                               if(counts[j] > maxPars){ maxPars = counts[j]; }
+                    }  
+                }
+                               tree[i].setGroup(group);
+                               setIndex(m->Treenames[i], i);
+                
+                if (maxPars > 1) { //then we have some more dominant groups
+                                       //erase all the groups that are less than maxPars because you found a more dominant group.
+                                       for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();){
+                                               if(it->second < maxPars){
+                                                       tree[i].pGroups.erase(it++);
+                                               }else { it++; }
+                                       }
+                                       //set one remaining groups to 1
+                                       for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();it++){
+                                               tree[i].pGroups[it->first] = 1;
+                                       }
+                               }//end if
+                
                         //intialize non leaf nodes
                         }else if (i > (numLeaves-1)) {
                                 tree[i].setName("");
@@ -87,7 +102,7 @@ Tree::Tree(TreeMap* t) : tmap(t) {
         }
  }
  /*****************************************************************/
-Tree::Tree(TreeMap* t, vector< vector<double> >& sims) : tmap(t) {
+Tree::Tree(CountTable* t, vector< vector<double> >& sims) : ct(t) {
         try {
                 m = MothurOut::getInstance();
                 
@@ -98,9 +113,8 @@ Tree::Tree(TreeMap* t, vector< vector<double> >& sims) : tmap(t) {
                 tree.resize(numNodes);
          
                 //initialize groupNodeInfo
-               for (int i = 0; i < (tmap->getNamesOfGroups()).size(); i++) {
-                       groupNodeInfo[(tmap->getNamesOfGroups())[i]].resize(0);
-               }
+        vector<string> namesOfGroups = ct->getNamesOfGroups();
+               for (int i = 0; i < namesOfGroups.size(); i++) {  groupNodeInfo[namesOfGroups[i]].resize(0);  }
                 
                 //initialize tree with correct number of nodes, name and group info.
                 for (int i = 0; i < numNodes; i++) {
@@ -109,18 +123,34 @@ Tree::Tree(TreeMap* t, vector< vector<double> >& sims) : tmap(t) {
                                 tree[i].setName(m->Treenames[i]);
                                 
                                 //save group info
-                               string group = tmap->getGroup(m->Treenames[i]);
-                               
-                               vector<string> tempGroups; tempGroups.push_back(group);
-                               tree[i].setGroup(tempGroups);
-                               groupNodeInfo[group].push_back(i); 
-                               
-                               //set pcount and pGroup for groupname to 1.
-                               tree[i].pcount[group] = 1;
-                               tree[i].pGroups[group] = 1;
-                               
-                               //Treemap knows name, group and index to speed up search
-                               tmap->setIndex(m->Treenames[i], i);
+                int maxPars = 1;
+                               vector<string> group;
+                vector<int> counts = ct->getGroupCounts(m->Treenames[i]);
+                               for (int j = 0; j < namesOfGroups.size(); j++) {  
+                    if (counts[j] != 0) { //you have seqs from this group
+                        groupNodeInfo[namesOfGroups[j]].push_back(i);
+                        group.push_back(namesOfGroups[j]);
+                        tree[i].pGroups[namesOfGroups[j]] = counts[j];
+                        tree[i].pcount[namesOfGroups[j]] = counts[j];
+                        //keep highest group
+                                               if(counts[j] > maxPars){ maxPars = counts[j]; }
+                    }  
+                }
+                               tree[i].setGroup(group);
+                               setIndex(m->Treenames[i], i);
+                
+                if (maxPars > 1) { //then we have some more dominant groups
+                                       //erase all the groups that are less than maxPars because you found a more dominant group.
+                                       for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();){
+                                               if(it->second < maxPars){
+                                                       tree[i].pGroups.erase(it++);
+                                               }else { it++; }
+                                       }
+                                       //set the remaining groups to 1
+                                       for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();it++){
+                                               tree[i].pGroups[it->first] = 1;
+                                       }
+                               }//end if
                  
                  //intialize non leaf nodes
                         }else if (i > (numLeaves-1)) {
@@ -129,11 +159,12 @@ Tree::Tree(TreeMap* t, vector< vector<double> >& sims) : tmap(t) {
                                 tree[i].setGroup(tempGroups);
                         }
                 }
+
          
          //build tree from matrix
          //initialize indexes
-        map<int, int> indexes;  //maps row in simMatrix to vector index in the tree
-        for (int g = 0; g < numLeaves; g++) {  indexes[g] = g; }
+        map<int, int> thisIndexes;  //maps row in simMatrix to vector index in the tree
+        for (int g = 0; g < numLeaves; g++) {  thisIndexes[g] = g;     }
                 
                 //do merges and create tree structure by setting parents and children
                 //there are numGroups - 1 merges to do
@@ -152,26 +183,26 @@ Tree::Tree(TreeMap* t, vector< vector<double> >& sims) : tmap(t) {
              
                         //set non-leaf node info and update leaves to know their parents
                         //non-leaf
-                       tree[numLeaves + i].setChildren(indexes[row], indexes[column]);
+                       tree[numLeaves + i].setChildren(thisIndexes[row], thisIndexes[column]);
                         
                         //parents
-                       tree[indexes[row]].setParent(numLeaves + i);
-                       tree[indexes[column]].setParent(numLeaves + i);
+                       tree[thisIndexes[row]].setParent(numLeaves + i);
+                       tree[thisIndexes[column]].setParent(numLeaves + i);
                         
                         //blength = distance / 2;
                         float blength = ((1.0 - largest) / 2);
                         
                         //branchlengths
-                       tree[indexes[row]].setBranchLength(blength - tree[indexes[row]].getLengthToLeaves());
-                       tree[indexes[column]].setBranchLength(blength - tree[indexes[column]].getLengthToLeaves());
+                       tree[thisIndexes[row]].setBranchLength(blength - tree[thisIndexes[row]].getLengthToLeaves());
+                       tree[thisIndexes[column]].setBranchLength(blength - tree[thisIndexes[column]].getLengthToLeaves());
                         
                         //set your length to leaves to your childs length plus branchlength
-                       tree[numLeaves + i].setLengthToLeaves(tree[indexes[row]].getLengthToLeaves() + tree[indexes[row]].getBranchLength());
+                       tree[numLeaves + i].setLengthToLeaves(tree[thisIndexes[row]].getLengthToLeaves() + tree[thisIndexes[row]].getBranchLength());
                         
                         
                         //update index 
-                       indexes[row] = numLeaves+i;
-                       indexes[column] = numLeaves+i;
+                       thisIndexes[row] = numLeaves+i;
+                       thisIndexes[column] = numLeaves+i;
                         
                         //remove highest value that caused the merge.
                         sims[row][column] = -1000.0;
@@ -200,7 +231,7 @@ Tree::Tree(TreeMap* t, vector< vector<double> >& sims) : tmap(t) {
  }
  /*****************************************************************/
  Tree::~Tree() {}
-/*****************************************************************/
+/*****************************************************************
  void Tree::addNamesToCounts(map<string, string> nameMap) {
         try {
                 //ex. seq1      seq2,seq3,se4
@@ -297,15 +328,15 @@ void Tree::addNamesToCounts(map<string, string> nameMap) {
                 m->errorOut(e, "Tree", "addNamesToCounts");
                 exit(1);
         }
-}
+}*/
  /*****************************************************************/
  int Tree::getIndex(string searchName) {
         try {
-               //Treemap knows name, group and index to speed up search
-               // getIndex function will return the vector index or -1 if seq is not found.
-               int index = tmap->getIndex(searchName);
-               return index;
-               
+        map<string, int>::iterator itIndex = indexes.find(searchName);
+        if (itIndex != indexes.end()) {
+            return itIndex->second;
+        }
+               return -1;
         }
         catch(exception& e) {
                 m->errorOut(e, "Tree", "getIndex");
@@ -316,8 +347,10 @@ int Tree::getIndex(string searchName) {
  
  void Tree::setIndex(string searchName, int index) {
         try {
-               //set index in treemap
-               tmap->setIndex(searchName, index);
+               map<string, int>::iterator itIndex = indexes.find(searchName);
+        if (itIndex == indexes.end()) {
+            indexes[searchName] = index;
+        }
         }
         catch(exception& e) {
                 m->errorOut(e, "Tree", "setIndex");
@@ -325,14 +358,8 @@ void Tree::setIndex(string searchName, int index) {
         }
  }
  /*****************************************************************/
-int Tree::assembleTree(map<string, string> nameMap) {
-       try {
-               //save for later
-        names = nameMap;
-
-               //if user has given a names file we want to include that info in the pgroups and pcount info.
-               if(nameMap.size() != 0) {  addNamesToCounts(nameMap);  }
-               
+int Tree::assembleTree() {
+       try {           
                 //build the pGroups in non leaf nodes to be used in the parsimony calcs.
                 for (int i = numLeaves; i < numNodes; i++) {
                         if (m->control_pressed) { return 1; }
@@ -348,66 +375,66 @@ int Tree::assembleTree(map<string, string> nameMap) {
                 exit(1);
         }
  }
-/*****************************************************************
-int Tree::assembleTree(string n) {
-       try {
-               
-               //build the pGroups in non leaf nodes to be used in the parsimony calcs.
-               for (int i = numLeaves; i < numNodes; i++) {
-                       if (m->control_pressed) { return 1; }
-
-                       tree[i].pGroups = (mergeGroups(i));
-                       tree[i].pcount = (mergeGcounts(i));
-               }
-               //float B = clock();
-               //cout << "assembleTree\t" << (B-A) / CLOCKS_PER_SEC << endl;
-               return 0;
-       }
-       catch(exception& e) {
-               m->errorOut(e, "Tree", "assembleTree");
-               exit(1);
-       }
-}
  /*****************************************************************/
  //assumes leaf node names are in groups and no names file - used by indicator command
  void Tree::getSubTree(Tree* Ctree, vector<string> Groups) {
         try {
          
          //copy Tree since we are going to destroy it
-        Tree* copy = new Tree(tmap);
+        Tree* copy = new Tree(ct);
          copy->getCopy(Ctree);
-        map<string, string> empty;
-        copy->assembleTree(empty);
+        copy->assembleTree();
          
                 //we want to select some of the leaf nodes to create the output tree
                 //go through the input Tree starting at parents of leaves
+        //initialize groupNodeInfo
+        vector<string> namesOfGroups = ct->getNamesOfGroups();
+               for (int i = 0; i < namesOfGroups.size(); i++) {  groupNodeInfo[namesOfGroups[i]].resize(0);  }
+               
+               //initialize tree with correct number of nodes, name and group info.
                 for (int i = 0; i < numNodes; i++) {
-                       
                         //initialize leaf nodes
                         if (i <= (numLeaves-1)) {
                                 tree[i].setName(Groups[i]);
                                 
                                 //save group info
-                               string group = tmap->getGroup(Groups[i]);
-                               vector<string> tempGroups; tempGroups.push_back(group);
-                               tree[i].setGroup(tempGroups);
-                               groupNodeInfo[group].push_back(i); 
-                               
-                               //set pcount and pGroup for groupname to 1.
-                               tree[i].pcount[group] = 1;
-                               tree[i].pGroups[group] = 1;
-                               
-                               //Treemap knows name, group and index to speed up search
-                               tmap->setIndex(Groups[i], i);
-                               
-                               //intialize non leaf nodes
+                int maxPars = 1;
+                               vector<string> group;
+                vector<int> counts = ct->getGroupCounts(Groups[i]);
+                               for (int j = 0; j < namesOfGroups.size(); j++) {  
+                    if (counts[j] != 0) { //you have seqs from this group
+                        groupNodeInfo[namesOfGroups[j]].push_back(i);
+                        group.push_back(namesOfGroups[j]);
+                        tree[i].pGroups[namesOfGroups[j]] = counts[j];
+                        tree[i].pcount[namesOfGroups[j]] = counts[j];
+                        //keep highest group
+                                               if(counts[j] > maxPars){ maxPars = counts[j]; }
+                    }  
+                }
+                               tree[i].setGroup(group);
+                               setIndex(Groups[i], i);
+                
+                if (maxPars > 1) { //then we have some more dominant groups
+                                       //erase all the groups that are less than maxPars because you found a more dominant group.
+                                       for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();){
+                                               if(it->second < maxPars){
+                                                       tree[i].pGroups.erase(it++);
+                                               }else { it++; }
+                                       }
+                                       //set the remaining groups to 1
+                                       for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();it++){
+                                               tree[i].pGroups[it->first] = 1;
+                                       }
+                               }//end if
+                
+                //initialize non leaf nodes
                         }else if (i > (numLeaves-1)) {
                                 tree[i].setName("");
                                 vector<string> tempGroups;
                                 tree[i].setGroup(tempGroups);
                         }
                 }
-               
+
                 set<int> removedLeaves;
                 for (int i = 0; i < copy->getNumLeaves(); i++) {
                         
@@ -534,7 +561,7 @@ void Tree::getSubTree(Tree* Ctree, vector<string> Groups) {
                 exit(1);
         }
  }
-/*****************************************************************/
+/*****************************************************************
  //assumes nameMap contains unique names as key or is empty. 
  //assumes numLeaves defined in tree constructor equals size of seqsToInclude and seqsToInclude only contains unique seqs.
  int Tree::getSubTree(Tree* copy, vector<string> seqsToInclude, map<string, string> nameMap) {
@@ -578,7 +605,7 @@ int Tree::populateNewTree(vector<Node>& oldtree, int node, int& index) {
                         
                         return (index++);
                 }else { //you are a leaf
-                       int indexInNewTree = tmap->getIndex(oldtree[node].getName());
+                       int indexInNewTree = getIndex(oldtree[node].getName());
                         return indexInNewTree;
                 }
         }
@@ -588,7 +615,7 @@ int Tree::populateNewTree(vector<Node>& oldtree, int node, int& index) {
         }
  }
  /*****************************************************************/
-void Tree::getCopy(Tree* copy, map<string, string> nameMap) {
+void Tree::getCopy(Tree* copy, bool subsample) {
         try {
          
                 //for each node in the tree copy its info
@@ -602,8 +629,6 @@ void Tree::getCopy(Tree* copy, map<string, string> nameMap) {
                         //copy children
                         tree[i].setChildren(copy->tree[i].getLChild(), copy->tree[i].getRChild());
          }
-               
-        if (nameMap.size() != 0) {  addNamesToCounts(nameMap);  }
          
          //build the pGroups in non leaf nodes to be used in the parsimony calcs.
                 for (int i = numLeaves; i < numNodes; i++) {
@@ -640,8 +665,8 @@ void Tree::getCopy(Tree* copy) {
                         tree[i].setChildren(copy->tree[i].getLChild(), copy->tree[i].getRChild());
                 
                         //copy index in node and tmap
+            setIndex(copy->tree[i].getName(), getIndex(copy->tree[i].getName()));
                         tree[i].setIndex(copy->tree[i].getIndex());
-                       setIndex(copy->tree[i].getName(), getIndex(copy->tree[i].getName()));
                         
                         //copy pGroups
                         tree[i].pGroups = copy->tree[i].pGroups;
@@ -805,8 +830,8 @@ void Tree::randomLabels(vector<string> g) {
         try {
         
                 //initialize groupNodeInfo
-               for (int i = 0; i < (tmap->getNamesOfGroups()).size(); i++) {
-                       groupNodeInfo[(tmap->getNamesOfGroups())[i]].resize(0);
+               for (int i = 0; i < (ct->getNamesOfGroups()).size(); i++) {
+                       groupNodeInfo[(ct->getNamesOfGroups())[i]].resize(0);
                 }
                 
                 for(int i = 0; i < numLeaves; i++){
@@ -868,23 +893,20 @@ void Tree::randomBlengths()  {
  /*************************************************************************************************/
  void Tree::assembleRandomUnifracTree(vector<string> g) {
         randomLabels(g);
-    map<string, string> empty;
-       assembleTree(empty);
+       assembleTree();
  }
  /*************************************************************************************************/
  void Tree::assembleRandomUnifracTree(string groupA, string groupB) {
         vector<string> temp; temp.push_back(groupA); temp.push_back(groupB);
         randomLabels(temp);
-    map<string, string> empty;
-       assembleTree(empty);
+       assembleTree();
  }
  
  /*************************************************************************************************/
  //for now it's just random topology but may become random labels as well later that why this is such a simple function now...
  void Tree::assembleRandomTree() {
         randomTopology();
-    map<string, string> empty;
-       assembleTree(empty);
+       assembleTree();
  }
  /**************************************************************************************************/
  
@@ -1103,16 +1125,16 @@ void Tree::printBranch(int node, ostream& out, string mode) {
                                 }
                         }
                 }else { //you are a leaf
-                       string leafGroup = tmap->getGroup(tree[node].getName());
+                       vector<string> leafGroup = ct->getGroups(tree[node].getName());
                         
                         if (mode == "branch") {
-                               out << leafGroup; 
+                               out << leafGroup[0]; 
                                 //if there is a branch length then print it
                                 if (tree[node].getBranchLength() != -1) {
                                         out << ":" << tree[node].getBranchLength();
                                 }
                         }else if (mode == "boot") {
-                               out << leafGroup; 
+                               out << leafGroup[0]; 
                                 //if there is a label then print it
                                 if (tree[node].getLabel() != -1) {
                                         out << tree[node].getLabel();
@@ -1166,16 +1188,16 @@ void Tree::printBranch(int node, ostream& out, string mode, vector<Node>& theseN
                                 }
                         }
                 }else { //you are a leaf
-                       string leafGroup = tmap->getGroup(theseNodes[node].getName());
+                       vector<string> leafGroup = ct->getGroups(theseNodes[node].getName());
                         
                         if (mode == "branch") {
-                               out << leafGroup; 
+                               out << leafGroup[0]; 
                                 //if there is a branch length then print it
                                 if (theseNodes[node].getBranchLength() != -1) {
                                         out << ":" << theseNodes[node].getBranchLength();
                                 }
                         }else if (mode == "boot") {
-                               out << leafGroup; 
+                               out << leafGroup[0]; 
                                 //if there is a label then print it
                                 if (theseNodes[node].getLabel() != -1) {
                                         out << theseNodes[node].getLabel();
diff --git a/tree.h b/tree.h

index 03da5f6841d9f30cef2f946d6c9ecbc1d5c58cc0..88e49c0d0372c16371e7c9f8254a0717ab93ae12 100644 (file)
--- a/tree.h
+++ b/tree.h
@@ -11,22 +11,22 @@
   */
  
  #include "treenode.h"
-#include "treemap.h"
+#include "counttable.h"
  /* This class represents the treefile. */
  
  class Tree {
  public: 
         Tree(string);  //do not use tree generated by this constructor its just to extract the treenames, its a chicken before the egg thing that needs to be revisited.
-       Tree(int, TreeMap*); 
-       Tree(TreeMap*);         //to generate a tree from a file
-    Tree(TreeMap*, vector< vector<double> >&); //create tree from sim matrix
+       Tree(int, CountTable*); 
+       Tree(CountTable*);              //to generate a tree from a file
+    Tree(CountTable*, vector< vector<double> >&); //create tree from sim matrix
         ~Tree();
         
-    TreeMap* getTreeMap() { return tmap; }
+    CountTable* getCountTable() { return ct; }
         void getCopy(Tree*);  //makes tree a copy of the one passed in.
-    void getCopy(Tree* copy, map<string, string>); //makes a copy of the tree structure passed in, (just parents, children and br). Used with the Tree(TreeMap*) constructor. Assumes the tmap already has set seqs groups you want.  Used by subsample to reassign seqs you don't want included to group "doNotIncludeMe".
+    void getCopy(Tree* copy, bool); //makes a copy of the tree structure passed in, (just parents, children and br). Used with the Tree(CountTable*) constructor. Assumes the count table already has set seqs groups you want.  Used by subsample to reassign seqs you don't want included to group "doNotIncludeMe".
         void getSubTree(Tree*, vector<string>);  //makes tree a that contains only the names passed in.
-    int getSubTree(Tree* originalToCopy, vector<string> seqToInclude, map<string, string> nameMap);  //used with (int, TreeMap) constructor. SeqsToInclude contains subsample wanted - assumes these are unique seqs and size of vector=numLeaves passed into constructor. nameMap is unique -> redundantList can be empty if no namesfile was provided. 
+    //int getSubTree(Tree* originalToCopy, vector<string> seqToInclude, map<string, string> nameMap);  //used with (int, TreeMap) constructor. SeqsToInclude contains subsample wanted - assumes these are unique seqs and size of vector=numLeaves passed into constructor. nameMap is unique -> redundantList can be empty if no namesfile was provided. 
      
         void assembleRandomTree();
         void assembleRandomUnifracTree(vector<string>);
@@ -45,21 +45,22 @@ public:
         int findRoot();  //return index of root node
         
         //this function takes the leaf info and populates the non leaf nodes
-       int assembleTree(map<string, string>);  
+       int assembleTree();     
         
         vector<Node> tree;              //the first n nodes are the leaves, where n is the number of sequences.
         map< string, vector<int> > groupNodeInfo;       //maps group to indexes of leaf nodes with that group, different groups may contain same node because of names file.
                         
  private:
-       TreeMap* tmap;
+       CountTable* ct;
         int numNodes, numLeaves;
         ofstream out;
         string filename;
         
-    map<string, string> names;
+    //map<string, string> names;
         map<string, int>::iterator it, it2;
         map<string, int> mergeGroups(int);  //returns a map with a groupname and the number of times that group was seen in the children
         map<string,int> mergeGcounts(int);
+    map<string, int> indexes; //maps seqName -> index in tree vector
         
         void addNamesToCounts(map<string, string>);
         void randomTopology();
diff --git a/treegroupscommand.cpp b/treegroupscommand.cpp

index 6633e5160559297c23a542c373ed1b1cc9a3ccdc..bba6289ada54856c25d683126b0d181badab4c15 100644 (file)
--- a/treegroupscommand.cpp
+++ b/treegroupscommand.cpp
@@ -16,8 +16,9 @@ vector<string> TreeGroupCommand::setParameters(){
         try {
                 CommandParameter pshared("shared", "InputTypes", "", "", "PhylipColumnShared", "PhylipColumnShared", "none",false,false); parameters.push_back(pshared);
                 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnShared", "PhylipColumnShared", "none",false,false); parameters.push_back(pphylip);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname);
-               CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnShared", "PhylipColumnShared", "ColumnName",false,false); parameters.push_back(pcolumn);  
+               CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName",false,false); parameters.push_back(pname);
+               CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "countcolumn",false,false); parameters.push_back(pcount);
+        CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnShared", "PhylipColumnShared", "ColumnName-countcolumn",false,false); parameters.push_back(pcolumn);             
          CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
          CommandParameter psubsample("subsample", "String", "", "", "", "", "",false,false); parameters.push_back(psubsample);
          CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff);
@@ -160,6 +161,14 @@ TreeGroupCommand::TreeGroupCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a count file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
                         
                         //check for required parameters
@@ -182,6 +191,11 @@ TreeGroupCommand::TreeGroupCommand(string option)  {
                         if (namefile == "not open") { abort = true; }   
                         else if (namefile == "not found") { namefile = ""; }
                         else { m->setNameFile(namefile); }
+            
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not open") { abort = true; countfile = ""; }  
+                       else if (countfile == "not found") { countfile = ""; }
+                       else { m->setCountTableFile(countfile); }
                         
                         if ((phylipfile == "") && (columnfile == "") && (sharedfile == "")) { 
                                 //is there are current file available for either of these?
@@ -204,15 +218,20 @@ TreeGroupCommand::TreeGroupCommand(string option)  {
                         else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When running the tree.shared command with a distance file you may not use both the column and the phylip parameters."); m->mothurOutEndLine(); abort = true; }
                         
                         if (columnfile != "") {
-                               if (namefile == "") { 
+                               if ((namefile == "") && (countfile == "")){ 
                                         namefile = m->getNameFile(); 
                                         if (namefile != "") {  m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
                                         else { 
-                                               m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); 
-                                               abort = true; 
+                                               countfile = m->getCountTableFile();
+                        if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
+                        else { 
+                            m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine(); 
+                            abort = true; 
+                        }      
                                         }       
                                 }
                         }
+
                         
                         //check for optional parameter and set defaults
                         // ...at some point should added some additional type checking...
@@ -287,7 +306,7 @@ TreeGroupCommand::~TreeGroupCommand(){
         if (abort == false) {
                 if (format == "sharedfile") {  delete input; }
                 else { delete list; }
-               delete tmap;  
+               delete ct;  
         }
         
  }
@@ -400,8 +419,16 @@ int TreeGroupCommand::execute(){
                         m->runParse = false;
                         
                         //create treemap class from groupmap for tree class to use
-                       tmap = new TreeMap();
-                       tmap->makeSim(m->getAllGroups());
+                       ct = new CountTable();
+            set<string> nameMap;
+            map<string, string> groupMap;
+            set<string> gps;
+            for (int i = 0; i < m->getAllGroups().size(); i++) { 
+                nameMap.insert(m->getAllGroups()[i]); 
+                gps.insert(m->getAllGroups()[i]); 
+                groupMap[m->getAllGroups()[i]] = m->getAllGroups()[i];
+            }
+            ct->createTable(nameMap, groupMap, gps);
                         
                         //clear globaldatas old tree names if any
                         m->Treenames.clear();
@@ -425,31 +452,40 @@ int TreeGroupCommand::execute(){
                                 
                         readMatrix->setCutoff(cutoff);
         
-                       if(namefile != ""){     
-                               nameMap = new NameAssignment(namefile);
-                               nameMap->readMap();
-                       }
-                       else{
-                               nameMap = NULL;
-                       }
-       
-                       readMatrix->read(nameMap);
+            ct = NULL;
+            if(namefile != ""){        
+                nameMap = new NameAssignment(namefile);
+                nameMap->readMap();
+                readMatrix->read(nameMap);
+            }else if (countfile != "") {
+                ct = new CountTable();
+                ct->readTable(countfile);
+                readMatrix->read(ct);
+            }
+
                         list = readMatrix->getListVector();
                         SparseDistanceMatrix* dMatrix = readMatrix->getDMatrix();
  
                         //make treemap
-                       tmap = new TreeMap();
-                       
-                       if (m->control_pressed) { return 0; }
-                       
-                       tmap->makeSim(list);
+            if (ct != NULL) { delete ct; }
+                       ct = new CountTable();
+            set<string> nameMap;
+            map<string, string> groupMap;
+            set<string> gps;
+            for (int i = 0; i < list->getNumBins(); i++) {
+                string bin = list->get(i);
+                nameMap.insert(bin); 
+                gps.insert(bin); 
+                groupMap[bin] = bin;
+            }
+            ct->createTable(nameMap, groupMap, gps);
                         
-                       vector<string> namesGroups = tmap->getNamesOfGroups();
+                       vector<string> namesGroups = ct->getNamesOfGroups();
                         m->setGroups(namesGroups);
                 
                         //clear globaldatas old tree names if any
                         m->Treenames.clear();
-               
+            
                         //fills globaldatas tree names
                         m->Treenames = m->getGroups();
                         
@@ -505,13 +541,12 @@ int TreeGroupCommand::execute(){
  Tree* TreeGroupCommand::createTree(vector< vector<double> >& simMatrix){
         try {
                 //create tree
-               t = new Tree(tmap, simMatrix);
+               t = new Tree(ct, simMatrix);
          
          if (m->control_pressed) { delete t; t = NULL; return t; }
                 
          //assemble tree
-        map<string, string> empty;
-               t->assembleTree(empty);
+               t->assembleTree();
  
                 return t;
         }
diff --git a/treegroupscommand.h b/treegroupscommand.h

index b0ae730d98aa8af6da4fecabe3cf0f7aa03df2d4..b29670ae614205a7ee286116379ddda86f0e7cf9 100644 (file)
--- a/treegroupscommand.h
+++ b/treegroupscommand.h
@@ -15,11 +15,10 @@
  #include "groupmap.h"
  #include "validcalculator.h"
  #include "tree.h"
-#include "treemap.h"
+#include "counttable.h"
  #include "readmatrix.hpp"
  #include "readcolumn.h"
  #include "readphylip.h"
-#include "sparsematrix.hpp"
  #include "sharedsobscollectsummary.h"
  #include "sharedchao1.h"
  #include "sharedace.h"
@@ -69,8 +68,6 @@
         They can also use as many or as few calculators as they wish. */
         
  
-typedef list<PCell>::iterator MatData;
-
  class TreeGroupCommand : public Command {
         
  public:
@@ -107,13 +104,13 @@ private:
         
         NameAssignment* nameMap;
         ListVector* list;
-       TreeMap* tmap;
+       CountTable* ct;
         Tree* t;
      InputData* input;
         vector<Calculator*> treeCalculators;
         vector<SharedRAbundVector*> lookup;
         string lastLabel;
-       string format, groupNames, filename, sharedfile, inputfile;
+       string format, groupNames, filename, sharedfile, countfile, inputfile;
         int numGroups, subsampleSize, iters, processors;
         ofstream out;
         float precision, cutoff;
diff --git a/treemap.cpp b/treemap.cpp

index 42ec336dd53d9da3c0ac44b997cdc7509a89c9cf..47b7cf343635e2d104db75114aa77fbe8007c466 100644 (file)
--- a/treemap.cpp
+++ b/treemap.cpp
@@ -13,6 +13,9 @@
  
   TreeMap::TreeMap(string filename) {
         m = MothurOut::getInstance();
+    ofstream out2;
+    m->openOutputFileAppend(filename, out2);
+    out2 << endl; out2.close();
         groupFileName = filename;
         m->openInputFile(filename, fileHandle);
  }
@@ -22,6 +25,10 @@
  /************************************************************/
  int TreeMap::readMap(string gf) {
      try {
+        ofstream out2;
+        m->openOutputFileAppend(gf, out2);
+        out2 << endl; out2.close();
+        
          groupFileName = gf;
          m->openInputFile(gf, fileHandle);
          
@@ -65,6 +72,34 @@ int TreeMap::readMap(string gf) {
          }
          fileHandle.close();
          
+        if (rest != "") {
+            vector<string> pieces = m->splitWhiteSpace(rest);
+            
+            for (int i = 0; i < pieces.size(); i++) {
+                if (columnOne) {  seqName = pieces[i]; columnOne=false; }
+                else  { seqGroup = pieces[i]; pairDone = true; columnOne=true; }
+                
+                if (pairDone) { 
+                    setNamesOfGroups(seqGroup);
+                    
+                    map<string, GroupIndex>::iterator itCheck = treemap.find(seqName);
+                    if (itCheck != treemap.end()) { error = 1; m->mothurOut("[WARNING]: Your groupfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine();  }
+                    else {
+                        namesOfSeqs.push_back(seqName);
+                        treemap[seqName].groupname = seqGroup; //store data in map
+                        
+                        it2 = seqsPerGroup.find(seqGroup);
+                        if (it2 == seqsPerGroup.end()) { //if it's a new group
+                            seqsPerGroup[seqGroup] = 1;
+                        }else {//it's a group we already have
+                            seqsPerGroup[seqGroup]++;
+                        }                              
+                    }
+                    pairDone = false; 
+                } 
+            }
+        }
+        
          return error;
      }
         catch(exception& e) {
@@ -116,6 +151,34 @@ int TreeMap::readMap() {
          }
          fileHandle.close();
          
+        if (rest != "") {
+            vector<string> pieces = m->splitWhiteSpace(rest);
+            
+            for (int i = 0; i < pieces.size(); i++) {
+                if (columnOne) {  seqName = pieces[i]; columnOne=false; }
+                else  { seqGroup = pieces[i]; pairDone = true; columnOne=true; }
+                
+                if (pairDone) { 
+                    setNamesOfGroups(seqGroup);
+                    
+                    map<string, GroupIndex>::iterator itCheck = treemap.find(seqName);
+                    if (itCheck != treemap.end()) { error = 1; m->mothurOut("[WARNING]: Your groupfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine();  }
+                    else {
+                        namesOfSeqs.push_back(seqName);
+                        treemap[seqName].groupname = seqGroup; //store data in map
+                        
+                        it2 = seqsPerGroup.find(seqGroup);
+                        if (it2 == seqsPerGroup.end()) { //if it's a new group
+                            seqsPerGroup[seqGroup] = 1;
+                        }else {//it's a group we already have
+                            seqsPerGroup[seqGroup]++;
+                        }                              
+                    }
+                    pairDone = false; 
+                } 
+            }
+        }
+        
          return error;
      }
         catch(exception& e) {
@@ -183,26 +246,6 @@ string TreeMap::getGroup(string sequenceName) {
                 return "not found";
         }
                 
-}
-/************************************************************/
-void TreeMap::setIndex(string seq, int index) {
-       it = treemap.find(seq);
-       if (it != treemap.end()) { //sequence name was in group file
-               treemap[seq].vectorIndex = index;       
-       }else {
-               treemap[seq].vectorIndex = index;
-               treemap[seq].groupname = "not found";
-       }
-}
-/************************************************************/
-int TreeMap::getIndex(string seq) {
-       
-       it = treemap.find(seq);
-       // if it is a valid sequence name then return index
-       if (it != treemap.end()) { return treemap[seq].vectorIndex; }
-       // if not return error code
-       else { return -1; }
-       
  }
  /************************************************************/
  
diff --git a/treemap.h b/treemap.h

index 57822e02d4b8c3a57f7d0ddf26947c470bf9400b..7ffd1e7003154d672d544aa8a993f371a7f9a28a 100644 (file)
--- a/treemap.h
+++ b/treemap.h
@@ -29,8 +29,8 @@ public:
      int readMap(string);
         int getNumGroups();
         int getNumSeqs();
-       void setIndex(string, int);  //sequencename, index
-       int getIndex(string);           //returns vector index of sequence
+       //void setIndex(string, int);  //sequencename, index
+       //int getIndex(string);         //returns vector index of sequence
         bool isValidGroup(string);  //return true if string is a valid group
         void removeSeq(string);  //removes a sequence, this is to accomadate trees that do not contain all the seqs in your groupfile
         string getGroup(string);
diff --git a/treereader.cpp b/treereader.cpp

index b385d214fbf4043d5b32b58ec23673b3e9136438..0e25f12a39b2a5341ad2ba3bf7255bb8b6aa7edc 100644 (file)
--- a/treereader.cpp
+++ b/treereader.cpp
@@ -8,12 +8,23 @@
  
  #include "treereader.h"
  #include "readtree.h"
+#include "groupmap.h"
  
  /***********************************************************************/
-
-TreeReader::TreeReader(string tf) : treefile(tf)  { 
+TreeReader::TreeReader(string tf, string cf) : treefile(tf), countfile(cf)  { 
      try {
          m = MothurOut::getInstance();
+        ct = new CountTable();
+        ct->readTable(cf);
+        
+        //if no groupinfo in count file we need to add it
+        if (!ct->hasGroupInfo()) {
+            ct->addGroup("Group1");
+            vector<string> namesOfSeqs = ct->getNamesOfSeqs();
+            for (int i = 0; i < namesOfSeqs.size(); i++) { 
+                ct->setAbund(namesOfSeqs[i], "Group1", ct->getNumSeqs(namesOfSeqs[i]));
+            }
+        }
          namefile = "";
          groupfile = "";
          readTrees();
@@ -24,22 +35,32 @@ TreeReader::TreeReader(string tf) : treefile(tf)  {
         }
  }
  /***********************************************************************/
-
-TreeReader::TreeReader(string tf, string gf) : treefile(tf),  groupfile(gf)  { 
-    try {
-        m = MothurOut::getInstance();
-        namefile = "";
-        readTrees();
-    }
-       catch(exception& e) {
-               m->errorOut(e, "TreeReader", "TreeReader");
-               exit(1);
-       }
-}
-/***********************************************************************/
  TreeReader::TreeReader(string tf, string gf, string nf) : treefile(tf),  groupfile(gf), namefile(nf)  { 
      try {
          m = MothurOut::getInstance();
+        countfile = "";
+        ct = new CountTable();
+        if (namefile != "") { ct->createTable(namefile, groupfile, true); }
+        else {
+            Tree* tree = new Tree(treefile); delete tree;  //extracts names from tree to make faked out groupmap
+            set<string> nameMap;
+            map<string, string> groupMap;
+            set<string> gps;
+            for (int i = 0; i < m->Treenames.size(); i++) { nameMap.insert(m->Treenames[i]);  }
+            if (groupfile == "") { gps.insert("Group1"); for (int i = 0; i < m->Treenames.size(); i++) { groupMap[m->Treenames[i]] = "Group1"; } }
+            else {
+                GroupMap g(groupfile); 
+                g.readMap();
+                vector<string> seqs = g.getNamesSeqs();
+                for (int i = 0; i < seqs.size(); i++) {  
+                    string group = g.getGroup(seqs[i]);
+                    groupMap[seqs[i]] = group;
+                    gps.insert(group);
+                }
+            }
+            ct->createTable(nameMap, groupMap, gps);
+        }
+
          readTrees();
      }
         catch(exception& e) {
@@ -51,22 +72,15 @@ TreeReader::TreeReader(string tf, string gf, string nf) : treefile(tf),  groupfi
  bool TreeReader::readTrees()  { 
      try {
          
-        tmap = new TreeMap();
-        if (groupfile != "") {      tmap->readMap(groupfile);        }
-               else{ //fake out by putting everyone in one group
-                       Tree* tree = new Tree(treefile); delete tree;  //extracts names from tree to make faked out groupmap
-                       for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
-               }
-               
-        int numUniquesInName = 0;
-               if (namefile != "") { numUniquesInName = readNamesFile(); }
+        int numUniquesInName = ct->getNumUniqueSeqs();
+               //if (namefile != "") { numUniquesInName = readNamesFile(); }
                 
                 ReadTree* read = new ReadNewickTree(treefile);
-               int readOk = read->read(tmap); 
+               int readOk = read->read(ct); 
                 
                 if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine();  delete read; m->control_pressed=true; return 0; }
                 
-               read->AssembleTrees(names);
+               read->AssembleTrees();
                 trees = read->getTrees();
                 delete read;
          
@@ -74,18 +88,19 @@ bool TreeReader::readTrees()  {
                 //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
                 int numNamesInTree;
                 if (namefile != "")  {  
-                       if (numUniquesInName == m->Treenames.size()) {  numNamesInTree = nameMap.size();  }
+                       if (numUniquesInName == m->Treenames.size()) {  numNamesInTree = ct->getNumSeqs();  }
                         else {   numNamesInTree = m->Treenames.size();  }
                 }else {  numNamesInTree = m->Treenames.size();  }
                 
                 
                 //output any names that are in group file but not in tree
-               if (numNamesInTree < tmap->getNumSeqs()) {
-                       for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
+               if (numNamesInTree < ct->getNumSeqs()) {
+            vector<string> namesSeqsCt = ct->getNamesOfSeqs();
+                       for (int i = 0; i < namesSeqsCt.size(); i++) {
                                 //is that name in the tree?
                                 int count = 0;
                                 for (int j = 0; j < m->Treenames.size(); j++) {
-                                       if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
+                                       if (namesSeqsCt[i] == m->Treenames[j]) { break; } //found it
                                         count++;
                                 }
                                 
@@ -93,14 +108,8 @@ bool TreeReader::readTrees()  {
                                 
                                 //then you did not find it so report it 
                                 if (count == m->Treenames.size()) { 
-                                       //if it is in your namefile then don't remove
-                                       map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
-                                       
-                                       if (it == nameMap.end()) {
-                                               m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
-                                               tmap->removeSeq(tmap->namesOfSeqs[i]);
-                                               i--; //need this because removeSeq removes name from namesOfSeqs
-                                       }
+                    m->mothurOut(namesSeqsCt[i] + " is in your name or group file and not in your tree. It will be disregarded."); m->mothurOutEndLine();
+                    ct->remove(namesSeqsCt[i]);
                                 }
                         }
                 }
@@ -112,47 +121,6 @@ bool TreeReader::readTrees()  {
                 exit(1);
         }
  }
-/*****************************************************************/
-int TreeReader::readNamesFile() {
-       try {
-               nameMap.clear();
-        names.clear();
-               int numUniquesInName = 0;
-               
-               ifstream in;
-               m->openInputFile(namefile, in);
-               
-               string first, second;
-               map<string, string>::iterator itNames;
-               
-               while(!in.eof()) {
-                       in >> first >> second; m->gobble(in);
-                       
-                       numUniquesInName++;
-                       
-                       itNames = nameMap.find(first);
-                       if (itNames == nameMap.end()) {  
-                               names[first] = second; 
-                               
-                               //we need a list of names in your namefile to use above when removing extra seqs above so we don't remove them
-                               vector<string> dupNames;
-                               m->splitAtComma(second, dupNames);
-                               
-                               for (int i = 0; i < dupNames.size(); i++) {     
-                                       nameMap[dupNames[i]] = first; 
-                                       if ((groupfile == "") && (i != 0)) { tmap->addSeq(dupNames[i], "Group1"); } 
-                               }
-                       }else {  m->mothurOut(first + " has already been seen in namefile, disregarding names file."); m->mothurOutEndLine(); in.close(); nameMap.clear(); names.clear(); namefile = ""; return 1; }                    
-               }
-               in.close();
-               
-               return numUniquesInName;
-       }
-       catch(exception& e) {
-               m->errorOut(e, "TreeReader", "readNamesFile");
-               exit(1);
-       }
-}
  /***********************************************************************/
  
  
diff --git a/treereader.h b/treereader.h

index fb9c791959f96cd04ce95fae0385f02c9740f546..ac24eb0161336e3be68ad17028a2b6a28eab8489 100644 (file)
--- a/treereader.h
+++ b/treereader.h
@@ -11,29 +11,26 @@
  
  #include "mothurout.h"
  #include "tree.h"
+#include "counttable.h"
  
  class TreeReader {
      
  public:
      
-    TreeReader(string tf);
-       TreeReader(string tf, string gf);
+    TreeReader(string tf, string cf);
      TreeReader(string tf, string gf, string nf);
         ~TreeReader() {}        
      
      vector<Tree*> getTrees()            { return trees;     }
-    map<string, string> getNames()      { return nameMap;   } //dups -> unique
-    map<string, string> getNameMap()    { return names;     } //unique -> dups list
-    
      
  private:
      MothurOut* m;
         vector<Tree*> trees;
-    TreeMap* tmap;
-    map<string, string> nameMap; //dupName -> uniqueName
-    map<string, string> names;
+    CountTable* ct;
+    //map<string, string> nameMap; //dupName -> uniqueName
+   // map<string, string> names;
      
-    string treefile, groupfile, namefile;
+    string treefile, groupfile, namefile, countfile;
      
      bool readTrees();
      int readNamesFile();
diff --git a/trimflowscommand.cpp b/trimflowscommand.cpp

index d45f20cae1f0516c014d42cf7e6afaef57d2a1a2..296a6fe37d0449bb719d69c4001429583bcb74ee 100644 (file)
--- a/trimflowscommand.cpp
+++ b/trimflowscommand.cpp
@@ -28,7 +28,7 @@ vector<string> TrimFlowsCommand::setParameters(){
                 CommandParameter psignal("signal", "Number", "", "0.50", "", "", "",false,false); parameters.push_back(psignal);
                 CommandParameter pnoise("noise", "Number", "", "0.70", "", "", "",false,false); parameters.push_back(pnoise);
                 CommandParameter pallfiles("allfiles", "Boolean", "", "t", "", "", "",false,false); parameters.push_back(pallfiles);
-               CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
+               CommandParameter porder("order", "String", "", "TACG", "", "", "",false,false); parameters.push_back(porder);
                 CommandParameter pfasta("fasta", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pfasta);
                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
@@ -347,7 +347,7 @@ int TrimFlowsCommand::execute(){
                         
                         output.close();
                 }
-               outputTypes["flow.files"].push_back(flowFilesFileName);
+               outputTypes["file"].push_back(flowFilesFileName);
                 outputNames.push_back(flowFilesFileName);
                 
  //             set fasta file as new current fastafile
@@ -423,11 +423,9 @@ int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileN
                         string trashCode = "";
                         
                         flowData.getNext(flowFile); 
-                       //cout << "driver good bit " << flowFile.good() << endl;        
                         flowData.capFlows(maxFlows);    
                         
                         Sequence currSeq = flowData.getSequence();
-                       
                         if(!flowData.hasMinFlows(minFlows)){    //screen to see if sequence is of a minimum number of flows
                                 success = 0;
                                 trashCode += 'l';
@@ -443,6 +441,8 @@ int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileN
                  
              }
              
+            if (m->debug) { m->mothurOut("[DEBUG]: " + currSeq.getName() + " " + currSeq.getUnaligned() + "\n"); }
+            
                         if(barcodes.size() != 0){
                                 success = trimOligos.stripBarcode(currSeq, barcodeIndex);
                                 if(success > bdiffs)            {       trashCode += 'b';       }
diff --git a/trimoligos.cpp b/trimoligos.cpp

index 2f92cc847ca75e1a0983aeef2d09e9eaab68e57f..8f4cbe98a7488b9d552f07b6ffd17aeb5e858c23 100644 (file)
--- a/trimoligos.cpp
+++ b/trimoligos.cpp
@@ -14,7 +14,7 @@
  
  /********************************************************************/
  //strip, pdiffs, bdiffs, primers, barcodes, revPrimers
-TrimOligos::TrimOligos(int p, int b, int l, int s, map<string, int> pr, map<string, int> br, map<string, int> rbr, vector<string> r, vector<string> lk, vector<string> sp){
+TrimOligos::TrimOligos(int p, int b, int l, int s, map<string, int> pr, map<string, int> br, vector<string> r, vector<string> lk, vector<string> sp){
         try {
                 m = MothurOut::getInstance();
                 
@@ -24,7 +24,6 @@ TrimOligos::TrimOligos(int p, int b, int l, int s, map<string, int> pr, map<stri
          sdiffs = s;
                 
                 barcodes = br;
-        rbarcodes = rbr;
                 primers = pr;
                 revPrimer = r;
          linker = lk;
@@ -37,7 +36,7 @@ TrimOligos::TrimOligos(int p, int b, int l, int s, map<string, int> pr, map<stri
  }
  /********************************************************************/
  //strip, pdiffs, bdiffs, primers, barcodes, revPrimers
-TrimOligos::TrimOligos(int p, int b, int l, int s, map<string, int> pr, map<string, int> br, vector<string> r, vector<string> lk, vector<string> sp){
+TrimOligos::TrimOligos(int p, int b, int l, int s, map<int, oligosPair> pr, map<int, oligosPair> br, vector<string> lk, vector<string> sp){
         try {
                 m = MothurOut::getInstance();
                 
@@ -46,9 +45,8 @@ TrimOligos::TrimOligos(int p, int b, int l, int s, map<string, int> pr, map<stri
          ldiffs = l;
          sdiffs = s;
                 
-               barcodes = br;
-               primers = pr;
-               revPrimer = r;
+               ibarcodes = br;
+               iprimers = pr;
          linker = lk;
          spacer = sp;
         }
@@ -194,6 +192,177 @@ int TrimOligos::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
                 m->errorOut(e, "TrimOligos", "stripBarcode");
                 exit(1);
         }
+}
+//*******************************************************************/
+int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, QualityScores& forwardQual, QualityScores& reverseQual, int& group){
+       try {
+               //look for forward barcode
+               string rawFSequence = forwardSeq.getUnaligned();
+        string rawRSequence = reverseSeq.getUnaligned();
+               int success = bdiffs + 1;       //guilty until proven innocent
+               
+               //can you find the forward barcode
+               for(map<int,oligosPair>::iterator it=ibarcodes.begin();it!=ibarcodes.end();it++){
+                       string foligo = it->second.forward;
+            string roligo = it->second.reverse;
+            
+                       if(rawFSequence.length() < foligo.length()){    //let's just assume that the barcodes are the same length
+                               success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
+                               break;  
+                       }
+            if(rawRSequence.length() < roligo.length()){       //let's just assume that the barcodes are the same length
+                               success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
+                               break;  
+                       }
+                       
+                       if((compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length())))) {
+                               group = it->first;
+                               forwardSeq.setUnaligned(rawFSequence.substr(foligo.length()));
+                reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-roligo.length())));
+                forwardQual.trimQScores(foligo.length(), -1);
+                reverseQual.trimQScores(-1, rawRSequence.length()-roligo.length());
+                               success = 0;
+                               break;
+                       }
+               }
+               
+               //if you found the barcode or if you don't want to allow for diffs
+               if ((bdiffs == 0) || (success == 0)) { return success;  }
+               else { //try aligning and see if you can find it
+                       
+            //look for forward
+                       int maxLength = 0;
+                       
+                       Alignment* alignment;
+                       if (ibarcodes.size() > 0) {
+                               for(map<int,oligosPair>::iterator it=ibarcodes.begin();it!=ibarcodes.end();it++){
+                                       if(it->second.forward.length() > maxLength){ maxLength = it->second.forward.length();  }
+                               }
+                               alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
+                       }else{ alignment = NULL; } 
+                       
+                       //can you find the barcode
+                       int minDiff = 1e6;
+                       int minCount = 1;
+                       int minFGroup = -1;
+                       int minFPos = 0;
+                       
+                       for(map<int,oligosPair>::iterator it=ibarcodes.begin();it!=ibarcodes.end();it++){
+                               string oligo = it->second.forward;
+                               
+                               if(rawFSequence.length() < maxLength){  //let's just assume that the barcodes are the same length
+                                       success = bdiffs + 10;
+                                       break;
+                               }
+                               
+                               //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+                               alignment->align(oligo, rawFSequence.substr(0,oligo.length()+bdiffs));
+                               oligo = alignment->getSeqAAln();
+                               string temp = alignment->getSeqBAln();
+                               
+                               int alnLength = oligo.length();
+                               
+                               for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){      alnLength = i+1;        break;  } }
+                               oligo = oligo.substr(0,alnLength);
+                               temp = temp.substr(0,alnLength);
+                int numDiff = countDiffs(oligo, temp);
+                               
+                               if(numDiff < minDiff){
+                                       minDiff = numDiff;
+                                       minCount = 1;
+                                       minFGroup = it->first;
+                                       minFPos = 0;
+                                       for(int i=0;i<alnLength;i++){
+                                               if(temp[i] != '-'){
+                                                       minFPos++;
+                                               }
+                                       }
+                               }else if(numDiff == minDiff){
+                                       minCount++;
+                               }
+                               
+                       }
+                       
+                       if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
+                       else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
+                       else{   
+                //check for reverse match
+                if (alignment != NULL) {  delete alignment;  }
+                maxLength = 0;
+                
+                if (ibarcodes.size() > 0) {
+                    for(map<int,oligosPair>::iterator it=ibarcodes.begin();it!=ibarcodes.end();it++){
+                        if(it->second.reverse.length() > maxLength){ maxLength = it->second.reverse.length();  }
+                    }
+                    alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
+                }else{ alignment = NULL; } 
+                
+                //can you find the barcode
+                minDiff = 1e6;
+                minCount = 1;
+                int minRGroup = -1;
+                int minRPos = 0;
+                
+                for(map<int,oligosPair>::iterator it=ibarcodes.begin();it!=ibarcodes.end();it++){
+                    string oligo = it->second.reverse;
+                    
+                    if(rawRSequence.length() < maxLength){     //let's just assume that the barcodes are the same length
+                        success = bdiffs + 10;
+                        break;
+                    }
+                    
+                    //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+                    alignment->align(oligo, rawRSequence.substr((rawRSequence.length()-(oligo.length()+bdiffs)),oligo.length()+bdiffs));
+                    oligo = alignment->getSeqAAln();
+                    string temp = alignment->getSeqBAln();
+                    
+                    int alnLength = oligo.length();
+                    for(int i=0;i<alnLength;i++){ if(oligo[i] != '-'){ alnLength = i;  break;  } }
+                    oligo = oligo.substr(0,alnLength);
+                    temp = temp.substr(0,alnLength);
+                    int numDiff = countDiffs(oligo, temp);
+                    
+                    if(numDiff < minDiff){
+                        minDiff = numDiff;
+                        minCount = 1;
+                        minRGroup = it->first;
+                        minRPos = 0;
+                        for(int i=0;i<alnLength;i++){
+                            if(temp[i] != '-'){
+                                minRPos++;
+                            }
+                        }
+                    }else if(numDiff == minDiff){
+                        minCount++;
+                    }
+                    
+                }
+
+                if(minDiff > bdiffs)   {       success = minDiff;              }       //no good matches
+                else if(minCount > 1)  {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
+                else{
+                    //we have an acceptable match for the forward and reverse, but do they match?
+                    if (minFGroup == minRGroup) {
+                        group = minFGroup;
+                        forwardSeq.setUnaligned(rawFSequence.substr(minFPos));
+                        reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-minRPos)));
+                        forwardQual.trimQScores(minFPos, -1);
+                        reverseQual.trimQScores(-1, rawRSequence.length()-minRPos);
+                        success = minDiff;
+                    }else { success = bdiffs + 100;    }
+                }
+                       }
+                       
+                       if (alignment != NULL) {  delete alignment;  }
+               }
+               
+               return success;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "TrimOligos", "stripIBarcode");
+               exit(1);
+       }
         
  }
  //*******************************************************************/
@@ -308,7 +477,7 @@ int TrimOligos::stripBarcode(Sequence& seq, int& group){
         }
         
  }
-//*******************************************************************/
+/*******************************************************************
  int TrimOligos::stripRBarcode(Sequence& seq, QualityScores& qual, int& group){
         try {
                 
@@ -428,7 +597,7 @@ int TrimOligos::stripRBarcode(Sequence& seq, QualityScores& qual, int& group){
         }
         
  }
-//*******************************************************************/
+/*******************************************************************
  int TrimOligos::stripRBarcode(Sequence& seq, int& group){
         try {
                 
diff --git a/trimoligos.h b/trimoligos.h

index a32b3d8e4f2d388b15b3aa68ed66fa61f33c1681..fb8f74dcb4f309387ccf3caa923354a466c9fccb 100644 (file)
--- a/trimoligos.h
+++ b/trimoligos.h
@@ -15,23 +15,30 @@
  #include "sequence.hpp"
  #include "qualityscores.h"
  
+struct oligosPair { // a forward string paired with a reverse string; TrimOligos keeps these in map<int, oligosPair> (ibarcodes, iprimers)
+       string forward; // forward member of the pair -- presumably the forward-read oligo; confirm against callers
+       string reverse; // reverse member of the pair -- presumably the reverse-read oligo; confirm against callers
+       
+       oligosPair() { forward = ""; reverse = "";  } // default: both members are empty strings
+       oligosPair(string f, string r) : forward(f), reverse(r) {} // stores f as forward and r as reverse
+       ~oligosPair() {} // empty destructor body
+};
  
  class TrimOligos {
         
         public:
          TrimOligos(int,int, map<string, int>, map<string, int>, vector<string>); //pdiffs, bdiffs, primers, barcodes, revPrimers
-        TrimOligos(int,int, int, int, map<string, int>, map<string, int>, map<string, int>, vector<string>, vector<string>, vector<string>); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimers, linker, spacer
-        TrimOligos(int,int, int, int, map<string, int>, map<string, int>, vector<string>, vector<string>, vector<string>); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimers, linker, spacer
+        TrimOligos(int,int, int, int, map<string, int>, map<string, int>, vector<string>, vector<string>, vector<string>); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimers, linker, spacer
+        TrimOligos(int,int, int, int, map<int, oligosPair>, map<int, oligosPair>, vector<string>, vector<string>); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, linker, spacer
                 ~TrimOligos();
         
                 int stripBarcode(Sequence&, int&);      
                 int stripBarcode(Sequence&, QualityScores&, int&);
-    
-        int stripRBarcode(Sequence&, int&);    
-        int stripRBarcode(Sequence&, QualityScores&, int&);
-       
+        int stripBarcode(Sequence&, Sequence&, QualityScores&, QualityScores&, int&);
+       
                 int stripForward(Sequence&, int&);
                 int stripForward(Sequence&, QualityScores&, int&, bool);
+        int stripForward(Sequence&, Sequence&, QualityScores&, QualityScores&, int&);
         
                 bool stripReverse(Sequence&);
                 bool stripReverse(Sequence&, QualityScores&);
@@ -47,11 +54,12 @@ class TrimOligos {
                 int pdiffs, bdiffs, ldiffs, sdiffs;
         
                 map<string, int> barcodes;
-        map<string, int> rbarcodes;
                 map<string, int> primers;
                 vector<string> revPrimer;
          vector<string> linker;
          vector<string> spacer;
+        map<int, oligosPair> ibarcodes;
+        map<int, oligosPair> iprimers;
         
                 MothurOut* m;
         
diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp

index 00367695944bd579c13052210b9f0d8491d0e509..0c21c8993552929a5299841a5089783f7b8c2aef 100644 (file)
--- a/trimseqscommand.cpp
+++ b/trimseqscommand.cpp
@@ -11,13 +11,15 @@
  #include "needlemanoverlap.hpp"
  #include "trimoligos.h"
  
+
  //**********************************************************************************************************************
  vector<string> TrimSeqsCommand::setParameters(){       
         try {
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
                 CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos);
                 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
+               CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount);
                 CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip);
                 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig);
                 CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop);
@@ -58,11 +60,12 @@ string TrimSeqsCommand::getHelpString(){
                 string helpString = "";
                 helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n";
                 helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n";
-               helpString += "The trim.seqs command parameters are fasta, name, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
+               helpString += "The trim.seqs command parameters are fasta, name, count, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n";
                 helpString += "The fasta parameter is required.\n";
                 helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n";
                 helpString += "The oligos parameter allows you to provide an oligos file.\n";
                 helpString += "The name parameter allows you to provide a names file with your fasta file.\n";
+        helpString += "The count parameter allows you to provide a count file with your fasta file.\n";
                 helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n";
                 helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
                 helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
@@ -111,6 +114,7 @@ string TrimSeqsCommand::getOutputFileNameTag(string type, string inputName=""){
              else if (type == "fasta")            {   outputFileName =  "fasta";   }
              else if (type == "group")            {   outputFileName =  "groups";   }
              else if (type == "name")            {   outputFileName =  "names";   }
+            else if (type == "count")            {   outputFileName =  "count_table";   }
              else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
          }
          return outputFileName;
@@ -133,6 +137,7 @@ TrimSeqsCommand::TrimSeqsCommand(){
                 outputTypes["qfile"] = tempOutNames;
                 outputTypes["group"] = tempOutNames;
                 outputTypes["name"] = tempOutNames;
+        outputTypes["count"] = tempOutNames;
         }
         catch(exception& e) {
                 m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand");
@@ -171,6 +176,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option)  {
                         outputTypes["qfile"] = tempOutNames;
                         outputTypes["group"] = tempOutNames;
                         outputTypes["name"] = tempOutNames;
+            outputTypes["count"] = tempOutNames;
                         
                         //if the user changes the input directory command factory will send this info to us in the output parameter 
                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
@@ -208,6 +214,14 @@ TrimSeqsCommand::TrimSeqsCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                                 
                         }
  
@@ -279,6 +293,13 @@ TrimSeqsCommand::TrimSeqsCommand(string option)  {
                         if (temp == "not found")        {       nameFile = "";          }
                         else if(temp == "not open")     {       nameFile = "";  abort = true;           }
                         else                                            {       nameFile = temp;        m->setNameFile(nameFile); }
+            
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not open") { abort = true; countfile = ""; }  
+                       else if (countfile == "not found") { countfile = ""; }
+                       else { m->setCountTableFile(countfile); }
+                       
+            if ((countfile != "") && (nameFile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
                         
                         temp = validParameter.validFile(parameters, "qthreshold", false);       if (temp == "not found") { temp = "0"; }
                         m->mothurConvert(temp, qThreshold);
@@ -331,10 +352,12 @@ TrimSeqsCommand::TrimSeqsCommand(string option)  {
                                 abort = true;
                         }
                         
-                       if (nameFile == "") {
-                               vector<string> files; files.push_back(fastaFile);
-                               parser.getNameFile(files);
-                       }
+            if (countfile == "") {
+                if (nameFile == "") {
+                    vector<string> files; files.push_back(fastaFile);
+                    parser.getNameFile(files);
+                }
+            }
                 }
  
         }
@@ -385,13 +408,27 @@ int TrimSeqsCommand::execute(){
                         outputTypes["name"].push_back(trimNameFile);
                         outputTypes["name"].push_back(scrapNameFile); 
                 }
+        
+        string trimCountFile = outputDir + m->getRootName(m->getSimpleName(countfile)) + "trim." + getOutputFileNameTag("count");
+               string scrapCountFile = outputDir + m->getRootName(m->getSimpleName(countfile)) + "scrap." + getOutputFileNameTag("count");
+               
+               if (countfile != "") {
+            CountTable ct;
+            ct.readTable(countfile);
+            nameCount = ct.getNameMap();
+                       outputNames.push_back(trimCountFile);
+                       outputNames.push_back(scrapCountFile);
+                       outputTypes["count"].push_back(trimCountFile);
+                       outputTypes["count"].push_back(scrapCountFile); 
+               }
+
                 
                 if (m->control_pressed) { return 0; }
                 
                 string outputGroupFileName;
                 if(oligoFile != ""){
                         createGroup = getOligos(fastaFileNames, qualFileNames, nameFileNames);
-                       if (createGroup) {
+                       if ((createGroup) && (countfile == "")){
                                 outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + getOutputFileNameTag("group");
                                 outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
                         }
@@ -401,9 +438,9 @@ int TrimSeqsCommand::execute(){
                 setLines(fastaFile, qFileName);
                 
          if(processors == 1){
-            driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
+            driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, trimCountFile, scrapCountFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
          }else{
-            createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames); 
+            createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, trimCountFile, scrapCountFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames); 
          }      
                 
                 
@@ -446,35 +483,62 @@ int TrimSeqsCommand::execute(){
                         for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
                         outputNames = outputNames2;
                         
-                       for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
-                               ifstream in;
-                               m->openInputFile(it->first, in);
-                               
-                               ofstream out;
-                               string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + getOutputFileNameTag("group");
-                               outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
-                               m->openOutputFile(thisGroupName, out);
-                               
-                               while (!in.eof()){
-                                       if (m->control_pressed) { break; }
-                                       
-                                       Sequence currSeq(in); m->gobble(in);
-                                       out << currSeq.getName() << '\t' << it->second << endl;
+            for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
+                ifstream in;
+                m->openInputFile(it->first, in);
+                
+                ofstream out;
+                string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first));
+                if (countfile == "") { thisGroupName += getOutputFileNameTag("group"); outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName); }
+                else {  thisGroupName += getOutputFileNameTag("count"); outputNames.push_back(thisGroupName); outputTypes["count"].push_back(thisGroupName);  }
+                m->openOutputFile(thisGroupName, out);
+                
+                if (countfile != "") {  out << "Representative_Sequence\ttotal\t" << it->second << endl;  }
+                
+                while (!in.eof()){
+                    if (m->control_pressed) { break; }
                      
-                    if (nameFile != "") {
-                        map<string, string>::iterator itName = nameMap.find(currSeq.getName());
-                        if (itName != nameMap.end()) { 
-                            vector<string> thisSeqsNames; 
-                            m->splitAtChar(itName->second, thisSeqsNames, ',');
-                            for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
-                                out << thisSeqsNames[k] << '\t' << it->second << endl;
-                            }
-                        }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }                                                  
+                    Sequence currSeq(in); m->gobble(in);
+                    if (countfile == "") {  
+                        out << currSeq.getName() << '\t' << it->second << endl;  
+                        
+                        if (nameFile != "") {
+                            map<string, string>::iterator itName = nameMap.find(currSeq.getName());
+                            if (itName != nameMap.end()) { 
+                                vector<string> thisSeqsNames; 
+                                m->splitAtChar(itName->second, thisSeqsNames, ',');
+                                for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
+                                    out << thisSeqsNames[k] << '\t' << it->second << endl;
+                                }
+                            }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }                                                      
+                        }
+                    }else { 
+                        map<string, int>::iterator itTotalReps = nameCount.find(currSeq.getName());
+                        if (itTotalReps != nameCount.end()) { out << currSeq.getName() << '\t' << itTotalReps->second << '\t' << itTotalReps->second << endl; }
+                        else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); }
                      }
-                               }
-                               in.close();
-                               out.close();
-                       }
+                }
+                in.close();
+                out.close();
+            }
+            
+            if (countfile != "") { //create countfile with group info included
+                CountTable* ct = new CountTable();
+                ct->readTable(trimCountFile);
+                map<string, int> justTrimmedNames = ct->getNameMap();
+                delete ct;
+                
+                CountTable newCt;
+                for (map<string, int>::iterator itCount = groupCounts.begin(); itCount != groupCounts.end(); itCount++) { newCt.addGroup(itCount->first); }
+                vector<int> tempCounts; tempCounts.resize(groupCounts.size(), 0);
+                for (map<string, int>::iterator itNames = justTrimmedNames.begin(); itNames != justTrimmedNames.end(); itNames++) {
+                    newCt.push_back(itNames->first, tempCounts); //add it to the table with no abundance so we can set the groups abundance
+                    map<string, string>::iterator it2 = groupMap.find(itNames->first);
+                    if (it2 != groupMap.end()) { newCt.setAbund(itNames->first, it2->second, itNames->second); }
+                    else { m->mothurOut("[ERROR]: missing group info for " + itNames->first + "."); m->mothurOutEndLine(); m->control_pressed = true; }
+                }
+                newCt.printTable(trimCountFile);
+            }
                 }
                 
                 if (m->control_pressed) {       for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } return 0;    }
@@ -511,6 +575,11 @@ int TrimSeqsCommand::execute(){
                 if (itTypes != outputTypes.end()) {
                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
                 }
+        
+        itTypes = outputTypes.find("count");
+               if (itTypes != outputTypes.end()) {
+                       if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
+               }
  
                 m->mothurOutEndLine();
                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
@@ -527,8 +596,7 @@ int TrimSeqsCommand::execute(){
  }
                 
  /**************************************************************************************/
-
-int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair line, linePair qline) {     
+int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string trimCFileName, string scrapCFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair line, linePair qline) {        
                 
         try {
                 
@@ -552,9 +620,16 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                         m->openOutputFile(scrapNFileName, scrapNameFile);
                 }
                 
+        ofstream trimCountFile;
+               ofstream scrapCountFile;
+               if(countfile != ""){
+                       m->openOutputFile(trimCFileName, trimCountFile);
+                       m->openOutputFile(scrapCFileName, scrapCountFile);
+            if (line.start == 0) { trimCountFile << "Representative_Sequence\ttotal" << endl; scrapCountFile << "Representative_Sequence\ttotal" << endl; }
+               }
                 
                 ofstream outGroupsFile;
-               if (createGroup){       m->openOutputFile(groupFileName, outGroupsFile);   }
+               if ((createGroup) && (countfile == "")){        m->openOutputFile(groupFileName, outGroupsFile);   }
                 if(allFiles){
                         for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
                                 for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
@@ -585,20 +660,17 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                 
                 int count = 0;
                 bool moreSeqs = 1;
-               TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer);
+               TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer);
         
                 while (moreSeqs) {
                                 
                         if (m->control_pressed) { 
                                 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
-                               if (createGroup) {       outGroupsFile.close();   }
-
-                               if(qFileName != ""){
-                                       qFile.close();
-                               }
-                               for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); }
-
-                               return 0;
+                               if ((createGroup) && (countfile == "")) {        outGroupsFile.close();   }
+                if(qFileName != "")    {       qFile.close();  scrapQualFile.close(); trimQualFile.close();    }
+                if(nameFile != "")     {       scrapNameFile.close(); trimNameFile.close();    }
+                if(countfile != "")    {       scrapCountFile.close(); trimCountFile.close();  }
+                               for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } return 0;
                         }
                         
                         int success = 1;
@@ -611,7 +683,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                         QualityScores currQual;
                         if(qFileName != ""){
                                 currQual = QualityScores(qFile);  m->gobble(qFile);
-                if ((m->debug)&&(count>15800)) { m->mothurOut("[DEBUG]: " + toString(count) + " fasta = " + currSeq.getName() + '\n'); m->mothurOut("[DEBUG]: " + toString(getpid()) + '\n'); }
+                //cout << currQual.getName() << endl;
                         }
                         
                         string origSeq = currSeq.getUnaligned();
@@ -632,12 +704,6 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                                         if(success > bdiffs)            {       trashCode += 'b';       }
                                         else{ currentSeqsDiffs += success;  }
                                 }
-                
-                if(rbarcodes.size() != 0){
-                                       success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex);
-                                       if(success > bdiffs)            {       trashCode += 'b';       }
-                                       else{ currentSeqsDiffs += success;  }
-                               }
                                 
                  if(numSpacers != 0){
                                         success = trimOligos.stripSpacer(currSeq, currQual);
@@ -704,6 +770,8 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                                         }
                                 }
                                 
+                if (m->debug) { m->mothurOut("[DEBUG]: " + currSeq.getName() + ", trashcode= " + trashCode); if (trashCode.length() != 0) { m->mothurOutEndLine(); } }
+                
                                 if(trashCode.length() == 0){
                                         currSeq.setAligned(currSeq.getUnaligned());
                                         currSeq.printSequence(trimFASTAFile);
@@ -718,6 +786,15 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                                                 if (itName != nameMap.end()) {  trimNameFile << itName->first << '\t' << itName->second << endl; }
                                                 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
                                         }
+                    
+                    int numRedundants = 0;
+                    if (countfile != "") {
+                        map<string, int>::iterator itCount = nameCount.find(currSeq.getName());
+                        if (itCount != nameCount.end()) { 
+                            trimCountFile << itCount->first << '\t' << itCount->second << endl;
+                            numRedundants = itCount->second-1;
+                        }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); }
+                    }
                                         
                                         if (createGroup) {
                                                 if(barcodes.size() != 0){
@@ -732,9 +809,11 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                                                                 } 
                                                         }
                                                         
-                                                       outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
+                            if (m->debug) { m->mothurOut(", group= " + thisGroup + "\n"); }
+                            
+                                                       if (countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; }
+                            else {   groupMap[currSeq.getName()] = thisGroup; }
                                                         
-                            int numRedundants = 0;
                                                         if (nameFile != "") {
                                                                 map<string, string>::iterator itName = nameMap.find(currSeq.getName());
                                                                 if (itName != nameMap.end()) { 
@@ -782,6 +861,13 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                                                 if (itName != nameMap.end()) {  scrapNameFile << itName->first << '\t' << itName->second << endl; }
                                                 else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); }
                                         }
+                    if (countfile != "") {
+                        map<string, int>::iterator itCount = nameCount.find(currSeq.getName());
+                        if (itCount != nameCount.end()) { 
+                            trimCountFile << itCount->first << '\t' << itCount->second << endl;
+                        }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); }
+                    }
+                    
                                         currSeq.setName(currSeq.getName() + '|' + trashCode);
                                         currSeq.setUnaligned(origSeq);
                                         currSeq.setAligned(origSeq);
@@ -815,6 +901,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                 if (createGroup) {       outGroupsFile.close();   }
                 if(qFileName != "")     {       qFile.close();  scrapQualFile.close(); trimQualFile.close();    }
                 if(nameFile != "")      {       scrapNameFile.close(); trimNameFile.close();    }
+        if(countfile != "")    {       scrapCountFile.close(); trimCountFile.close();  }
                 
                 return count;
         }
@@ -826,7 +913,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
  
  /**************************************************************************************************/
  
-int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames) {
+int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string trimCountFileName, string scrapCountFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames) {
         try {
          
          int process = 1;
@@ -877,6 +964,8 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName
                                                                  (scrapQualFileName + toString(getpid()) + ".temp"),
                                                                  (trimNameFileName + toString(getpid()) + ".temp"),
                                                                  (scrapNameFileName + toString(getpid()) + ".temp"),
+                                 (trimCountFileName + toString(getpid()) + ".temp"),
+                                                                (scrapCountFileName + toString(getpid()) + ".temp"),
                                                                  (groupFile + toString(getpid()) + ".temp"),
                                                                  tempFASTAFileNames,
                                                                  tempPrimerQualFileNames,
@@ -897,6 +986,11 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName
                                         for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
                                                 out << it->first << '\t' << it->second << endl;
                                         }
+                    
+                    out << groupMap.size() << endl;
+                    for (map<string, string>::iterator it = groupMap.begin(); it != groupMap.end(); it++) {
+                                               out << it->first << '\t' << it->second << endl;
+                                       }
                                         out.close();
                                 }
                                 exit(0);
@@ -919,8 +1013,12 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName
                         m->openOutputFile(trimNameFileName, temp);              temp.close();
                         m->openOutputFile(scrapNameFileName, temp);             temp.close();
                 }
+        if (countfile != "") {
+                       m->openOutputFile(trimCountFileName, temp);             temp.close();
+                       m->openOutputFile(scrapCountFileName, temp);            temp.close();
+               }
  
-               driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
+               driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, trimCountFileName, scrapCountFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
                 
                 //force parent to wait until all the processes are done
                 for (int i=0;i<processIDS.size();i++) { 
@@ -970,22 +1068,24 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName
  
              
                         trimData* tempTrim = new trimData(filename,
-                                              qFileName, nameFile,
+                                              qFileName, nameFile, countfile,
                                                (trimFASTAFileName+extension),
                                                (scrapFASTAFileName+extension),
                                                (trimQualFileName+extension),
                                                (scrapQualFileName+extension),
                                                (trimNameFileName+extension),
                                                (scrapNameFileName+extension),
+                                              (trimCountFileName+extension),
+                                              (scrapCountFileName+extension),
                                                (groupFile+extension),
                                                tempFASTAFileNames,
                                                tempPrimerQualFileNames,
                                                tempNameFileNames,
                                                lines[i].start, lines[i].end, qLines[i].start, qLines[i].end, m,
-                                              pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer, 
+                                              pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, revPrimer, linker, spacer, 
                                               primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast,
                                                qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage,
-                                             minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap);
+                                             minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap, nameCount);
                         pDataArray.push_back(tempTrim);
              
                         hThreadArray[i] = CreateThread(NULL, 0, MyTrimThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
@@ -1004,7 +1104,7 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName
                         m->openOutputFile(scrapNameFileName, temp);             temp.close();
                 }
          
-               driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (groupFile + toString(processors-1) + ".temp"), fastaFileNames, qualFileNames, nameFileNames, lines[processors-1], qLines[processors-1]);
+               driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (trimCountFileName + toString(processors-1) + ".temp"), (scrapCountFileName + toString(processors-1) + ".temp"), (groupFile + toString(processors-1) + ".temp"), fastaFileNames, qualFileNames, nameFileNames, lines[processors-1], qLines[processors-1]);
          processIDS.push_back(processors-1);
  
          
@@ -1018,6 +1118,11 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName
                  if (it2 == groupCounts.end()) {        groupCounts[it->first] = it->second; }
                  else { groupCounts[it->first] += it->second; }
              }
+            for (map<string, string>::iterator it = pDataArray[i]->groupMap.begin(); it != pDataArray[i]->groupMap.end(); it++) {
+                map<string, string>::iterator it2 = groupMap.find(it->first);
+                if (it2 == groupMap.end()) {   groupMap[it->first] = it->second; }
+                else { m->mothurOut("[ERROR]: " + it->first + " is in your fasta file more than once. Sequence names must be unique. please correct.\n");  }
+            }
              CloseHandle(hThreadArray[i]);
                         delete pDataArray[i];
                 }
@@ -1048,8 +1153,15 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName
                                 m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName);
                                 m->mothurRemove((scrapNameFileName + toString(processIDS[i]) + ".temp"));
                         }
+            
+            if(countfile != ""){
+                               m->appendFiles((trimCountFileName + toString(processIDS[i]) + ".temp"), trimCountFileName);
+                               m->mothurRemove((trimCountFileName + toString(processIDS[i]) + ".temp"));
+                               m->appendFiles((scrapCountFileName + toString(processIDS[i]) + ".temp"), scrapCountFileName);
+                               m->mothurRemove((scrapCountFileName + toString(processIDS[i]) + ".temp"));
+                       }
                         
-                       if(createGroup){
+                       if((createGroup)&&(countfile == "")){
                                 m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile);
                                 m->mothurRemove((groupFile + toString(processIDS[i]) + ".temp"));
                         }
@@ -1087,14 +1199,27 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName
                                 in >> tempNum; m->gobble(in);
                                 
                                 if (tempNum != 0) {
-                                       while (!in.eof()) { 
-                                               in >> group >> tempNum; m->gobble(in);
+                                       for (int i = 0; i < tempNum; i++) { 
+                        int groupNum;
+                                               in >> group >> groupNum; m->gobble(in);
                          
                                                 map<string, int>::iterator it = groupCounts.find(group);
-                                               if (it == groupCounts.end()) {  groupCounts[group] = tempNum; }
-                                               else { groupCounts[it->first] += tempNum; }
+                                               if (it == groupCounts.end()) {  groupCounts[group] = groupNum; }
+                                               else { groupCounts[it->first] += groupNum; }
                                         }
                                 }
+                in >> tempNum; m->gobble(in);
+                if (tempNum != 0) {
+                                       for (int i = 0; i < tempNum; i++) { 
+                        string group, seqName;
+                                               in >> seqName >> group; m->gobble(in);
+                        
+                                               map<string, string>::iterator it = groupMap.find(seqName);
+                                               if (it == groupMap.end()) {     groupMap[seqName] = group; }
+                                               else { m->mothurOut("[ERROR]: " + seqName + " is in your fasta file more than once. Sequence names must be unique. please correct.\n");  }
+                                       }
+                               }
+                
                                 in.close(); m->mothurRemove(tempFile);
                         }
              #endif
@@ -1255,7 +1380,9 @@ bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<
                 while(!inOligos.eof()){
  
                         inOligos >> type; 
-                                       
+            
+                       if (m->debug) { m->mothurOut("[DEBUG]: reading type - " + type + ".\n"); }      
+            
                         if(type[0] == '#'){
                                 while (!inOligos.eof()) {       char c = inOligos.get();  if (c == 10 || c == 13){      break;  }       } // get rest of line if there's any crap there
                                 m->gobble(inOligos);
@@ -1266,6 +1393,8 @@ bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<
                                 for(int i=0;i<type.length();i++){       type[i] = toupper(type[i]);  }
                                 
                                 inOligos >> oligo;
+                
+                if (m->debug) { m->mothurOut("[DEBUG]: reading - " + oligo + ".\n"); }
                                 
                                 for(int i=0;i<oligo.length();i++){
                                         oligo[i] = toupper(oligo[i]);
@@ -1287,6 +1416,8 @@ bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<
                                         map<string, int>::iterator itPrime = primers.find(oligo);
                                         if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
                                         
+                    if (m->debug) {  if (group != "") { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); }else{ m->mothurOut("[DEBUG]: no group for primer " + oligo + ".\n"); }  }
+                    
                                         primers[oligo]=indexPrimer; indexPrimer++;              
                                         primerNameVector.push_back(group);
                                 }
@@ -1298,33 +1429,11 @@ bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<
                                 }
                                 else if(type == "BARCODE"){
                                         inOligos >> group;
-                    
-                    //barcode lines can look like   BARCODE   atgcatgc   groupName  - for 454 seqs
-                    //or                            BARCODE   atgcatgc   atgcatgc    groupName  - for illumina data that has forward and reverse info
-                                       string temp = "";
-                    while (!inOligos.eof())    {       
-                                               char c = inOligos.get(); 
-                                               if (c == 10 || c == 13){        break;  }
-                                               else if (c == 32 || c == 9){;} //space or tab
-                                               else {  temp += c;  }
-                                       } 
                                         
-                    //then this is illumina data with 4 columns
-                    if (temp != "") {  
-                        string reverseBarcode = reverseOligo(group); //reverse barcode
-                        group = temp;
-                        
-                        //check for repeat barcodes
-                        map<string, int>::iterator itBar = rbarcodes.find(reverseBarcode);
-                        if (itBar != rbarcodes.end()) { m->mothurOut("barcode " + reverseBarcode + " is in your oligos file already."); m->mothurOutEndLine();  }
-                                               
-                        rbarcodes[reverseBarcode]=indexBarcode; 
-                    }
-                        
                                         //check for repeat barcodes
                                         map<string, int>::iterator itBar = barcodes.find(oligo);
                                         if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
-                                               
+                    
                                         barcodes[oligo]=indexBarcode; indexBarcode++;
                                         barcodeNameVector.push_back(group);
                                 }else if(type == "LINKER"){
@@ -1332,7 +1441,7 @@ bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<
                                 }else if(type == "SPACER"){
                                         spacer.push_back(oligo);
                                 }
-                               else{   m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine();  }
+                               else{   m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
                         }
                         m->gobble(inOligos);
                 }       
@@ -1370,6 +1479,7 @@ bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<
                                         string fastaFileName = "";
                                         string qualFileName = "";
                                         string nameFileName = "";
+                    string countFileName = "";
                                         
                                         if(primerName == ""){
                                                 comboGroupName = barcodeNameVector[itBar->second];
@@ -1416,7 +1526,6 @@ bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<
                                                 nameFileNames[itBar->second][itPrimer->second] = nameFileName;
                                                 m->openOutputFile(nameFileName, temp);          temp.close();
                                         }
-                                       
                                 }
                         }
                 }
@@ -1438,7 +1547,7 @@ bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<
                                 break;
                         }
                 }
-               
+
                 if (allBlank) {
                         m->mothurOut("[WARNING]: your oligos file does not contain any group names.  mothur will not create a groupfile."); m->mothurOutEndLine();
                         allFiles = false;
diff --git a/trimseqscommand.h b/trimseqscommand.h

index 957f37a65bfb1c177f000c9c6d031ddfc3784c81..1ffad218ccee951e7ac9097d04f84327099fda9d 100644 (file)
--- a/trimseqscommand.h
+++ b/trimseqscommand.h
@@ -14,8 +14,8 @@
  #include "command.hpp"
  #include "sequence.hpp"
  #include "qualityscores.h"
-#include "groupmap.h"
  #include "trimoligos.h"
+#include "counttable.h"
  
  
  class TrimSeqsCommand : public Command {
@@ -36,16 +36,13 @@ public:
         void help() { m->mothurOut(getHelpString()); }  
         
  private:
-       
-       GroupMap* groupMap;
-    
      struct linePair {
          unsigned long long start;
          unsigned long long end;
          linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
          linePair() {}
      };
-
+    
         bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
         bool keepFirstTrim(Sequence&, QualityScores&);
         bool removeLastTrim(Sequence&, QualityScores&);
@@ -55,7 +52,7 @@ private:
      string reverseOligo(string);
  
         bool abort, createGroup;
-       string fastaFile, oligoFile, qFileName, groupfile, nameFile, outputDir;
+       string fastaFile, oligoFile, qFileName, groupfile, nameFile, countfile, outputDir;
         
         bool flip, allFiles, qtrim, keepforward;
         int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts;
@@ -64,7 +61,6 @@ private:
         vector<string> revPrimer, outputNames;
         set<string> filesToRemove;
         map<string, int> barcodes;
-    map<string, int> rbarcodes;
         vector<string> groupVector;
         map<string, int> primers;
      vector<string>  linker;
@@ -75,13 +71,15 @@ private:
         vector<string> barcodeNameVector;       //needed here?
         map<string, int> groupCounts;  
         map<string, string> nameMap;
+    map<string, int> nameCount; //for countfile name -> repCount
+    map<string, string> groupMap; //for countfile name -> group
  
         vector<int> processIDS;   //processid
         vector<linePair> lines;
         vector<linePair> qLines;
         
-       int driverCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair, linePair);    
-       int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >);
+       int driverCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair, linePair);    
+       int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >);
         int setLines(string, string);
  };
  
@@ -92,7 +90,7 @@ private:
  struct trimData {
      unsigned long long start, end;
      MothurOut* m;
-    string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, groupFileName, nameFile;
+    string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, trimCFileName, scrapCFileName, groupFileName, nameFile, countfile;
         vector<vector<string> > fastaFileNames;
      vector<vector<string> > qualFileNames;
      vector<vector<string> > nameFileNames;
@@ -103,8 +101,8 @@ struct trimData {
         double qRollAverage, qThreshold, qWindowAverage, qAverage;
      vector<string> revPrimer;
         map<string, int> barcodes;
-    map<string, int> rbarcodes;
         map<string, int> primers;
+    map<string, int> nameCount;
      vector<string>  linker;
      vector<string>  spacer;
         map<string, int> combos;
@@ -112,22 +110,26 @@ struct trimData {
         vector<string> barcodeNameVector;       
         map<string, int> groupCounts;  
         map<string, string> nameMap;
+    map<string, string> groupMap;
      
         trimData(){}
-       trimData(string fn, string qn, string nf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend,  MothurOut* mout,
-                      int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, map<string, int> rbar, vector<string> revP, vector<string> li, vector<string> spa, 
+       trimData(string fn, string qn, string nf, string cf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string tcn, string scn,string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend,  MothurOut* mout,
+                      int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, vector<string> revP, vector<string> li, vector<string> spa, 
                        vector<string> priNameVector, vector<string> barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
                        int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage,
-                      int minL, int maxA, int maxH, int maxL, bool fli, map<string, string> nm) {
+                      int minL, int maxA, int maxH, int maxL, bool fli, map<string, string> nm, map<string, int> ncount) {
          filename = fn;
          qFileName = qn;
          nameFile = nf;
+        countfile = cf;
          trimFileName = tn;
          scrapFileName = sn;
          trimQFileName = tqn;
          scrapQFileName = sqn;
          trimNFileName = tnn;
          scrapNFileName = snn;
+        trimCFileName = tcn;
+        scrapCFileName = scn;
          groupFileName = gn;
          fastaFileNames = ffn;
          qualFileNames = qfn;
@@ -137,6 +139,7 @@ struct trimData {
          qlineStart = qstart;
          qlineEnd = qend;
                 m = mout;
+        nameCount = ncount;
          
          pdiffs = pd;
          bdiffs = bd;
@@ -144,7 +147,6 @@ struct trimData {
          sdiffs = sd;
          tdiffs = td;
          barcodes = bar;
-        rbarcodes = rbar;
          primers = pri;      numFPrimers = primers.size();
          revPrimer = revP;   numRPrimers = revPrimer.size();
          linker = li;        numLinkers = linker.size();
@@ -203,7 +205,7 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
                 
                 
                 ofstream outGroupsFile;
-               if (pDataArray->createGroup){   pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile);   }
+               if ((pDataArray->createGroup) && (pDataArray->countfile == "")){        pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile);   }
                 if(pDataArray->allFiles){
                         for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
                                 for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
@@ -222,6 +224,14 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
                         }
                 }
                 
+        ofstream trimCountFile;
+               ofstream scrapCountFile;
+               if(pDataArray->countfile != ""){
+                       pDataArray->m->openOutputFile(pDataArray->trimCFileName, trimCountFile);
+                       pDataArray->m->openOutputFile(pDataArray->scrapCFileName, scrapCountFile);
+            if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) { trimCountFile << "Representative_Sequence\ttotal" << endl; scrapCountFile << "Representative_Sequence\ttotal" << endl; }
+               }
+        
                 ifstream inFASTA;
                 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
                 if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) {
@@ -241,14 +251,18 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
                 }
                 
                 
-               TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->rbarcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
+               TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
          
                 pDataArray->count = pDataArray->lineEnd;
                 for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process
                                    
                         if (pDataArray->m->control_pressed) { 
                                 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
-                               if (pDataArray->createGroup) {   outGroupsFile.close();   }
+                               if ((pDataArray->createGroup) && (pDataArray->countfile == "")) {        outGroupsFile.close();   }
+                if(pDataArray->qFileName != "")        {       qFile.close();  scrapQualFile.close(); trimQualFile.close();    }
+                if(pDataArray->nameFile != "") {       scrapNameFile.close(); trimNameFile.close();    }
+                if(pDataArray->countfile != "")        {       scrapCountFile.close(); trimCountFile.close();  }
+
                                 if(pDataArray->qFileName != ""){ qFile.close(); }
                                 return 0;
                         }
@@ -282,12 +296,6 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
                                         else{ currentSeqsDiffs += success;  }
                                 }
                  
-                               if(pDataArray->rbarcodes.size() != 0){
-                                       success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex);
-                                       if(success > pDataArray->bdiffs)                {       trashCode += 'b';       }
-                                       else{ currentSeqsDiffs += success;  }
-                               }
-                
                  if(pDataArray->numSpacers != 0){
                                         success = trimOligos.stripSpacer(currSeq, currQual);
                                         if(success > pDataArray->sdiffs)                {       trashCode += 's';       }
@@ -399,6 +407,15 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
                                                 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
                                         }
                                         
+                    int numRedundants = 0;
+                    if (pDataArray->countfile != "") {
+                        map<string, int>::iterator itCount = pDataArray->nameCount.find(currSeq.getName());
+                        if (itCount != pDataArray->nameCount.end()) { 
+                            trimCountFile << itCount->first << '\t' << itCount->second << endl;
+                            numRedundants = itCount->second-1;
+                        }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
+                    }
+                                       
                                         if (pDataArray->createGroup) {
                                                 if(pDataArray->barcodes.size() != 0){
                                                         string thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
@@ -412,9 +429,9 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
                                                                 } 
                                                         }
                                                         
-                                                       outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
+                                                       if (pDataArray->countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; }
+                            else {   pDataArray->groupMap[currSeq.getName()] = thisGroup; }
                                                         
-                            int numRedundants = 0;
                                                         if (pDataArray->nameFile != "") {
                                                                 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
                                                                 if (itName != pDataArray->nameMap.end()) { 
@@ -462,6 +479,12 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
                                                 if (itName != pDataArray->nameMap.end()) {  scrapNameFile << itName->first << '\t' << itName->second << endl; }
                                                 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
                                         }
+                    if (pDataArray->countfile != "") {
+                        map<string, int>::iterator itCount = pDataArray->nameCount.find(currSeq.getName());
+                        if (itCount != pDataArray->nameCount.end()) { 
+                            trimCountFile << itCount->first << '\t' << itCount->second << endl;
+                        }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
+                    }
                                         currSeq.setName(currSeq.getName() + '|' + trashCode);
                                         currSeq.setUnaligned(origSeq);
                                         currSeq.setAligned(origSeq);
diff --git a/unifracunweightedcommand.cpp b/unifracunweightedcommand.cpp

index 0749cb79ff41e03c55ca2a9defeb1b70e44645d8..edc4bbc027edf761568919163ca8f696096c7900 100644 (file)
--- a/unifracunweightedcommand.cpp
+++ b/unifracunweightedcommand.cpp
@@ -16,8 +16,9 @@
  vector<string> UnifracUnweightedCommand::setParameters(){      
         try {
                 CommandParameter ptree("tree", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptree);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
                 CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
@@ -42,7 +43,7 @@ vector<string> UnifracUnweightedCommand::setParameters(){
  string UnifracUnweightedCommand::getHelpString(){      
         try {
                 string helpString = "";
-               helpString += "The unifrac.unweighted command parameters are tree, group, name, groups, iters, distance, processors, root and random.  tree parameter is required unless you have valid current tree file.\n";
+               helpString += "The unifrac.unweighted command parameters are tree, group, name, count, groups, iters, distance, processors, root and random.  tree parameter is required unless you have valid current tree file.\n";
                 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed.  You must enter at least 1 valid group.\n";
                 helpString += "The group names are separated by dashes.  The iters parameter allows you to specify how many random trees you would like compared to your tree.\n";
                 helpString += "The distance parameter allows you to create a distance file from the results. The default is false. You may set distance to lt, square or column.\n";
@@ -165,6 +166,14 @@ UnifracUnweightedCommand::UnifracUnweightedCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
                         
              //check for required parameters
@@ -186,6 +195,19 @@ UnifracUnweightedCommand::UnifracUnweightedCommand(string option)  {
                         if (namefile == "not open") { namefile = ""; abort = true; }
                         else if (namefile == "not found") { namefile = ""; }
                         else { m->setNameFile(namefile); }
+            
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not open") { countfile = ""; abort = true; }
+                       else if (countfile == "not found") { countfile = "";  } 
+                       else { m->setCountTableFile(countfile); }
+            
+            if ((namefile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
+            }
+                       
+            if ((groupfile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
+            }
                         
                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(treefile);       }
                         
@@ -233,7 +255,13 @@ UnifracUnweightedCommand::UnifracUnweightedCommand(string option)  {
                         consensus = m->isTrue(temp);
              
                         if (subsample && random) {  m->mothurOut("[ERROR]: random must be false, if subsample=t.\n"); abort=true;  } 
-                       if (subsample && (groupfile == "")) {  m->mothurOut("[ERROR]: if subsample=t, a group file must be provided.\n"); abort=true;  } 
+            if (countfile == "") { if (subsample && (groupfile == "")) {  m->mothurOut("[ERROR]: if subsample=t, a group file must be provided.\n"); abort=true;  } }
+            else {  
+                CountTable testCt; 
+                if ((!testCt.testGroups(countfile)) && (subsample)) {
+                    m->mothurOut("[ERROR]: if subsample=t, a count file with group info must be provided.\n"); abort=true;  
+                }
+            }
              if (subsample && (!phylip)) { phylip=true; outputForm = "lt"; }
              if (consensus && (!subsample)) { m->mothurOut("[ERROR]: you cannot use consensus without subsample.\n"); abort=true; }
  
@@ -246,10 +274,12 @@ UnifracUnweightedCommand::UnifracUnweightedCommand(string option)  {
                                 m->setGroups(Groups);
                         }
                         
-                       if (namefile == "") {
-                               vector<string> files; files.push_back(treefile);
-                               parser.getNameFile(files);
-                       }
+                       if (countfile=="") {
+                if (namefile == "") {
+                    vector<string> files; files.push_back(treefile);
+                    parser.getNameFile(files);
+                } 
+            }
                 }
                 
         }
@@ -267,12 +297,12 @@ int UnifracUnweightedCommand::execute() {
                 
                 m->setTreeFile(treefile);
                 
-               TreeReader* reader = new TreeReader(treefile, groupfile, namefile);
+               TreeReader* reader;
+        if (countfile == "") { reader = new TreeReader(treefile, groupfile, namefile); }
+        else { reader = new TreeReader(treefile, countfile); }
          T = reader->getTrees();
-        tmap = T[0]->getTreeMap();
-        map<string, string> nameMap = reader->getNames();
-        map<string, string> unique2Dup = reader->getNameMap();
-        delete reader; 
+        ct = T[0]->getCountTable();
+        delete reader;
          
                 sumFile = outputDir + m->getRootName(m->getSimpleName(treefile)) + getOutputFileNameTag("uwsummary");
                 outputNames.push_back(sumFile); outputTypes["uwsummary"].push_back(sumFile);
@@ -280,7 +310,7 @@ int UnifracUnweightedCommand::execute() {
                 
                 SharedUtil util;
                 Groups = m->getGroups();
-               vector<string> namesGroups = tmap->getNamesOfGroups();
+               vector<string> namesGroups = ct->getNamesOfGroups();
                 util.setGroups(Groups, namesGroups, allGroups, numGroups, "unweighted");        //sets the groups the user wants to analyze
                 
                 Unweighted unweighted(includeRoot);
@@ -292,10 +322,9 @@ int UnifracUnweightedCommand::execute() {
              //user has not set size, set size = smallest samples size
              if (subsampleSize == -1) { 
                  vector<string> temp; temp.push_back(Groups[0]);
-                subsampleSize = (tmap->getNamesSeqs(temp)).size(); //num in first group
+                subsampleSize = ct->getGroupCount(Groups[0]); //num in first group
                  for (int i = 1; i < Groups.size(); i++) {
-                    temp.clear(); temp.push_back(Groups[i]);
-                    int thisSize = (tmap->getNamesSeqs(temp)).size();
+                    int thisSize = ct->getGroupCount(Groups[i]);
                      if (thisSize < subsampleSize) {    subsampleSize = thisSize;       }
                  }
                  m->mothurOut("\nSetting subsample size to " + toString(subsampleSize) + ".\n\n");
@@ -303,9 +332,7 @@ int UnifracUnweightedCommand::execute() {
                  vector<string> newGroups = Groups;
                  Groups.clear();
                  for (int i = 0; i < newGroups.size(); i++) {
-                    vector<string> thisGroup; thisGroup.push_back(newGroups[i]);
-                    vector<string> thisGroupsSeqs = tmap->getNamesSeqs(thisGroup);
-                    int thisSize = thisGroupsSeqs.size();
+                    int thisSize = ct->getGroupCount(newGroups[i]);
                      
                      if (thisSize >= subsampleSize) {    Groups.push_back(newGroups[i]);        }
                      else {   m->mothurOut("You have selected a size that is larger than "+newGroups[i]+" number of sequences, removing "+newGroups[i]+".\n"); }
@@ -330,7 +357,7 @@ int UnifracUnweightedCommand::execute() {
          
                 //get pscores for users trees
                 for (int i = 0; i < T.size(); i++) {
-                       if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }outSum.close(); for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);  } return 0; }
+                       if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; }outSum.close(); for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);  } return 0; }
                         
              counter = 0;
                         
@@ -351,7 +378,7 @@ int UnifracUnweightedCommand::execute() {
  
                         userData = unweighted.getValues(T[i], processors, outputDir);  //userData[0] = unweightedscore
                 
-                       if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output;  } outSum.close();  for (int i = 0; i < outputNames.size(); i++) {       m->mothurRemove(outputNames[i]);  }return 0; }
+                       if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output;  } outSum.close();  for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]);  }return 0; }
                         
                         //output scores for each combination
                         for(int k = 0; k < numComp; k++) {
@@ -366,7 +393,7 @@ int UnifracUnweightedCommand::execute() {
              
              if (random) {  runRandomCalcs(T[i], userData);  }
                         
-                       if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output;  } outSum.close(); for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);  } return 0;  }
+                       if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output;  } outSum.close(); for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);  } return 0;  }
              
              int startSubsample = time(NULL);
              
@@ -376,32 +403,28 @@ int UnifracUnweightedCommand::execute() {
                  if (m->control_pressed) { break; }
                  
                  //copy to preserve old one - would do this in subsample but memory cleanup becomes messy.
-                TreeMap* newTmap = new TreeMap();
-                //newTmap->getCopy(*tmap);
-                
-                //SubSample sample;
-                //Tree* subSampleTree = sample.getSample(T[i], newTmap, nameMap, subsampleSize);
-                
+                CountTable* newCt = new CountTable();
+                 
                  //uses method of setting groups to doNotIncludeMe
                  SubSample sample;
-                Tree* subSampleTree = sample.getSample(T[i], tmap, newTmap, subsampleSize, unique2Dup);
+                Tree* subSampleTree = sample.getSample(T[i], ct, newCt, subsampleSize);
                  
                  //call new weighted function
                  vector<double> iterData; iterData.resize(numComp,0);
                  Unweighted thisUnweighted(includeRoot);
                  iterData = thisUnweighted.getValues(subSampleTree, processors, outputDir); //userData[0] = weightedscore
-                
+        
                  //save data to make ave dist, std dist
                  calcDistsTotals.push_back(iterData);
                  
-                delete newTmap;
+                delete newCt;
                  delete subSampleTree;
                  
                  if((thisIter+1) % 100 == 0){   m->mothurOut(toString(thisIter+1)); m->mothurOutEndLine();              }
              }
-            m->mothurOut("It took " + toString(time(NULL) - startSubsample) + " secs to run the subsampling."); m->mothurOutEndLine();
+            if (subsample) { m->mothurOut("It took " + toString(time(NULL) - startSubsample) + " secs to run the subsampling."); m->mothurOutEndLine(); }
              
-            if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output;  } outSum.close(); for (int i = 0; i < outputNames.size(); i++) {   m->mothurRemove(outputNames[i]);  } return 0;  }
+            if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output;  } outSum.close(); for (int i = 0; i < outputNames.size(); i++) {     m->mothurRemove(outputNames[i]);  } return 0;  }
  
              if (subsample) {  getAverageSTDMatrices(calcDistsTotals, i); }
              if (consensus) {  getConsensusTrees(calcDistsTotals, i);  }
@@ -420,7 +443,7 @@ int UnifracUnweightedCommand::execute() {
                 
  
                 outSum.close();
-               delete tmap; 
+               delete ct; 
                 for (int i = 0; i < T.size(); i++) { delete T[i]; }
                 
                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);  }     return 0; }
@@ -472,7 +495,7 @@ int UnifracUnweightedCommand::getAverageSTDMatrices(vector< vector<double> >& di
          //find standard deviation
          vector<double> stdDev; stdDev.resize(numComp, 0);
          
-        for (int thisIter = 0; thisIter < iters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each
+        for (int thisIter = 0; thisIter < subsampleIters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each
              for (int j = 0; j < dists[thisIter].size(); j++) {
                  stdDev[j] += ((dists[thisIter][j] - averages[j]) * (dists[thisIter][j] - averages[j]));
              }
@@ -578,8 +601,16 @@ int UnifracUnweightedCommand::getConsensusTrees(vector< vector<double> >& dists,
          m->runParse = false;
          
          //create treemap class from groupmap for tree class to use
-        TreeMap newTmap;
-        newTmap.makeSim(m->getGroups());
+        CountTable newCt;
+        set<string> nameMap;
+        map<string, string> groupMap;
+        set<string> gps;
+        for (int i = 0; i < m->getGroups().size(); i++) { 
+            nameMap.insert(m->getGroups()[i]); 
+            gps.insert(m->getGroups()[i]); 
+            groupMap[m->getGroups()[i]] = m->getGroups()[i];
+        }
+        newCt.createTable(nameMap, groupMap, gps);
          
          //clear  old tree names if any
          m->Treenames.clear();
@@ -587,7 +618,7 @@ int UnifracUnweightedCommand::getConsensusTrees(vector< vector<double> >& dists,
          //fills globaldatas tree names
          m->Treenames = m->getGroups();
          
-        vector<Tree*> newTrees = buildTrees(dists, treeNum, newTmap); //also creates .all.tre file containing the trees created
+        vector<Tree*> newTrees = buildTrees(dists, treeNum, newCt); //also creates .all.tre file containing the trees created
          
          if (m->control_pressed) { return 0; }
          
@@ -613,7 +644,7 @@ int UnifracUnweightedCommand::getConsensusTrees(vector< vector<double> >& dists,
  }
  /**************************************************************************************************/
  
-vector<Tree*> UnifracUnweightedCommand::buildTrees(vector< vector<double> >& dists, int treeNum, TreeMap& mytmap) {
+vector<Tree*> UnifracUnweightedCommand::buildTrees(vector< vector<double> >& dists, int treeNum, CountTable& myct) {
         try {
          
          vector<Tree*> trees;
@@ -647,9 +678,8 @@ vector<Tree*> UnifracUnweightedCommand::buildTrees(vector< vector<double> >& dis
                         }
              
              //create tree
-            Tree* tempTree = new Tree(&mytmap, sims);
-            map<string, string> empty;
-            tempTree->assembleTree(empty);
+            Tree* tempTree = new Tree(&myct, sims);
+            tempTree->assembleTree();
              
              trees.push_back(tempTree);
              
diff --git a/unifracunweightedcommand.h b/unifracunweightedcommand.h

index 15c3b9684b01fa58403276b80864684f99e8d189..107083fa41a3dc80f922a74ce6c95c18201511a2 100644 (file)
--- a/unifracunweightedcommand.h
+++ b/unifracunweightedcommand.h
@@ -12,7 +12,7 @@
  
  #include "command.hpp"
  #include "unweighted.h"
-#include "treemap.h"
+#include "counttable.h"
  #include "sharedutilities.h"
  #include "fileoutput.h"
  #include "readtree.h"
@@ -39,7 +39,7 @@ class UnifracUnweightedCommand : public Command {
         private:
                 FileOutput* output;
                 vector<Tree*> T;           //user trees
-               TreeMap* tmap;
+               CountTable* ct;
                 string sumFile, allGroups;
                 vector<string> groupComb; // AB. AC, BC...
                 int iters, numGroups, numComp, counter, processors, subsampleSize, subsampleIters;
@@ -50,7 +50,7 @@ class UnifracUnweightedCommand : public Command {
                 vector< map<float, float> > rCumul;  //map <unweighted score, cumulative percentage of number of random trees with that score or higher.> -vector entry for each combination.
                 
                 bool abort, phylip, random, includeRoot, consensus, subsample;
-               string groups, itersString, outputDir, outputForm, treefile, groupfile, namefile;
+               string groups, itersString, outputDir, outputForm, treefile, groupfile, namefile, countfile;
                 vector<string> Groups, outputNames; //holds groups to be used
  
                 ofstream outSum, out;
@@ -60,7 +60,7 @@ class UnifracUnweightedCommand : public Command {
                 void printUWSummaryFile(int);
                 void printUnweightedFile();
                 void createPhylipFile(int);
-        vector<Tree*> buildTrees(vector< vector<double> >&, int, TreeMap&);
+        vector<Tree*> buildTrees(vector< vector<double> >&, int, CountTable&);
          int getConsensusTrees(vector< vector<double> >&, int);
          int getAverageSTDMatrices(vector< vector<double> >&, int);
                 
diff --git a/unifracweightedcommand.cpp b/unifracweightedcommand.cpp

index d1e883382890081495dac72238cd05b3c5ad644e..cbec7490bfe028ea02de0b4a1e024b1ce2d89e1f 100644 (file)
--- a/unifracweightedcommand.cpp
+++ b/unifracweightedcommand.cpp
@@ -16,8 +16,9 @@
  vector<string> UnifracWeightedCommand::setParameters(){        
         try {
                 CommandParameter ptree("tree", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptree);
-               CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
-               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
                 CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
@@ -42,7 +43,7 @@ vector<string> UnifracWeightedCommand::setParameters(){
  string UnifracWeightedCommand::getHelpString(){        
         try {
                 string helpString = "";
-               helpString += "The unifrac.weighted command parameters are tree, group, name, groups, iters, distance, processors, root, subsample, consensus and random.  tree parameter is required unless you have valid current tree file.\n";
+               helpString += "The unifrac.weighted command parameters are tree, group, name, count, groups, iters, distance, processors, root, subsample, consensus and random.  tree parameter is required unless you have valid current tree file.\n";
                 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed.  You must enter at least 2 valid groups.\n";
                 helpString += "The group names are separated by dashes.  The iters parameter allows you to specify how many random trees you would like compared to your tree.\n";
                 helpString += "The distance parameter allows you to create a distance file from the results. The default is false.\n";
@@ -164,6 +165,14 @@ UnifracWeightedCommand::UnifracWeightedCommand(string option) {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
                                 }
+                
+                it = parameters.find("count");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["count"] = inputDir + it->second;            }
+                               }
                         }
                         
                         //check for required parameters
@@ -186,6 +195,19 @@ UnifracWeightedCommand::UnifracWeightedCommand(string option) {
                         else if (namefile == "not found") { namefile = ""; }
                         else { m->setNameFile(namefile); }
                         
+            countfile = validParameter.validFile(parameters, "count", true);
+                       if (countfile == "not open") { countfile = ""; abort = true; }
+                       else if (countfile == "not found") { countfile = "";  } 
+                       else { m->setCountTableFile(countfile); }
+            
+            if ((namefile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
+            }
+                       
+            if ((groupfile != "") && (countfile != "")) {
+                m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
+            }
+
                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(treefile);       }
                         
                                                                                                                                         
@@ -233,14 +255,22 @@ UnifracWeightedCommand::UnifracWeightedCommand(string option) {
                         consensus = m->isTrue(temp);
              
                         if (subsample && random) {  m->mothurOut("[ERROR]: random must be false, if subsample=t.\n"); abort=true;  } 
-                       if (subsample && (groupfile == "")) {  m->mothurOut("[ERROR]: if subsample=t, a group file must be provided.\n"); abort=true;  } 
+                       if (countfile == "") { if (subsample && (groupfile == "")) {  m->mothurOut("[ERROR]: if subsample=t, a group file must be provided.\n"); abort=true;  } }
+            else {  
+                CountTable testCt; 
+                if ((!testCt.testGroups(countfile)) && (subsample)) {
+                    m->mothurOut("[ERROR]: if subsample=t, a count file with group info must be provided.\n"); abort=true;  
+                }
+            }
              if (subsample && (!phylip)) { phylip=true; outputForm = "lt"; }
              if (consensus && (!subsample)) { m->mothurOut("[ERROR]: you cannot use consensus without subsample.\n"); abort=true; }
              
-                       if (namefile == "") {
-                               vector<string> files; files.push_back(treefile);
-                               parser.getNameFile(files);
-                       }
+                       if (countfile=="") {
+                if (namefile == "") {
+                    vector<string> files; files.push_back(treefile);
+                    parser.getNameFile(files);
+                } 
+            }
                 }
                 
                 
@@ -258,14 +288,14 @@ int UnifracWeightedCommand::execute() {
                 
                 m->setTreeFile(treefile);
                 
-        TreeReader* reader = new TreeReader(treefile, groupfile, namefile);
+        TreeReader* reader;
+        if (countfile == "") { reader = new TreeReader(treefile, groupfile, namefile); }
+        else { reader = new TreeReader(treefile, countfile); }
          T = reader->getTrees();
-        tmap = T[0]->getTreeMap();
-        map<string, string> nameMap = reader->getNames();
-        map<string, string> unique2Dup = reader->getNameMap();
+        ct = T[0]->getCountTable();
          delete reader;
-    
-        if (m->control_pressed) {  delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } return 0; }
+        
+        if (m->control_pressed) {  delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } return 0; }
                                 
                 sumFile = outputDir + m->getSimpleName(treefile) + getOutputFileNameTag("wsummary");
                 m->openOutputFile(sumFile, outSum);
@@ -274,11 +304,11 @@ int UnifracWeightedCommand::execute() {
          SharedUtil util;
                 string s; //to make work with setgroups
                 Groups = m->getGroups();
-               vector<string> nameGroups = tmap->getNamesOfGroups();
+               vector<string> nameGroups = ct->getNamesOfGroups();
                 util.setGroups(Groups, nameGroups, s, numGroups, "weighted");   //sets the groups the user wants to analyze
                 m->setGroups(Groups);
                 
-        if (m->control_pressed) {  delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } return 0; }
+        if (m->control_pressed) {  delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } return 0; }
          
                 Weighted weighted(includeRoot);
                         
@@ -289,10 +319,9 @@ int UnifracWeightedCommand::execute() {
              //user has not set size, set size = smallest samples size
              if (subsampleSize == -1) { 
                  vector<string> temp; temp.push_back(Groups[0]);
-                subsampleSize = (tmap->getNamesSeqs(temp)).size(); //num in first group
+                subsampleSize = ct->getGroupCount(Groups[0]); //num in first group
                  for (int i = 1; i < Groups.size(); i++) {
-                    temp.clear(); temp.push_back(Groups[i]);
-                    int thisSize = (tmap->getNamesSeqs(temp)).size();
+                    int thisSize = ct->getGroupCount(Groups[i]);
                      if (thisSize < subsampleSize) {    subsampleSize = thisSize;       }
                  }
                  m->mothurOut("\nSetting subsample size to " + toString(subsampleSize) + ".\n\n");
@@ -300,12 +329,10 @@ int UnifracWeightedCommand::execute() {
                  vector<string> newGroups = Groups;
                  Groups.clear();
                  for (int i = 0; i < newGroups.size(); i++) {
-                    vector<string> thisGroup; thisGroup.push_back(newGroups[i]);
-                    vector<string> thisGroupsSeqs = tmap->getNamesSeqs(thisGroup);
-                    int thisSize = thisGroupsSeqs.size();
+                    int thisSize = ct->getGroupCount(newGroups[i]);
                      
                      if (thisSize >= subsampleSize) {    Groups.push_back(newGroups[i]);        }
-                    else {  m->mothurOut("You have selected a size that is larger than "+newGroups[i]+" number of sequences, removing "+newGroups[i]+".\n"); }
+                    else {   m->mothurOut("You have selected a size that is larger than "+newGroups[i]+" number of sequences, removing "+newGroups[i]+".\n"); }
                  } 
                  m->setGroups(Groups);
              }
@@ -321,7 +348,7 @@ int UnifracWeightedCommand::execute() {
          //get weighted scores for users trees
          for (int i = 0; i < T.size(); i++) {
              
-            if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);  } return 0; }
+            if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) {    m->mothurRemove(outputNames[i]);  } return 0; }
              
              counter = 0;
              rScores.resize(numComp);  //data[0] = weightedscore AB, data[1] = weightedscore AC...
@@ -337,7 +364,7 @@ int UnifracWeightedCommand::execute() {
              } 
              
              userData = weighted.getValues(T[i], processors, outputDir); //userData[0] = weightedscore
-            if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) {   m->mothurRemove(outputNames[i]);  } return 0; }
+            if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) {     m->mothurRemove(outputNames[i]);  } return 0; }
              
              //save users score
              for (int s=0; s<numComp; s++) {
@@ -361,16 +388,12 @@ int UnifracWeightedCommand::execute() {
                  if (m->control_pressed) { break; }
                  
                  //copy to preserve old one - would do this in subsample but memory cleanup becomes messy.
-                TreeMap* newTmap = new TreeMap();
-                //newTmap->getCopy(*tmap);
-                
-                //SubSample sample;
-               //Tree* subSampleTree = sample.getSample(T[i], newTmap, nameMap, subsampleSize);
+                CountTable* newCt = new CountTable();
                  
                  //uses method of setting groups to doNotIncludeMe
                  SubSample sample;
-                Tree* subSampleTree = sample.getSample(T[i], tmap, newTmap, subsampleSize, unique2Dup);
-
+                Tree* subSampleTree = sample.getSample(T[i], ct, newCt, subsampleSize);
+               
                  //call new weighted function
                  vector<double> iterData; iterData.resize(numComp,0);
                  Weighted thisWeighted(includeRoot);
@@ -379,20 +402,20 @@ int UnifracWeightedCommand::execute() {
                  //save data to make ave dist, std dist
                  calcDistsTotals.push_back(iterData);
                  
-                delete newTmap;
+                delete newCt;
                  delete subSampleTree;
                  
                  if((thisIter+1) % 100 == 0){   m->mothurOut(toString(thisIter+1)); m->mothurOutEndLine();              }
              }
              
-            if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) {   m->mothurRemove(outputNames[i]);  } return 0; }
+            if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) {     m->mothurRemove(outputNames[i]);  } return 0; }
              
              if (subsample) {  getAverageSTDMatrices(calcDistsTotals, i); }
              if (consensus) {  getConsensusTrees(calcDistsTotals, i);  }
          }
          
                 
-               if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) {       m->mothurRemove(outputNames[i]);  } return 0;  }
+               if (m->control_pressed) { delete ct; for (int i = 0; i < T.size(); i++) { delete T[i]; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]);  } return 0;  }
                 
          if (phylip) {  createPhylipFile();             }
      
@@ -400,7 +423,7 @@ int UnifracWeightedCommand::execute() {
                 
                 //clear out users groups
                 m->clearGroups();
-               delete tmap; 
+               delete ct; 
                 for (int i = 0; i < T.size(); i++) { delete T[i]; }
                 
                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);  } return 0; }
@@ -557,9 +580,17 @@ int UnifracWeightedCommand::getConsensusTrees(vector< vector<double> >& dists, i
          //used in tree constructor 
          m->runParse = false;
          
-        //create treemap class from groupmap for tree class to use
-        TreeMap newTmap;
-        newTmap.makeSim(m->getGroups());
+        //create a count table with one entry per group (each group named after itself) for the tree class to use
+        CountTable newCt;
+        set<string> nameMap;
+        map<string, string> groupMap;
+        set<string> gps;
+        for (int i = 0; i < m->getGroups().size(); i++) { 
+            nameMap.insert(m->getGroups()[i]); 
+            gps.insert(m->getGroups()[i]); 
+            groupMap[m->getGroups()[i]] = m->getGroups()[i];
+        }
+        newCt.createTable(nameMap, groupMap, gps);
          
          //clear  old tree names if any
          m->Treenames.clear();
@@ -567,7 +598,7 @@ int UnifracWeightedCommand::getConsensusTrees(vector< vector<double> >& dists, i
          //fills globaldatas tree names
          m->Treenames = m->getGroups();
          
-        vector<Tree*> newTrees = buildTrees(dists, treeNum, newTmap); //also creates .all.tre file containing the trees created
+        vector<Tree*> newTrees = buildTrees(dists, treeNum, newCt); //also creates .all.tre file containing the trees created
          
          if (m->control_pressed) { return 0; }
          
@@ -593,7 +624,7 @@ int UnifracWeightedCommand::getConsensusTrees(vector< vector<double> >& dists, i
  }
  /**************************************************************************************************/
  
-vector<Tree*> UnifracWeightedCommand::buildTrees(vector< vector<double> >& dists, int treeNum, TreeMap& mytmap) {
+vector<Tree*> UnifracWeightedCommand::buildTrees(vector< vector<double> >& dists, int treeNum, CountTable& myct) {
         try {
          
          vector<Tree*> trees;
@@ -627,9 +658,8 @@ vector<Tree*> UnifracWeightedCommand::buildTrees(vector< vector<double> >& dists
                         }
  
              //create tree
-            Tree* tempTree = new Tree(&mytmap, sims);
-            map<string, string> empty;
-            tempTree->assembleTree(empty);
+            Tree* tempTree = new Tree(&myct, sims);
+            tempTree->assembleTree();
              
              trees.push_back(tempTree);
              
@@ -682,7 +712,7 @@ int UnifracWeightedCommand::runRandomCalcs(Tree* thisTree, vector<double> usersS
          
          //get scores for random trees
          for (int j = 0; j < iters; j++) {
-            
+            cout << j << endl; 
  #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
              if(processors == 1){
                  driver(thisTree,  namesOfGroupCombos, 0, namesOfGroupCombos.size(),  rScores);
@@ -693,7 +723,7 @@ int UnifracWeightedCommand::runRandomCalcs(Tree* thisTree, vector<double> usersS
              driver(thisTree, namesOfGroupCombos, 0, namesOfGroupCombos.size(), rScores);
  #endif
              
-            if (m->control_pressed) { delete tmap;  for (int i = 0; i < T.size(); i++) { delete T[i]; } delete output; outSum.close(); for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]);  } return 0; }
+            if (m->control_pressed) { delete ct;  for (int i = 0; i < T.size(); i++) { delete T[i]; } delete output; outSum.close(); for (int i = 0; i < outputNames.size(); i++) {    m->mothurRemove(outputNames[i]);  } return 0; }
              
              //report progress
              //                                 m->mothurOut("Iter: " + toString(j+1)); m->mothurOutEndLine();          
@@ -796,7 +826,7 @@ int UnifracWeightedCommand::createProcesses(Tree* t, vector< vector<string> > na
  /**************************************************************************************************/
  int UnifracWeightedCommand::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num, vector< vector<double> >& scores) { 
   try {
-               Tree* randT = new Tree(tmap);
+               Tree* randT = new Tree(ct);
       
          Weighted weighted(includeRoot);
       
diff --git a/unifracweightedcommand.h b/unifracweightedcommand.h

index 06354ce5cbeed8dc7c1629a482c65d9381b40e04..fead41b0e8cb5dae95eed551ff76427b8c8d85f3 100644 (file)
--- a/unifracweightedcommand.h
+++ b/unifracweightedcommand.h
@@ -12,7 +12,7 @@
  
  #include "command.hpp"
  #include "weighted.h"
-#include "treemap.h"
+#include "counttable.h"
  #include "progress.hpp"
  #include "sharedutilities.h"
  #include "fileoutput.h"
@@ -43,7 +43,7 @@ class UnifracWeightedCommand : public Command {
                         linePair(int i, int j) : start(i), num(j) {}
                 };
                 vector<linePair> lines;
-        TreeMap* tmap;
+        CountTable* ct;
                 FileOutput* output;
                 vector<Tree*> T;           //user trees
                 vector<double> utreeScores;  //user tree unweighted scores
@@ -58,7 +58,7 @@ class UnifracWeightedCommand : public Command {
                 map<float, float>  validScores;  //map contains scores from random
                 
                 bool abort, phylip, random, includeRoot, subsample, consensus;
-               string groups, itersString, outputForm, treefile, groupfile, namefile;
+               string groups, itersString, outputForm, treefile, groupfile, namefile, countfile;
                 vector<string> Groups, outputNames; //holds groups to be used
                 int processors, subsampleSize, subsampleIters;
                 ofstream outSum;
@@ -73,7 +73,7 @@ class UnifracWeightedCommand : public Command {
                 int createProcesses(Tree*,  vector< vector<string> >,  vector< vector<double> >&);
                 int driver(Tree*, vector< vector<string> >, int, int,  vector< vector<double> >&);
          int runRandomCalcs(Tree*, vector<double>);
-        vector<Tree*> buildTrees(vector< vector<double> >&, int, TreeMap&);
+        vector<Tree*> buildTrees(vector< vector<double> >&, int, CountTable&);
          int getConsensusTrees(vector< vector<double> >&, int);
          int getAverageSTDMatrices(vector< vector<double> >&, int);
                 
diff --git a/unweighted.cpp b/unweighted.cpp

index 864a9f8bab16f5d1ea52480c37c141ddc934e199..e95834fb485fb6131fcacdb458e3f6c96b862693 100644 (file)
--- a/unweighted.cpp
+++ b/unweighted.cpp
@@ -16,7 +16,7 @@ EstOutput Unweighted::getValues(Tree* t, int p, string o) {
                 processors = p;
                 outputDir = o;
          
-        TreeMap* tmap = t->getTreeMap();
+        CountTable* ct = t->getCountTable();
          
                 //if the users enters no groups then give them the score of all groups
                 int numGroups = m->getNumGroups();
@@ -36,9 +36,9 @@ EstOutput Unweighted::getValues(Tree* t, int p, string o) {
                         vector<string> groups;
                         if (numGroups == 0) {
                                 //get score for all users groups
-                               for (int i = 0; i < (tmap->getNamesOfGroups()).size(); i++) {
-                                       if ((tmap->getNamesOfGroups())[i] != "xxx") {
-                                               groups.push_back((tmap->getNamesOfGroups())[i]);
+                               for (int i = 0; i < (ct->getNamesOfGroups()).size(); i++) {
+                                       if ((ct->getNamesOfGroups())[i] != "xxx") {
+                                               groups.push_back((ct->getNamesOfGroups())[i]);
                                         }
                                 }
                                 namesOfGroupCombos.push_back(groups);
@@ -52,7 +52,7 @@ EstOutput Unweighted::getValues(Tree* t, int p, string o) {
  
                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
                         if(processors == 1){
-                               data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap);
+                               data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), ct);
                         }else{
                                 int numPairs = namesOfGroupCombos.size();
                                 
@@ -67,11 +67,11 @@ EstOutput Unweighted::getValues(Tree* t, int p, string o) {
                 
                                         lines.push_back(linePair(startPos, numPairsPerProcessor));
                                 }
-                               data = createProcesses(t, namesOfGroupCombos, tmap);
+                               data = createProcesses(t, namesOfGroupCombos, ct);
                                 lines.clear();
                         }
                 #else
-                       data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap);
+                       data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), ct);
                 #endif
                 
                 return data;
@@ -83,7 +83,7 @@ EstOutput Unweighted::getValues(Tree* t, int p, string o) {
  }
  /**************************************************************************************************/
  
-EstOutput Unweighted::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos, TreeMap* tmap) {
+EstOutput Unweighted::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos, CountTable* ct) {
         try {
  #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
                 int process = 1;
@@ -100,7 +100,7 @@ EstOutput Unweighted::createProcesses(Tree* t, vector< vector<string> > namesOfG
                                 process++;
                         }else if (pid == 0){
                                 EstOutput myresults;
-                               myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, tmap);
+                               myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, ct);
                                 
                                 if (m->control_pressed) { exit(0); }
                                 
@@ -122,7 +122,7 @@ EstOutput Unweighted::createProcesses(Tree* t, vector< vector<string> > namesOfG
                         }
                 }
                 
-               results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, tmap);
+               results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, ct);
                 
                 //force parent to wait until all the processes are done
                 for (int i=0;i<(processors-1);i++) { 
@@ -167,7 +167,7 @@ EstOutput Unweighted::createProcesses(Tree* t, vector< vector<string> > namesOfG
         }
  }
  /**************************************************************************************************/
-EstOutput Unweighted::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num, TreeMap* tmap) { 
+EstOutput Unweighted::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num, CountTable* ct) { 
   try {
         
          
@@ -261,7 +261,7 @@ EstOutput Unweighted::getValues(Tree* t, string groupA, string groupB, int p, st
                 processors = p;
                 outputDir = o;
                 
-        TreeMap* tmap = t->getTreeMap();
+        CountTable* ct = t->getCountTable();
       
                 //if the users enters no groups then give them the score of all groups
                 int numGroups = m->getNumGroups();
@@ -281,9 +281,9 @@ EstOutput Unweighted::getValues(Tree* t, string groupA, string groupB, int p, st
                         vector<string> groups;
                         if (numGroups == 0) {
                                 //get score for all users groups
-                               for (int i = 0; i < (tmap->getNamesOfGroups()).size(); i++) {
-                                       if ((tmap->getNamesOfGroups())[i] != "xxx") {
-                                               groups.push_back((tmap->getNamesOfGroups())[i]);
+                               for (int i = 0; i < (ct->getNamesOfGroups()).size(); i++) {
+                                       if ((ct->getNamesOfGroups())[i] != "xxx") {
+                                               groups.push_back((ct->getNamesOfGroups())[i]);
                                         }
                                 }
                                 namesOfGroupCombos.push_back(groups);
@@ -297,7 +297,7 @@ EstOutput Unweighted::getValues(Tree* t, string groupA, string groupB, int p, st
  
                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
                         if(processors == 1){
-                               data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), true, tmap);
+                               data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), true, ct);
                         }else{
                                 int numPairs = namesOfGroupCombos.size();
                                 
@@ -311,12 +311,12 @@ EstOutput Unweighted::getValues(Tree* t, string groupA, string groupB, int p, st
                                         lines.push_back(linePair(startPos, numPairsPerProcessor));
                                 }
                                         
-                               data = createProcesses(t, namesOfGroupCombos, true, tmap);
+                               data = createProcesses(t, namesOfGroupCombos, true, ct);
                                 
                                 lines.clear();
                         }
                 #else
-                       data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), true, tmap);
+                       data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), true, ct);
                 #endif
         
                 return data;
@@ -328,7 +328,7 @@ EstOutput Unweighted::getValues(Tree* t, string groupA, string groupB, int p, st
  }
  /**************************************************************************************************/
  
-EstOutput Unweighted::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos, bool usingGroups, TreeMap* tmap) {
+EstOutput Unweighted::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos, bool usingGroups, CountTable* ct) {
         try {
  #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
                 int process = 1;
@@ -345,7 +345,7 @@ EstOutput Unweighted::createProcesses(Tree* t, vector< vector<string> > namesOfG
                                 process++;
                         }else if (pid == 0){
                                 EstOutput myresults;
-                               myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, usingGroups, tmap);
+                               myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, usingGroups, ct);
                                 
                                 if (m->control_pressed) { exit(0); }
                                 
@@ -365,7 +365,7 @@ EstOutput Unweighted::createProcesses(Tree* t, vector< vector<string> > namesOfG
                         }
                 }
                 
-               results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, usingGroups, tmap);
+               results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, usingGroups, ct);
                 
                 //force parent to wait until all the processes are done
                 for (int i=0;i<(processors-1);i++) { 
@@ -409,14 +409,14 @@ EstOutput Unweighted::createProcesses(Tree* t, vector< vector<string> > namesOfG
         }
  }
  /**************************************************************************************************/
-EstOutput Unweighted::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num, bool usingGroups, TreeMap* tmap) { 
+EstOutput Unweighted::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num, bool usingGroups, CountTable* ct) { 
   try {
                 
                 EstOutput results; results.resize(num);
                 
                 int count = 0;
                 
-               Tree* copyTree = new Tree(tmap);
+               Tree* copyTree = new Tree(ct);
                 
                 for (int h = start; h < (start+num); h++) {
                 
diff --git a/unweighted.h b/unweighted.h

index c6c13bb3a7abd14c824dfe6b495871790809833d..b136b007b517e56645be544ee6556f83a45de649 100644 (file)
--- a/unweighted.h
+++ b/unweighted.h
@@ -12,7 +12,7 @@
   */
  
  #include "treecalculator.h"
-#include "treemap.h"
+#include "counttable.h"
  
  /***********************************************************************/
  
@@ -38,10 +38,10 @@ class Unweighted : public TreeCalculator  {
                 map< vector<string>, set<int> > rootForGrouping;  //maps a grouping combo to the roots for that combo
                 bool includeRoot;
                 
-               EstOutput driver(Tree*, vector< vector<string> >, int, int, TreeMap*); 
-               EstOutput createProcesses(Tree*, vector< vector<string> >, TreeMap*);
-               EstOutput driver(Tree*, vector< vector<string> >, int, int, bool, TreeMap*); 
-               EstOutput createProcesses(Tree*, vector< vector<string> >, bool, TreeMap*);
+               EstOutput driver(Tree*, vector< vector<string> >, int, int, CountTable*); 
+               EstOutput createProcesses(Tree*, vector< vector<string> >, CountTable*);
+               EstOutput driver(Tree*, vector< vector<string> >, int, int, bool, CountTable*); 
+               EstOutput createProcesses(Tree*, vector< vector<string> >, bool, CountTable*);
                 int getRoot(Tree*, int, vector<string>);
  };
  
diff --git a/validparameter.cpp b/validparameter.cpp

index 3e1f3498ca0be8f0f010bd038273f3ea70f89d3c..7d1af2551c7a6717c1d3ea807af0e56de0fc218e 100644 (file)
--- a/validparameter.cpp
+++ b/validparameter.cpp
@@ -307,6 +307,14 @@ string ValidParameters::validFile(map<string, string>& container, string paramet
                                         
                                         if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ". I suspect you entered a column formatted file as a phylip file, aborting."); m->mothurOutEndLine(); return "not found"; }
                                 }
+                
+                //check for blank file
+                if (ableToOpen != 1) {
+                    if (m->isBlank(container[parameter])) {
+                        m->mothurOut("[ERROR]: " + container[parameter] + " is blank, aborting."); m->mothurOutEndLine(); return "not found"; 
+                    }
+                }
+                    
                         }
                 }else { return "not found"; }
                 
diff --git a/weighted.cpp b/weighted.cpp

index 85eed5207ff20d586bd999f670d59a4c8e840c67..cf1291dfa91af84e4d821541b2a8f43a169d64f7 100644 (file)
--- a/weighted.cpp
+++ b/weighted.cpp
@@ -19,7 +19,7 @@ EstOutput Weighted::getValues(Tree* t, int p, string o) {
                 processors = p;
                 outputDir = o;
          
-        TreeMap* tmap = t->getTreeMap();
+        CountTable* ct = t->getCountTable();
                 
                 numGroups = m->getNumGroups();
                 
@@ -38,7 +38,7 @@ EstOutput Weighted::getValues(Tree* t, int p, string o) {
                 
                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
                         if(processors == 1){
-                               data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap);
+                               data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), ct);
                         }else{
                                 int numPairs = namesOfGroupCombos.size();
                                 
@@ -52,12 +52,12 @@ EstOutput Weighted::getValues(Tree* t, int p, string o) {
                                         lines.push_back(linePair(startPos, numPairsPerProcessor));
                                 }
  
-                               data = createProcesses(t, namesOfGroupCombos, tmap);
+                               data = createProcesses(t, namesOfGroupCombos, ct);
                                 
                                 lines.clear();
                         }
                 #else
-                       data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), tmap);
+                       data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), ct);
                 #endif
                 
                 return data;
@@ -69,7 +69,7 @@ EstOutput Weighted::getValues(Tree* t, int p, string o) {
  }
  /**************************************************************************************************/
  
-EstOutput Weighted::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos, TreeMap* tmap) {
+EstOutput Weighted::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos, CountTable* ct) {
         try {
  #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
                 int process = 1;
@@ -87,7 +87,7 @@ EstOutput Weighted::createProcesses(Tree* t, vector< vector<string> > namesOfGro
                         }else if (pid == 0){
         
                                 EstOutput Myresults;
-                               Myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, tmap);
+                               Myresults = driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, ct);
                         
                                 //m->mothurOut("Merging results."); m->mothurOutEndLine();
                                 
@@ -110,7 +110,7 @@ EstOutput Weighted::createProcesses(Tree* t, vector< vector<string> > namesOfGro
                         }
                 }
         
-               results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, tmap);
+               results = driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, ct);
         
                 //force parent to wait until all the processes are done
                 for (int i=0;i<(processors-1);i++) { 
@@ -155,7 +155,7 @@ EstOutput Weighted::createProcesses(Tree* t, vector< vector<string> > namesOfGro
         }
  }
  /**************************************************************************************************/
-EstOutput Weighted::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num, TreeMap* tmap) { 
+EstOutput Weighted::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num, CountTable* ct) { 
   try {
                 EstOutput results;
                 vector<double> D;
@@ -179,7 +179,7 @@ EstOutput Weighted::driver(Tree* t, vector< vector<string> > namesOfGroupCombos,
                                 int numSeqsInGroupI = it->second;
                                 
                                 double sum = getLengthToRoot(t, t->groupNodeInfo[groupA][j], groupA, groupB);
-                               double weightedSum = ((numSeqsInGroupI * sum) / (double)tmap->seqsPerGroup[groupA]);
+                               double weightedSum = ((numSeqsInGroupI * sum) / (double)ct->getGroupCount(groupA));
                         
                                 D[count] += weightedSum;
                         }
@@ -190,7 +190,7 @@ EstOutput Weighted::driver(Tree* t, vector< vector<string> > namesOfGroupCombos,
                                 int numSeqsInGroupL = it->second;
                                 
                                 double sum = getLengthToRoot(t, t->groupNodeInfo[groupB][j], groupA, groupB);
-                               double weightedSum = ((numSeqsInGroupL * sum) / (double)tmap->seqsPerGroup[groupB]);
+                               double weightedSum = ((numSeqsInGroupL * sum) / (double)ct->getGroupCount(groupB));
                         
                                 D[count] += weightedSum;
                         }
@@ -216,7 +216,7 @@ EstOutput Weighted::driver(Tree* t, vector< vector<string> > namesOfGroupCombos,
                                 it = t->tree[i].pcount.find(groupA);
                                 //if it does u = # of its descendants with a certain group / total number in tree with a certain group
                                 if (it != t->tree[i].pcount.end()) {
-                                       u = (double) t->tree[i].pcount[groupA] / (double) tmap->seqsPerGroup[groupA];
+                                       u = (double) t->tree[i].pcount[groupA] / (double) ct->getGroupCount(groupA);
                                 }else { u = 0.00; }
                                 
                                 
@@ -225,7 +225,7 @@ EstOutput Weighted::driver(Tree* t, vector< vector<string> > namesOfGroupCombos,
                                 
                                 //if it does subtract their percentage from u
                                 if (it != t->tree[i].pcount.end()) {
-                                       u -= (double) t->tree[i].pcount[groupB] / (double) tmap->seqsPerGroup[groupB];
+                                       u -= (double) t->tree[i].pcount[groupB] / (double) ct->getGroupCount(groupB);
                                 }
                                 
                                 if (includeRoot) {
@@ -270,7 +270,7 @@ EstOutput Weighted::getValues(Tree* t, string groupA, string groupB) {
                 
                 data.clear(); //clear out old values
       
-        TreeMap* tmap = t->getTreeMap();
+        CountTable* ct = t->getCountTable();
                 
                 if (m->control_pressed) { return data; }
                 
@@ -287,7 +287,7 @@ EstOutput Weighted::getValues(Tree* t, string groupA, string groupB) {
                         int numSeqsInGroupI = it->second;
                         
                         double sum = getLengthToRoot(t, t->groupNodeInfo[groups[0]][j], groups[0], groups[1]);
-                       double weightedSum = ((numSeqsInGroupI * sum) / (double)tmap->seqsPerGroup[groups[0]]);
+                       double weightedSum = ((numSeqsInGroupI * sum) / (double)ct->getGroupCount(groups[0]));
                 
                         D += weightedSum;
                 }
@@ -298,7 +298,7 @@ EstOutput Weighted::getValues(Tree* t, string groupA, string groupB) {
                         int numSeqsInGroupL = it->second;
                         
                         double sum = getLengthToRoot(t, t->groupNodeInfo[groups[1]][j], groups[0], groups[1]);
-                       double weightedSum = ((numSeqsInGroupL * sum) / (double)tmap->seqsPerGroup[groups[1]]);
+                       double weightedSum = ((numSeqsInGroupL * sum) / (double)ct->getGroupCount(groups[1]));
                 
                         D += weightedSum;
                 }
@@ -314,7 +314,7 @@ EstOutput Weighted::getValues(Tree* t, string groupA, string groupB) {
                         it = t->tree[i].pcount.find(groupA);
                         //if it does u = # of its descendants with a certain group / total number in tree with a certain group
                         if (it != t->tree[i].pcount.end()) {
-                               u = (double) t->tree[i].pcount[groupA] / (double) tmap->seqsPerGroup[groupA];
+                               u = (double) t->tree[i].pcount[groupA] / (double) ct->getGroupCount(groupA);
                         }else { u = 0.00; }
                         
                         
@@ -322,7 +322,7 @@ EstOutput Weighted::getValues(Tree* t, string groupA, string groupB) {
                         it = t->tree[i].pcount.find(groupB);
                         //if it does subtract their percentage from u
                         if (it != t->tree[i].pcount.end()) {
-                               u -= (double) t->tree[i].pcount[groupB] / (double) tmap->seqsPerGroup[groupB];
+                               u -= (double) t->tree[i].pcount[groupB] / (double) ct->getGroupCount(groupB);
                         }
                         
                         if (includeRoot) {
diff --git a/weighted.h b/weighted.h

index 180409ce2dbad5da67b570a7ca4a3666c61ae3b3..d4082fe9c82a7aeeb41aeaad62f016d0ec588d4d 100644 (file)
--- a/weighted.h
+++ b/weighted.h
@@ -12,7 +12,7 @@
   */
  
  #include "treecalculator.h"
-#include "treemap.h"
+#include "counttable.h"
  
  /***********************************************************************/
  
@@ -41,8 +41,8 @@ class Weighted : public TreeCalculator  {
                 map< vector<string>, set<int> > rootForGrouping;  //maps a grouping combo to the root for that combo
                 bool includeRoot;
                 
-               EstOutput driver(Tree*, vector< vector<string> >, int, int, TreeMap*); 
-               EstOutput createProcesses(Tree*, vector< vector<string> >, TreeMap*);
+               EstOutput driver(Tree*, vector< vector<string> >, int, int, CountTable*); 
+               EstOutput createProcesses(Tree*, vector< vector<string> >, CountTable*);
                 double getLengthToRoot(Tree*, int, string, string);
  };
  
diff --git a/weightedlinkage.cpp b/weightedlinkage.cpp

index 19c41ce555b87ed296e134bf6d403752a07b5159..c1e4d51be52b95b7eec757930532a8d231f07ea5 100644 (file)
--- a/weightedlinkage.cpp
+++ b/weightedlinkage.cpp
@@ -5,7 +5,6 @@
  #include "mothur.h"
  #include "cluster.hpp"
  #include "rabundvector.hpp"
-#include "sparsematrix.hpp"
  
  /* This class implements the WPGMA, weighted average neighbor clustering algorithm */
author	Pat Schloss <pschloss@umich.edu>
	Mon, 22 Oct 2012 13:18:06 +0000 (09:18 -0400)
committer	Pat Schloss <pschloss@umich.edu>
	Mon, 22 Oct 2012 13:18:06 +0000 (09:18 -0400)
Mothur.xcodeproj/project.pbxproj		patch \| blob \| history
abstractdecisiontree.cpp	[new file with mode: 0644]	patch \| blob
abstractdecisiontree.hpp	[new file with mode: 0755]	patch \| blob
abstractrandomforest.cpp	[new file with mode: 0644]	patch \| blob
abstractrandomforest.hpp	[new file with mode: 0755]	patch \| blob
aligncommand.cpp		patch \| blob \| history
alignnode.cpp	[new file with mode: 0755]	patch \| blob
alignnode.h	[new file with mode: 0755]	patch \| blob
aligntree.cpp	[new file with mode: 0755]	patch \| blob
aligntree.h	[new file with mode: 0755]	patch \| blob
bayesian.cpp		patch \| blob \| history
bayesian.h		patch \| blob \| history
binsequencecommand.cpp		patch \| blob \| history
binsequencecommand.h		patch \| blob \| history
chimeraperseuscommand.cpp		patch \| blob \| history
chimeraperseuscommand.h		patch \| blob \| history
chimeraslayercommand.cpp		patch \| blob \| history
chimeraslayercommand.h		patch \| blob \| history
chimerauchimecommand.cpp		patch \| blob \| history
chimerauchimecommand.h		patch \| blob \| history
chopseqscommand.cpp		patch \| blob \| history
chopseqscommand.h		patch \| blob \| history
classify.cpp		patch \| blob \| history
classify.h		patch \| blob \| history
classifyotucommand.cpp		patch \| blob \| history
classifyotucommand.h		patch \| blob \| history
classifyseqscommand.cpp		patch \| blob \| history
classifyseqscommand.h		patch \| blob \| history
classifysharedcommand.cpp	[new file with mode: 0755]	patch \| blob
classifysharedcommand.h	[new file with mode: 0755]	patch \| blob
classifytreecommand.cpp		patch \| blob \| history
classifytreecommand.h		patch \| blob \| history
clusterclassic.cpp		patch \| blob \| history
clusterclassic.h		patch \| blob \| history
clustercommand.cpp		patch \| blob \| history
clusterdoturcommand.cpp		patch \| blob \| history
clusterdoturcommand.h		patch \| blob \| history
clusterfragmentscommand.cpp		patch \| blob \| history
clusterfragmentscommand.h		patch \| blob \| history
clustersplitcommand.cpp		patch \| blob \| history
clustersplitcommand.h		patch \| blob \| history
commandfactory.cpp		patch \| blob \| history
consensus.cpp		patch \| blob \| history
consensusseqscommand.cpp		patch \| blob \| history
consensusseqscommand.h		patch \| blob \| history
countgroupscommand.cpp		patch \| blob \| history
countgroupscommand.h		patch \| blob \| history
countseqscommand.cpp		patch \| blob \| history
counttable.cpp		patch \| blob \| history
counttable.h		patch \| blob \| history
decisiontree.cpp	[new file with mode: 0644]	patch \| blob
decisiontree.hpp	[new file with mode: 0755]	patch \| blob
deconvolutecommand.cpp		patch \| blob \| history
deconvolutecommand.h		patch \| blob \| history
deuniquetreecommand.cpp		patch \| blob \| history
flowdata.cpp		patch \| blob \| history
getgroupscommand.cpp		patch \| blob \| history
getgroupscommand.h		patch \| blob \| history
getlineagecommand.cpp		patch \| blob \| history
getlineagecommand.h		patch \| blob \| history
getoturepcommand.cpp		patch \| blob \| history
getoturepcommand.h		patch \| blob \| history
getseqscommand.cpp		patch \| blob \| history
getseqscommand.h		patch \| blob \| history
groupmap.cpp		patch \| blob \| history
groupmap.h		patch \| blob \| history
hcluster.cpp		patch \| blob \| history
heatmapsimcommand.cpp		patch \| blob \| history
heatmapsimcommand.h		patch \| blob \| history
indicatorcommand.cpp		patch \| blob \| history
indicatorcommand.h		patch \| blob \| history
kmernode.cpp	[new file with mode: 0755]	patch \| blob
kmernode.h	[new file with mode: 0755]	patch \| blob
kmertree.cpp	[new file with mode: 0755]	patch \| blob
kmertree.h	[new file with mode: 0755]	patch \| blob
knn.cpp		patch \| blob \| history
listseqscommand.cpp		patch \| blob \| history
listseqscommand.h		patch \| blob \| history
macros.h	[new file with mode: 0755]	patch \| blob
makebiomcommand.cpp		patch \| blob \| history
makecontigscommand.cpp		patch \| blob \| history
makecontigscommand.h		patch \| blob \| history
makefile		patch \| blob \| history
mgclustercommand.cpp		patch \| blob \| history
mgclustercommand.h		patch \| blob \| history
mothur.h		patch \| blob \| history
mothurout.cpp		patch \| blob \| history
mothurout.h		patch \| blob \| history
parsefastaqcommand.cpp		patch \| blob \| history
parsefastaqcommand.h		patch \| blob \| history
parsimony.cpp		patch \| blob \| history
parsimony.h		patch \| blob \| history
parsimonycommand.cpp		patch \| blob \| history
parsimonycommand.h		patch \| blob \| history
pcrseqscommand.h		patch \| blob \| history
phylodiversitycommand.cpp		patch \| blob \| history
phylodiversitycommand.h		patch \| blob \| history
phylosummary.cpp		patch \| blob \| history
phylosummary.h		patch \| blob \| history
phylotree.cpp		patch \| blob \| history
phylotree.h		patch \| blob \| history
prcseqscommand.cpp		patch \| blob \| history
preclustercommand.cpp		patch \| blob \| history
preclustercommand.h		patch \| blob \| history
randomforest.cpp	[new file with mode: 0644]	patch \| blob
randomforest.hpp	[new file with mode: 0755]	patch \| blob
readcluster.cpp		patch \| blob \| history
readcluster.h		patch \| blob \| history
readmatrix.hpp		patch \| blob \| history
readtree.cpp		patch \| blob \| history
readtree.h		patch \| blob \| history
removegroupscommand.cpp		patch \| blob \| history
removegroupscommand.h		patch \| blob \| history
removelineagecommand.cpp		patch \| blob \| history
removelineagecommand.h		patch \| blob \| history
removerarecommand.cpp		patch \| blob \| history
removerarecommand.h		patch \| blob \| history
removeseqscommand.cpp		patch \| blob \| history
removeseqscommand.h		patch \| blob \| history
rftreenode.cpp	[new file with mode: 0644]	patch \| blob
rftreenode.hpp	[new file with mode: 0755]	patch \| blob
screenseqscommand.cpp		patch \| blob \| history
screenseqscommand.h		patch \| blob \| history
secondarystructurecommand.cpp		patch \| blob \| history
secondarystructurecommand.h		patch \| blob \| history
sensspeccommand.cpp		patch \| blob \| history
seqsummarycommand.cpp		patch \| blob \| history
seqsummarycommand.h		patch \| blob \| history
sequencecountparser.cpp	[new file with mode: 0644]	patch \| blob
sequencecountparser.h	[new file with mode: 0644]	patch \| blob
sequenceparser.cpp		patch \| blob \| history
sequenceparser.h		patch \| blob \| history
sffinfocommand.cpp		patch \| blob \| history
sffinfocommand.h		patch \| blob \| history
sffmultiplecommand.cpp	[new file with mode: 0644]	patch \| blob
sffmultiplecommand.h	[new file with mode: 0644]	patch \| blob
sharedcommand.cpp		patch \| blob \| history
sharedrabundvector.h		patch \| blob \| history
sharedutilities.cpp		patch \| blob \| history
shhhercommand.cpp		patch \| blob \| history
shhhercommand.h		patch \| blob \| history
sortseqscommand.cpp		patch \| blob \| history
sortseqscommand.h		patch \| blob \| history
sparsedistancematrix.cpp		patch \| blob \| history
splitabundcommand.cpp		patch \| blob \| history
splitabundcommand.h		patch \| blob \| history
splitgroupscommand.cpp		patch \| blob \| history
splitgroupscommand.h		patch \| blob \| history
splitmatrix.cpp		patch \| blob \| history
splitmatrix.h		patch \| blob \| history
subsample.cpp		patch \| blob \| history
subsample.h		patch \| blob \| history
subsamplecommand.cpp		patch \| blob \| history
subsamplecommand.h		patch \| blob \| history
summaryqualcommand.cpp		patch \| blob \| history
summaryqualcommand.h		patch \| blob \| history
summarytaxcommand.cpp		patch \| blob \| history
summarytaxcommand.h		patch \| blob \| history
taxonomynode.cpp	[new file with mode: 0755]	patch \| blob
taxonomynode.h	[new file with mode: 0755]	patch \| blob
tree.cpp		patch \| blob \| history
tree.h		patch \| blob \| history
treegroupscommand.cpp		patch \| blob \| history
treegroupscommand.h		patch \| blob \| history
treemap.cpp		patch \| blob \| history
treemap.h		patch \| blob \| history
treereader.cpp		patch \| blob \| history
treereader.h		patch \| blob \| history
trimflowscommand.cpp		patch \| blob \| history
trimoligos.cpp		patch \| blob \| history
trimoligos.h		patch \| blob \| history
trimseqscommand.cpp		patch \| blob \| history
trimseqscommand.h		patch \| blob \| history
unifracunweightedcommand.cpp		patch \| blob \| history
unifracunweightedcommand.h		patch \| blob \| history
unifracweightedcommand.cpp		patch \| blob \| history
unifracweightedcommand.h		patch \| blob \| history
unweighted.cpp		patch \| blob \| history
unweighted.h		patch \| blob \| history
validparameter.cpp		patch \| blob \| history
weighted.cpp		patch \| blob \| history
weighted.h		patch \| blob \| history
weightedlinkage.cpp		patch \| blob \| history