Merge remote-tracking branch 'mothur/master'

author Pat Schloss <pschloss@umich.edu>

Wed, 17 Jul 2013 18:52:16 +0000 (14:52 -0400)

committer Pat Schloss <pschloss@umich.edu>

Wed, 17 Jul 2013 18:52:16 +0000 (14:52 -0400)
author Pat Schloss <pschloss@umich.edu>
Wed, 17 Jul 2013 18:52:16 +0000 (14:52 -0400)
committer Pat Schloss <pschloss@umich.edu>
Wed, 17 Jul 2013 18:52:16 +0000 (14:52 -0400)
diff --git a/.gitignore b/.gitignore

index 853fd84a4e280293f81727c25057644c758cd2c0..fb4ae5123e7cdc9f64ab449b496415f8edd27733 100644 (file)
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,9 @@
-.DS_Store
-*.zip
+*.logfile
+*.o
  *.pbxproj
-*.xcuserdata
-\ No newline at end of file
+*.xcuserdata
+*.zip
+.DS_Store
+.idea
+build
+xcuserdata
diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj

index d7474058b9a98dc706b1af88db7303d84dc0a074..b09bab447e3307bc279189d5e5b5dae30a677e7a 100644 (file)
--- a/Mothur.xcodeproj/project.pbxproj
+++ b/Mothur.xcodeproj/project.pbxproj
@@ -20,6 +20,7 @@
                 A7128B1D16B7002A00723BE4 /* getdistscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7128B1C16B7002600723BE4 /* getdistscommand.cpp */; };
                 A713EBAC12DC7613000092AC /* readphylipvector.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A713EBAB12DC7613000092AC /* readphylipvector.cpp */; };
                 A713EBED12DC7C5E000092AC /* nmdscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A713EBEC12DC7C5E000092AC /* nmdscommand.cpp */; };
+               A7190B221768E0DF00A9AFA6 /* lefsecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7190B201768E0DF00A9AFA6 /* lefsecommand.cpp */; };
                 A71CB160130B04A2001E7287 /* anosimcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A71CB15E130B04A2001E7287 /* anosimcommand.cpp */; };
                 A71FE12C12EDF72400963CA7 /* mergegroupscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A71FE12B12EDF72400963CA7 /* mergegroupscommand.cpp */; };
                 A721765713BB9F7D0014DAAE /* referencedb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721765613BB9F7D0014DAAE /* referencedb.cpp */; };
@@ -32,11 +33,11 @@
                 A727864412E9E28C00F86ABA /* removerarecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A727864312E9E28C00F86ABA /* removerarecommand.cpp */; };
                 A7386C231619CCE600651424 /* classifysharedcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7386C211619CCE600651424 /* classifysharedcommand.cpp */; };
                 A7386C251619E52300651424 /* abstractdecisiontree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7386C241619E52200651424 /* abstractdecisiontree.cpp */; };
-               A7386C27161A0F9D00651424 /* abstractrandomforest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7386C26161A0F9C00651424 /* abstractrandomforest.cpp */; };
                 A7386C29161A110800651424 /* decisiontree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7386C28161A110700651424 /* decisiontree.cpp */; };
                 A73901081588C40900ED2ED6 /* loadlogfilecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73901071588C40900ED2ED6 /* loadlogfilecommand.cpp */; };
                 A73DDBBA13C4A0D1006AAE38 /* clearmemorycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73DDBB913C4A0D1006AAE38 /* clearmemorycommand.cpp */; };
                 A73DDC3813C4BF64006AAE38 /* mothurmetastats.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73DDC3713C4BF64006AAE38 /* mothurmetastats.cpp */; };
+               A741744C175CD9B1007DF49B /* makelefsecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A741744A175CD9B1007DF49B /* makelefsecommand.cpp */; };
                 A741FAD215D1688E0067BCC5 /* sequencecountparser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A741FAD115D1688E0067BCC5 /* sequencecountparser.cpp */; };
                 A7496D2E167B531B00CC7D7C /* kruskalwalliscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7496D2C167B531B00CC7D7C /* kruskalwalliscommand.cpp */; };
                 A74A9A9F148E881E00AB5E3E /* spline.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74A9A9E148E881E00AB5E3E /* spline.cpp */; };
@@ -53,6 +54,7 @@
                 A774104814696F320098E6AC /* myseqdist.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A774104614696F320098E6AC /* myseqdist.cpp */; };
                 A77410F614697C300098E6AC /* seqnoise.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77410F414697C300098E6AC /* seqnoise.cpp */; };
                 A778FE6B134CA6CA00C0BA33 /* getcommandinfocommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A778FE6A134CA6CA00C0BA33 /* getcommandinfocommand.cpp */; };
+               A77916E8176F7F7600EEFE18 /* designmap.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77916E6176F7F7600EEFE18 /* designmap.cpp */; };
                 A77A221F139001B600B0BE70 /* deuniquetreecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77A221E139001B600B0BE70 /* deuniquetreecommand.cpp */; };
                 A77B7185173D2240002163C2 /* sparcccommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77B7184173D2240002163C2 /* sparcccommand.cpp */; };
                 A77B7188173D4042002163C2 /* randomnumber.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77B7186173D4041002163C2 /* randomnumber.cpp */; };
@@ -77,6 +79,7 @@
                 A7C3DC0B14FE457500FE1924 /* cooccurrencecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7C3DC0914FE457500FE1924 /* cooccurrencecommand.cpp */; };
                 A7C3DC0F14FE469500FE1924 /* trialSwap2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7C3DC0D14FE469500FE1924 /* trialSwap2.cpp */; };
                 A7C7DAB915DA758B0059B0CF /* sffmultiplecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7C7DAB815DA758B0059B0CF /* sffmultiplecommand.cpp */; };
+               A7CFA4311755401800D9ED4D /* renameseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7CFA4301755401800D9ED4D /* renameseqscommand.cpp */; };
                 A7D755DA1535F679009BF21A /* treereader.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7D755D91535F679009BF21A /* treereader.cpp */; };
                 A7E0243D15B4520A00A5F046 /* sparsedistancematrix.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E0243C15B4520A00A5F046 /* sparsedistancematrix.cpp */; };
                 A7E6F69E17427D06006775E2 /* makelookupcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E6F69D17427D06006775E2 /* makelookupcommand.cpp */; };
@@ -417,6 +420,8 @@
                 A713EBAB12DC7613000092AC /* readphylipvector.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = readphylipvector.cpp; sourceTree = "<group>"; };
                 A713EBEB12DC7C5E000092AC /* nmdscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = nmdscommand.h; sourceTree = "<group>"; };
                 A713EBEC12DC7C5E000092AC /* nmdscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = nmdscommand.cpp; sourceTree = "<group>"; };
+               A7190B201768E0DF00A9AFA6 /* lefsecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = lefsecommand.cpp; sourceTree = "<group>"; };
+               A7190B211768E0DF00A9AFA6 /* lefsecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = lefsecommand.h; sourceTree = "<group>"; };
                 A71CB15E130B04A2001E7287 /* anosimcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = anosimcommand.cpp; sourceTree = "<group>"; };
                 A71CB15F130B04A2001E7287 /* anosimcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = anosimcommand.h; sourceTree = "<group>"; };
                 A71FE12A12EDF72400963CA7 /* mergegroupscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mergegroupscommand.h; sourceTree = "<group>"; };
@@ -438,7 +443,6 @@
                 A727864212E9E28C00F86ABA /* removerarecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = removerarecommand.h; sourceTree = "<group>"; };
                 A727864312E9E28C00F86ABA /* removerarecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = removerarecommand.cpp; sourceTree = "<group>"; };
                 A7386C1B1619CACB00651424 /* abstractdecisiontree.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = abstractdecisiontree.hpp; sourceTree = "<group>"; };
-               A7386C1C1619CACB00651424 /* abstractrandomforest.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = abstractrandomforest.hpp; sourceTree = "<group>"; };
                 A7386C1D1619CACB00651424 /* decisiontree.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = decisiontree.hpp; sourceTree = "<group>"; };
                 A7386C1E1619CACB00651424 /* macros.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = macros.h; sourceTree = "<group>"; };
                 A7386C1F1619CACB00651424 /* randomforest.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = randomforest.hpp; sourceTree = "<group>"; };
@@ -446,7 +450,6 @@
                 A7386C211619CCE600651424 /* classifysharedcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = classifysharedcommand.cpp; sourceTree = "<group>"; };
                 A7386C221619CCE600651424 /* classifysharedcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = classifysharedcommand.h; sourceTree = "<group>"; };
                 A7386C241619E52200651424 /* abstractdecisiontree.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = abstractdecisiontree.cpp; sourceTree = "<group>"; };
-               A7386C26161A0F9C00651424 /* abstractrandomforest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = abstractrandomforest.cpp; sourceTree = "<group>"; };
                 A7386C28161A110700651424 /* decisiontree.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decisiontree.cpp; sourceTree = "<group>"; };
                 A73901051588C3EF00ED2ED6 /* loadlogfilecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = loadlogfilecommand.h; sourceTree = "<group>"; };
                 A73901071588C40900ED2ED6 /* loadlogfilecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = loadlogfilecommand.cpp; sourceTree = "<group>"; };
@@ -454,6 +457,8 @@
                 A73DDBB913C4A0D1006AAE38 /* clearmemorycommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = clearmemorycommand.cpp; sourceTree = "<group>"; };
                 A73DDC3613C4BF64006AAE38 /* mothurmetastats.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mothurmetastats.h; sourceTree = "<group>"; };
                 A73DDC3713C4BF64006AAE38 /* mothurmetastats.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mothurmetastats.cpp; sourceTree = "<group>"; };
+               A741744A175CD9B1007DF49B /* makelefsecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = makelefsecommand.cpp; sourceTree = "<group>"; };
+               A741744B175CD9B1007DF49B /* makelefsecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = makelefsecommand.h; sourceTree = "<group>"; };
                 A741FAD115D1688E0067BCC5 /* sequencecountparser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sequencecountparser.cpp; sourceTree = "<group>"; };
                 A741FAD415D168A00067BCC5 /* sequencecountparser.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sequencecountparser.h; sourceTree = "<group>"; };
                 A7496D2C167B531B00CC7D7C /* kruskalwalliscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = kruskalwalliscommand.cpp; sourceTree = "<group>"; };
@@ -486,6 +491,8 @@
                 A77410F514697C300098E6AC /* seqnoise.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = seqnoise.h; sourceTree = "<group>"; };
                 A778FE69134CA6CA00C0BA33 /* getcommandinfocommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getcommandinfocommand.h; sourceTree = "<group>"; };
                 A778FE6A134CA6CA00C0BA33 /* getcommandinfocommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getcommandinfocommand.cpp; sourceTree = "<group>"; };
+               A77916E6176F7F7600EEFE18 /* designmap.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = designmap.cpp; sourceTree = "<group>"; };
+               A77916E7176F7F7600EEFE18 /* designmap.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = designmap.h; sourceTree = "<group>"; };
                 A77A221D139001B600B0BE70 /* deuniquetreecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = deuniquetreecommand.h; sourceTree = "<group>"; };
                 A77A221E139001B600B0BE70 /* deuniquetreecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = deuniquetreecommand.cpp; sourceTree = "<group>"; };
                 A77B7183173D222F002163C2 /* sparcccommand.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = sparcccommand.h; sourceTree = "<group>"; };
@@ -534,6 +541,8 @@
                 A7C3DC0E14FE469500FE1924 /* trialswap2.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = trialswap2.h; sourceTree = "<group>"; };
                 A7C7DAB615DA75760059B0CF /* sffmultiplecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sffmultiplecommand.h; sourceTree = "<group>"; };
                 A7C7DAB815DA758B0059B0CF /* sffmultiplecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sffmultiplecommand.cpp; sourceTree = "<group>"; };
+               A7CFA42F1755400500D9ED4D /* renameseqscommand.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = renameseqscommand.h; sourceTree = "<group>"; };
+               A7CFA4301755401800D9ED4D /* renameseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = renameseqscommand.cpp; sourceTree = "<group>"; };
                 A7D755D71535F665009BF21A /* treereader.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = treereader.h; sourceTree = "<group>"; };
                 A7D755D91535F679009BF21A /* treereader.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = treereader.cpp; sourceTree = "<group>"; };
                 A7DAAFA3133A254E003956EB /* commandparameter.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = commandparameter.h; sourceTree = "<group>"; };
@@ -1264,8 +1273,6 @@
                         children = (
                                 A7386C1B1619CACB00651424 /* abstractdecisiontree.hpp */,
                                 A7386C241619E52200651424 /* abstractdecisiontree.cpp */,
-                               A7386C1C1619CACB00651424 /* abstractrandomforest.hpp */,
-                               A7386C26161A0F9C00651424 /* abstractrandomforest.cpp */,
                                 A7386C1D1619CACB00651424 /* decisiontree.hpp */,
                                 A7386C28161A110700651424 /* decisiontree.cpp */,
                                 A7386C1E1619CACB00651424 /* macros.h */,
@@ -1401,8 +1408,6 @@
                                 A7E9B6F612D37EC400DA6239 /* getlabelcommand.cpp */,
                                 A7548FAB17142EA500B1F05A /* getmetacommunitycommand.h */,
                                 A7548FAC17142EBC00B1F05A /* getmetacommunitycommand.cpp */,
-                               A70056E8156A93E300924A2D /* getotulabelscommand.h */,
-                               A70056E5156A93D000924A2D /* getotulabelscommand.cpp */,
                                 A7E9B6F912D37EC400DA6239 /* getlineagecommand.h */,
                                 A7E9B6F812D37EC400DA6239 /* getlineagecommand.cpp */,
                                 A7E9B6FB12D37EC400DA6239 /* getlistcountcommand.h */,
@@ -1411,6 +1416,8 @@
                                 A7E9B6FE12D37EC400DA6239 /* getoturepcommand.cpp */,
                                 A7E9B70112D37EC400DA6239 /* getotuscommand.h */,
                                 A7E9B70012D37EC400DA6239 /* getotuscommand.cpp */,
+                               A70056E8156A93E300924A2D /* getotulabelscommand.h */,
+                               A70056E5156A93D000924A2D /* getotulabelscommand.cpp */,
                                 A7E9B70312D37EC400DA6239 /* getrabundcommand.h */,
                                 A7E9B70212D37EC400DA6239 /* getrabundcommand.cpp */,
                                 A7E9B70512D37EC400DA6239 /* getrelabundcommand.h */,
@@ -1435,6 +1442,8 @@
                                 A7E9B72B12D37EC400DA6239 /* indicatorcommand.cpp */,
                                 A7496D2D167B531B00CC7D7C /* kruskalwalliscommand.h */,
                                 A7496D2C167B531B00CC7D7C /* kruskalwalliscommand.cpp */,
+                               A7190B211768E0DF00A9AFA6 /* lefsecommand.h */,
+                               A7190B201768E0DF00A9AFA6 /* lefsecommand.cpp */,
                                 A7E9B73C12D37EC400DA6239 /* libshuffcommand.h */,
                                 A7E9B73B12D37EC400DA6239 /* libshuffcommand.cpp */,
                                 A7A0671C156294810095C8C5 /* listotulabelscommand.h */,
@@ -1453,6 +1462,8 @@
                                 A799F5B81309A3E000AEEFA0 /* makefastqcommand.cpp */,
                                 A7E9B74412D37EC400DA6239 /* makegroupcommand.h */,
                                 A7E9B74312D37EC400DA6239 /* makegroupcommand.cpp */,
+                               A741744B175CD9B1007DF49B /* makelefsecommand.h */,
+                               A741744A175CD9B1007DF49B /* makelefsecommand.cpp */,
                                 A7E6F69C17427CF2006775E2 /* makelookupcommand.h */,
                                 A7E6F69D17427D06006775E2 /* makelookupcommand.cpp */,
                                 A7E9B74A12D37EC400DA6239 /* matrixoutputcommand.h */,
@@ -1521,6 +1532,8 @@
                                 A727864312E9E28C00F86ABA /* removerarecommand.cpp */,
                                 A7E9B7CA12D37EC400DA6239 /* removeseqscommand.h */,
                                 A7E9B7C912D37EC400DA6239 /* removeseqscommand.cpp */,
+                               A7CFA42F1755400500D9ED4D /* renameseqscommand.h */,
+                               A7CFA4301755401800D9ED4D /* renameseqscommand.cpp */,
                                 A7E9B7CE12D37EC400DA6239 /* reversecommand.h */,
                                 A7E9B7CD12D37EC400DA6239 /* reversecommand.cpp */,
                                 A7E9B7D212D37EC400DA6239 /* screenseqscommand.h */,
@@ -1763,6 +1776,8 @@
                                 A7E9B6BD12D37EC400DA6239 /* database.cpp */,
                                 A7E9B6BE12D37EC400DA6239 /* database.hpp */,
                                 A7E9B6BF12D37EC400DA6239 /* datavector.hpp */,
+                               A77916E7176F7F7600EEFE18 /* designmap.h */,
+                               A77916E6176F7F7600EEFE18 /* designmap.cpp */,
                                 A7E9B6CE12D37EC400DA6239 /* distancedb.hpp */,
                                 A7E9B6CD12D37EC400DA6239 /* distancedb.cpp */,
                                 A7E9B6DE12D37EC400DA6239 /* fastamap.cpp */,
@@ -2346,7 +2361,6 @@
                                 A7C7DAB915DA758B0059B0CF /* sffmultiplecommand.cpp in Sources */,
                                 A7386C231619CCE600651424 /* classifysharedcommand.cpp in Sources */,
                                 A7386C251619E52300651424 /* abstractdecisiontree.cpp in Sources */,
-                               A7386C27161A0F9D00651424 /* abstractrandomforest.cpp in Sources */,
                                 A7386C29161A110800651424 /* decisiontree.cpp in Sources */,
                                 A77E1938161B201E00DB1A2A /* randomforest.cpp in Sources */,
                                 A77E193B161B289600DB1A2A /* rftreenode.cpp in Sources */,
@@ -2370,6 +2384,10 @@
                                 A77B7188173D4042002163C2 /* randomnumber.cpp in Sources */,
                                 A77B718B173D40E5002163C2 /* calcsparcc.cpp in Sources */,
                                 A7E6F69E17427D06006775E2 /* makelookupcommand.cpp in Sources */,
+                               A7CFA4311755401800D9ED4D /* renameseqscommand.cpp in Sources */,
+                               A741744C175CD9B1007DF49B /* makelefsecommand.cpp in Sources */,
+                               A7190B221768E0DF00A9AFA6 /* lefsecommand.cpp in Sources */,
+                               A77916E8176F7F7600EEFE18 /* designmap.cpp in Sources */,
                         );
                         runOnlyForDeploymentPostprocessing = 0;
                 };
@@ -2419,11 +2437,11 @@
                                 GCC_ENABLE_SSE3_EXTENSIONS = NO;
                                 GCC_ENABLE_SSE41_EXTENSIONS = NO;
                                 GCC_ENABLE_SSE42_EXTENSIONS = NO;
-                               GCC_OPTIMIZATION_LEVEL = 3;
+                               GCC_OPTIMIZATION_LEVEL = s;
                                 GCC_PREPROCESSOR_DEFINITIONS = (
                                         "MOTHUR_FILES=\"\\\"../../release\\\"\"",
-                                       "VERSION=\"\\\"1.29.2\\\"\"",
-                                       "RELEASE_DATE=\"\\\"2/12/2013\\\"\"",
+                                       "VERSION=\"\\\"1.31.0\\\"\"",
+                                       "RELEASE_DATE=\"\\\"5/24/2013\\\"\"",
                                 );
                                 "GCC_VERSION[arch=*]" = "";
                                 GCC_WARN_ABOUT_MISSING_NEWLINE = YES;
@@ -2456,10 +2474,10 @@
                                 GCC_C_LANGUAGE_STANDARD = gnu99;
                                 GCC_GENERATE_DEBUGGING_SYMBOLS = NO;
                                 GCC_MODEL_TUNING = "";
-                               GCC_OPTIMIZATION_LEVEL = 3;
+                               GCC_OPTIMIZATION_LEVEL = s;
                                 GCC_PREPROCESSOR_DEFINITIONS = (
-                                       "VERSION=\"\\\"1.30.0\\\"\"",
-                                       "RELEASE_DATE=\"\\\"4/01/2013\\\"\"",
+                                       "VERSION=\"\\\"1.31.0\\\"\"",
+                                       "RELEASE_DATE=\"\\\"5/24/2013\\\"\"",
                                 );
                                 GCC_WARN_ABOUT_MISSING_NEWLINE = YES;
                                 GCC_WARN_ABOUT_RETURN_TYPE = YES;
diff --git a/abstractdecisiontree.cpp b/abstractdecisiontree.cpp

index 085cd316225df92402d0a9fe86c623b45acdd553..595f483dc5ba6cde01b825f8ac44d9ceea4ece16 100644 (file)
--- a/abstractdecisiontree.cpp
+++ b/abstractdecisiontree.cpp
@@ -10,36 +10,39 @@
  
  /**************************************************************************************************/
  
-AbstractDecisionTree::AbstractDecisionTree(vector<vector<int> >baseDataSet, 
-                     vector<int> globalDiscardedFeatureIndices, 
-                     OptimumFeatureSubsetSelector optimumFeatureSubsetSelector, 
-                     string treeSplitCriterion) : baseDataSet(baseDataSet),
-numSamples((int)baseDataSet.size()),
-numFeatures((int)(baseDataSet[0].size() - 1)),
-numOutputClasses(0),
-rootNode(NULL),
-globalDiscardedFeatureIndices(globalDiscardedFeatureIndices),
-optimumFeatureSubsetSize(optimumFeatureSubsetSelector.getOptimumFeatureSubsetSize(numFeatures)),
-treeSplitCriterion(treeSplitCriterion) {
+AbstractDecisionTree::AbstractDecisionTree(vector<vector<int> >& baseDataSet,
+                                         vector<int> globalDiscardedFeatureIndices,
+                                         OptimumFeatureSubsetSelector optimumFeatureSubsetSelector, 
+                                         string treeSplitCriterion)
+
+                    : baseDataSet(baseDataSet),
+                    numSamples((int)baseDataSet.size()),
+                    numFeatures((int)(baseDataSet[0].size() - 1)),
+                    numOutputClasses(0),
+                    rootNode(NULL),
+                    nodeIdCount(0),
+                    globalDiscardedFeatureIndices(globalDiscardedFeatureIndices),
+                    optimumFeatureSubsetSize(optimumFeatureSubsetSelector.getOptimumFeatureSubsetSize(numFeatures)),
+                    treeSplitCriterion(treeSplitCriterion) {
  
      try {
-    // TODO: istead of calculating this for every DecisionTree
-    // clacualte this once in the RandomForest class and pass the values
-    m = MothurOut::getInstance();
-    for (int i = 0;  i < numSamples; i++) {
-        if (m->control_pressed) { break; }
-        int outcome = baseDataSet[i][numFeatures];
-        vector<int>::iterator it = find(outputClasses.begin(), outputClasses.end(), outcome);
-        if (it == outputClasses.end()){       // find() will return classes.end() if the element is not found
-            outputClasses.push_back(outcome);
-            numOutputClasses++;
+        // TODO: istead of calculating this for every DecisionTree
+        // clacualte this once in the RandomForest class and pass the values
+        m = MothurOut::getInstance();
+        for (int i = 0;  i < numSamples; i++) {
+            if (m->control_pressed) { break; }
+            int outcome = baseDataSet[i][numFeatures];
+            vector<int>::iterator it = find(outputClasses.begin(), outputClasses.end(), outcome);
+            if (it == outputClasses.end()){       // find() will return classes.end() if the element is not found
+                outputClasses.push_back(outcome);
+                numOutputClasses++;
+            }
+        }
+        
+        if (m->debug) {
+            //m->mothurOut("outputClasses = " + toStringVectorInt(outputClasses));
+            m->mothurOut("numOutputClasses = " + toString(numOutputClasses) + '\n');
          }
-    }
-    
-    if (m->debug) {
-        //m->mothurOut("outputClasses = " + toStringVectorInt(outputClasses));
-        m->mothurOut("numOutputClasses = " + toString(numOutputClasses) + '\n');
-    }
  
      }
         catch(exception& e) {
@@ -50,25 +53,38 @@ treeSplitCriterion(treeSplitCriterion) {
  /**************************************************************************************************/
  int AbstractDecisionTree::createBootStrappedSamples(){
      try {    
-    vector<bool> isInTrainingSamples(numSamples, false);
-    
-    for (int i = 0; i < numSamples; i++) {
-        if (m->control_pressed) { return 0; }
-        // TODO: optimize the rand() function call + double check if it's working properly
-        int randomIndex = rand() % numSamples;
-        bootstrappedTrainingSamples.push_back(baseDataSet[randomIndex]);
-        isInTrainingSamples[randomIndex] = true;
-    }
-    
-    for (int i = 0; i < numSamples; i++) {
-        if (m->control_pressed) { return 0; }
-        if (isInTrainingSamples[i]){ bootstrappedTrainingSampleIndices.push_back(i); }
-        else{
-            bootstrappedTestSamples.push_back(baseDataSet[i]);
-            bootstrappedTestSampleIndices.push_back(i);
+        vector<bool> isInTrainingSamples(numSamples, false);
+        
+        for (int i = 0; i < numSamples; i++) {
+            if (m->control_pressed) { return 0; }
+            // TODO: optimize the rand() function call + double check if it's working properly
+            int randomIndex = rand() % numSamples;
+            bootstrappedTrainingSamples.push_back(baseDataSet[randomIndex]);
+            isInTrainingSamples[randomIndex] = true;
          }
-    }
-    
+        
+        for (int i = 0; i < numSamples; i++) {
+            if (m->control_pressed) { return 0; }
+            if (isInTrainingSamples[i]){ bootstrappedTrainingSampleIndices.push_back(i); }
+            else{
+                bootstrappedTestSamples.push_back(baseDataSet[i]);
+                bootstrappedTestSampleIndices.push_back(i);
+            }
+        }
+        
+            // do the transpose of Test Samples
+        for (int i = 0; i < bootstrappedTestSamples[0].size(); i++) {
+            if (m->control_pressed) { return 0; }
+            
+            vector<int> tmpFeatureVector(bootstrappedTestSamples.size(), 0);
+            for (int j = 0; j < bootstrappedTestSamples.size(); j++) {
+                if (m->control_pressed) { return 0; }
+                
+                tmpFeatureVector[j] = bootstrappedTestSamples[j][i];
+            }
+            testSampleFeatureVectors.push_back(tmpFeatureVector);
+        }
+        
          return 0;
      }
         catch(exception& e) {
@@ -77,25 +93,34 @@ int AbstractDecisionTree::createBootStrappedSamples(){
         } 
  }
  /**************************************************************************************************/
-int AbstractDecisionTree::getMinEntropyOfFeature(vector<int> featureVector, vector<int> outputVector, double& minEntropy, int& featureSplitValue, double& intrinsicValue){
+int AbstractDecisionTree::getMinEntropyOfFeature(vector<int> featureVector,
+                                                 vector<int> outputVector,
+                                                 double& minEntropy,
+                                                 int& featureSplitValue,
+                                                 double& intrinsicValue){
      try {
  
-        vector< vector<int> > featureOutputPair(featureVector.size(), vector<int>(2, 0));
+        vector< pair<int, int> > featureOutputPair(featureVector.size(), pair<int, int>(0, 0));
+        
          for (int i = 0; i < featureVector.size(); i++) { 
              if (m->control_pressed) { return 0; }
-            featureOutputPair[i][0] = featureVector[i];
-            featureOutputPair[i][1] = outputVector[i];
+            
+            featureOutputPair[i].first = featureVector[i];
+            featureOutputPair[i].second = outputVector[i];
          }
-        // TODO: using default behavior to sort(), need to specify the comparator for added safety and compiler portability
-        sort(featureOutputPair.begin(), featureOutputPair.end());
+        // TODO: using default behavior to sort(), need to specify the comparator for added safety and compiler portability,
          
+        IntPairVectorSorter intPairVectorSorter;
+        sort(featureOutputPair.begin(), featureOutputPair.end(), intPairVectorSorter);
          
          vector<int> splitPoints;
-        vector<int> uniqueFeatureValues(1, featureOutputPair[0][0]);
+        vector<int> uniqueFeatureValues(1, featureOutputPair[0].first);
          
          for (int i = 0; i < featureOutputPair.size(); i++) {
+
              if (m->control_pressed) { return 0; }
-            int featureValue = featureOutputPair[i][0];
+            int featureValue = featureOutputPair[i].first;
+
              vector<int>::iterator it = find(uniqueFeatureValues.begin(), uniqueFeatureValues.end(), featureValue);
              if (it == uniqueFeatureValues.end()){                 // NOT FOUND
                  uniqueFeatureValues.push_back(featureValue);
@@ -115,7 +140,7 @@ int AbstractDecisionTree::getMinEntropyOfFeature(vector<int> featureVector, vect
              featureSplitValue = -1;                                                   // OUTPUT
          }else{
              getBestSplitAndMinEntropy(featureOutputPair, splitPoints, minEntropy, bestSplitIndex, intrinsicValue);  // OUTPUT
-            featureSplitValue = featureOutputPair[splitPoints[bestSplitIndex]][0];    // OUTPUT
+            featureSplitValue = featureOutputPair[splitPoints[bestSplitIndex]].first;    // OUTPUT
          }
          
          return 0;
@@ -146,8 +171,9 @@ double AbstractDecisionTree::calcIntrinsicValue(int numLessThanValueAtSplitPoint
         } 
  }
  /**************************************************************************************************/
-int AbstractDecisionTree::getBestSplitAndMinEntropy(vector< vector<int> > featureOutputPairs, vector<int> splitPoints,
-                               double& minEntropy, int& minEntropyIndex, double& relatedIntrinsicValue){
+
+int AbstractDecisionTree::getBestSplitAndMinEntropy(vector< pair<int, int> > featureOutputPairs, vector<int> splitPoints,
+                                                    double& minEntropy, int& minEntropyIndex, double& relatedIntrinsicValue){
      try {
          
          int numSamples = (int)featureOutputPairs.size();
@@ -155,16 +181,17 @@ int AbstractDecisionTree::getBestSplitAndMinEntropy(vector< vector<int> > featur
          vector<double> intrinsicValues;
          
          for (int i = 0; i < splitPoints.size(); i++) {
-             if (m->control_pressed) { return 0; }
+            if (m->control_pressed) { return 0; }
              int index = splitPoints[i];
-            int valueAtSplitPoint = featureOutputPairs[index][0];
+            int valueAtSplitPoint = featureOutputPairs[index].first;
+
              int numLessThanValueAtSplitPoint = 0;
              int numGreaterThanValueAtSplitPoint = 0;
              
              for (int j = 0; j < featureOutputPairs.size(); j++) {
-                 if (m->control_pressed) { return 0; }
-                vector<int> record = featureOutputPairs[j];
-                if (record[0] < valueAtSplitPoint){ numLessThanValueAtSplitPoint++; }
+                if (m->control_pressed) { return 0; }
+                pair<int, int> record = featureOutputPairs[j];
+                if (record.first < valueAtSplitPoint){ numLessThanValueAtSplitPoint++; }
                  else{ numGreaterThanValueAtSplitPoint++; }
              }
              
@@ -193,19 +220,19 @@ int AbstractDecisionTree::getBestSplitAndMinEntropy(vector< vector<int> > featur
  }
  /**************************************************************************************************/
  
-double AbstractDecisionTree::calcSplitEntropy(vector< vector<int> > featureOutputPairs, int splitIndex, int numOutputClasses, bool isUpperSplit = true) {
+double AbstractDecisionTree::calcSplitEntropy(vector< pair<int, int> > featureOutputPairs, int splitIndex, int numOutputClasses, bool isUpperSplit = true) {
      try {
          vector<int> classCounts(numOutputClasses, 0);
          
          if (isUpperSplit) { 
-            for (int i = 0; i < splitIndex; i++) { 
+            for (int i = 0; i < splitIndex; i++) {
                  if (m->control_pressed) { return 0; }
-                classCounts[featureOutputPairs[i][1]]++; 
+                classCounts[featureOutputPairs[i].second]++;
              }
          } else {
              for (int i = splitIndex; i < featureOutputPairs.size(); i++) { 
                  if (m->control_pressed) { return 0; }
-                classCounts[featureOutputPairs[i][1]]++; 
+                classCounts[featureOutputPairs[i].second]++;
              }
          }
          
@@ -245,8 +272,9 @@ int AbstractDecisionTree::getSplitPopulation(RFTreeNode* node, vector< vector<in
              if (m->control_pressed) { return 0; }
              vector<int> sample =  node->getBootstrappedTrainingSamples()[i];
              if (m->control_pressed) { return 0; }
-            if (sample[splitFeatureGlobalIndex] < node->getSplitFeatureValue()){ leftChildSamples.push_back(sample); }
-            else{ rightChildSamples.push_back(sample); }
+            
+            if (sample[splitFeatureGlobalIndex] < node->getSplitFeatureValue()) { leftChildSamples.push_back(sample); }
+            else { rightChildSamples.push_back(sample); }
          }
          
          return 0;
diff --git a/abstractdecisiontree.hpp b/abstractdecisiontree.hpp

index 3445db4a511e8705ed5b0bed14626eb869ce5043..cc238b4687e57c6cab840f5ec31ce10adbc8a655 100755 (executable)
--- a/abstractdecisiontree.hpp
+++ b/abstractdecisiontree.hpp
@@ -6,8 +6,8 @@
  //  Copyright (c) 2012 Schloss Lab. All rights reserved.
  //
  
-#ifndef rrf_fs_prototype_abstractdecisiontree_hpp
-#define rrf_fs_prototype_abstractdecisiontree_hpp
+#ifndef RF_ABSTRACTDECISIONTREE_HPP
+#define RF_ABSTRACTDECISIONTREE_HPP
  
  #include "mothurout.h"
  #include "macros.h"
@@ -17,14 +17,22 @@
  
  /**************************************************************************************************/
  
+struct IntPairVectorSorter{
+    bool operator() (const pair<int, int>& firstPair, const pair<int, int>& secondPair) {
+        return firstPair.first < secondPair.first;
+    }
+};
+
+/**************************************************************************************************/
+
  class AbstractDecisionTree{
    
  public:
    
-    AbstractDecisionTree(vector<vector<int> >baseDataSet, 
-                       vector<int> globalDiscardedFeatureIndices, 
-                       OptimumFeatureSubsetSelector optimumFeatureSubsetSelector, 
-                       string treeSplitCriterion);    
+    AbstractDecisionTree(vector<vector<int> >& baseDataSet,
+                           vector<int> globalDiscardedFeatureIndices, 
+                           OptimumFeatureSubsetSelector optimumFeatureSubsetSelector, 
+                           string treeSplitCriterion);    
      virtual ~AbstractDecisionTree(){}
      
    
@@ -32,23 +40,29 @@ protected:
    
      virtual int createBootStrappedSamples();
      virtual int getMinEntropyOfFeature(vector<int> featureVector, vector<int> outputVector, double& minEntropy, int& featureSplitValue, double& intrinsicValue);
-    virtual int getBestSplitAndMinEntropy(vector< vector<int> > featureOutputPairs, vector<int> splitPoints, double& minEntropy, int& minEntropyIndex, double& relatedIntrinsicValue);
+        virtual int getBestSplitAndMinEntropy(vector< pair<int, int> > featureOutputPairs, vector<int> splitPoints, double& minEntropy, int& minEntropyIndex, double& relatedIntrinsicValue);
      virtual double calcIntrinsicValue(int numLessThanValueAtSplitPoint, int numGreaterThanValueAtSplitPoint, int numSamples);
-    virtual double calcSplitEntropy(vector< vector<int> > featureOutputPairs, int splitIndex, int numOutputClasses, bool);
+    virtual double calcSplitEntropy(vector< pair<int, int> > featureOutputPairs, int splitIndex, int numOutputClasses, bool);
+
      virtual int getSplitPopulation(RFTreeNode* node, vector< vector<int> >& leftChildSamples, vector< vector<int> >& rightChildSamples);
      virtual bool checkIfAlreadyClassified(RFTreeNode* treeNode, int& outputClass);
  
-    vector< vector<int> > baseDataSet;
+    vector< vector<int> >& baseDataSet;
      int numSamples;
      int numFeatures;
      int numOutputClasses;
      vector<int> outputClasses;
+    
      vector< vector<int> > bootstrappedTrainingSamples;
      vector<int> bootstrappedTrainingSampleIndices;
      vector< vector<int> > bootstrappedTestSamples;
      vector<int> bootstrappedTestSampleIndices;
      
+    vector<vector<int> > testSampleFeatureVectors;
+    
      RFTreeNode* rootNode;
+    int nodeIdCount;
+    map<int, int> nodeMisclassificationCounts;
      vector<int> globalDiscardedFeatureIndices;
      int optimumFeatureSubsetSize;
      string treeSplitCriterion;
diff --git a/bayesian.cpp b/bayesian.cpp

index 6f65965bfb912e983d4de12dcbaa3dfe307e23e1..8278afb32e1d2c028a83ffdece17ed9910a78e20 100644 (file)
--- a/bayesian.cpp
+++ b/bayesian.cpp
@@ -145,6 +145,8 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) {
  
                                 //for each word
                                 for (int i = 0; i < numKmers; i++) {
+                    //m->mothurOut("[DEBUG]: kmer = " + toString(i) + "\n");
+                    
                                         if (m->control_pressed) {  break; }
                                         
                                         #ifdef USE_MPI
@@ -239,7 +241,9 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) {
                         }
                 }
                 
+        if (m->debug) { m->mothurOut("[DEBUG]: about to generateWordPairDiffArr\n"); }
                 generateWordPairDiffArr();
+        if (m->debug) { m->mothurOut("[DEBUG]: done generateWordPairDiffArr\n"); }
                 
                 //save probabilities
                 if (rdb->save) { rdb->wordGenusProb = wordGenusProb; rdb->WordPairDiffArr = WordPairDiffArr; }
diff --git a/calcsparcc.cpp b/calcsparcc.cpp

index 9d1d88a166566e0fa50e5217dfe4597ba2374fba..7fc61921cd8ff12fdfd9af41d82db20ee44ad72a 100644 (file)
--- a/calcsparcc.cpp
+++ b/calcsparcc.cpp
@@ -6,7 +6,7 @@
  //  Copyright (c) 2012 University of Michigan. All rights reserved.
  //
  
-#include "calcSparcc.h"
+#include "calcsparcc.h"
  #include "linearalgebra.h"
  
  /**************************************************************************************************/
diff --git a/classify.cpp b/classify.cpp

index 36179f471da4ac5d40b563e609feb2f8d3d32e6c..5f97c7ec693a97997554683188a013b1054eb3d7 100644 (file)
--- a/classify.cpp
+++ b/classify.cpp
@@ -309,16 +309,19 @@ int Classify::readTaxonomy(string file) {
          
          taxonomy.clear(); 
          m->readTax(file, taxonomy);
-        map<string, string> tempTaxonomy;
+        
+        //commented out to save time with large templates. 6/12/13
+        //map<string, string> tempTaxonomy;
          for (map<string, string>::iterator itTax = taxonomy.begin(); itTax != taxonomy.end(); itTax++) {  
-            if (m->inUsersGroups(itTax->first, names)) {
-                phyloTree->addSeqToTree(itTax->first, itTax->second); 
-                tempTaxonomy[itTax->first] = itTax->second;
-            }else {
-                m->mothurOut("[WARNING]: " + itTax->first + " is in your taxonomy file and not in your reference file, ignoring.\n");
-            }
+            //if (m->inUsersGroups(itTax->first, names)) {
+                phyloTree->addSeqToTree(itTax->first, itTax->second);
+            if (m->control_pressed) { break; }
+                //tempTaxonomy[itTax->first] = itTax->second;
+           // }else {
+            //    m->mothurOut("[WARNING]: " + itTax->first + " is in your taxonomy file and not in your reference file, ignoring.\n");
+            //}
          }
-        taxonomy = tempTaxonomy;
+        //taxonomy = tempTaxonomy;
  #endif 
                 phyloTree->assignHeirarchyIDs(0);
                 
@@ -340,21 +343,8 @@ int Classify::readTaxonomy(string file) {
  vector<string> Classify::parseTax(string tax) {
         try {
                 vector<string> taxons;
-               
-               tax = tax.substr(0, tax.length()-1);  //get rid of last ';'
-       
-               //parse taxonomy
-               string individual;
-               while (tax.find_first_of(';') != -1) {
-                       individual = tax.substr(0,tax.find_first_of(';'));
-                       tax = tax.substr(tax.find_first_of(';')+1, tax.length());
-                       taxons.push_back(individual);
-                       
-               }
-               //get last one
-               taxons.push_back(tax);
-               
-               return taxons;
+               m->splitAtChar(tax, taxons, ';');
+        return taxons;
         }
         catch(exception& e) {
                 m->errorOut(e, "Classify", "parseTax");
diff --git a/classifysharedcommand.cpp b/classifysharedcommand.cpp

index 2dc963babfc0ebea56cb0505e647667d1e964170..6e32fd1e800023348e4a58eeaad678f262430d25 100755 (executable)
--- a/classifysharedcommand.cpp
+++ b/classifysharedcommand.cpp
@@ -20,6 +20,14 @@ vector<string> ClassifySharedCommand::setParameters(){
          CommandParameter potupersplit("otupersplit", "Multiple", "log2-squareroot", "log2", "", "", "","",false,false); parameters.push_back(potupersplit);
          CommandParameter psplitcriteria("splitcriteria", "Multiple", "gainratio-infogain", "gainratio", "", "", "","",false,false); parameters.push_back(psplitcriteria);
                 CommandParameter pnumtrees("numtrees", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pnumtrees);
+        
+            // parameters related to pruning
+        CommandParameter pdopruning("prune", "Boolean", "", "T", "", "", "", "", false, false); parameters.push_back(pdopruning);
+        CommandParameter ppruneaggrns("pruneaggressiveness", "Number", "", "0.9", "", "", "", "", false, false); parameters.push_back(ppruneaggrns);
+        CommandParameter pdiscardhetrees("discarderrortrees", "Boolean", "", "T", "", "", "", "", false, false); parameters.push_back(pdiscardhetrees);
+        CommandParameter phetdiscardthreshold("errorthreshold", "Number", "", "0.4", "", "", "", "", false, false); parameters.push_back(phetdiscardthreshold);
+        CommandParameter psdthreshold("stdthreshold", "Number", "", "0.0", "", "", "", "", false, false); parameters.push_back(psdthreshold);
+            // pruning params end
  
          CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups);
                 CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel);
@@ -81,6 +89,7 @@ ClassifySharedCommand::ClassifySharedCommand() {
      exit(1);
    }
  }
+
  //**********************************************************************************************************************
  ClassifySharedCommand::ClassifySharedCommand(string option) {
    try {
@@ -104,7 +113,6 @@ ClassifySharedCommand::ClassifySharedCommand(string option) {
        for (it = parameters.begin(); it != parameters.end(); it++) {
          if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
        }
-        
          vector<string> tempOutNames;
          outputTypes["summary"] = tempOutNames;
        
@@ -130,7 +138,6 @@ ClassifySharedCommand::ClassifySharedCommand(string option) {
          }
          
        }
-       
          //check for parameters
          //get shared file, it is required
        sharedfile = validParameter.validFile(parameters, "shared", true);
@@ -158,25 +165,51 @@ ClassifySharedCommand::ClassifySharedCommand(string option) {
          outputDir = m->hasPath(sharedfile); //if user entered a file with a path then preserve it
        }
        
-    
          // NEW CODE for OTU per split selection criteria
-      otupersplit = validParameter.validFile(parameters, "otupersplit", false);
-      if (otupersplit == "not found") { otupersplit = "log2"; }
-      if ((otupersplit == "squareroot") || (otupersplit == "log2")) {
-        optimumFeatureSubsetSelectionCriteria = otupersplit;
-      }else { m->mothurOut("Not a valid OTU per split selection method. Valid OTU per split selection methods are 'log2' and 'squareroot'."); m->mothurOutEndLine(); abort = true; }
-      
-        // splitcriteria
-      splitcriteria = validParameter.validFile(parameters, "splitcriteria", false);
-      if (splitcriteria == "not found") { splitcriteria = "gainratio"; }
-      if ((splitcriteria == "gainratio") || (splitcriteria == "infogain")) {
-        treeSplitCriterion = splitcriteria;
-      }else { m->mothurOut("Not a valid tree splitting criterio. Valid tree splitting criteria are 'gainratio' and 'infogain'."); m->mothurOutEndLine(); abort = true; }
-      
-      
-      string temp = validParameter.validFile(parameters, "numtrees", false); if (temp == "not found"){ temp = "100";   }
-      m->mothurConvert(temp, numDecisionTrees);
-
+        string temp = validParameter.validFile(parameters, "splitcriteria", false);
+        if (temp == "not found") { temp = "gainratio"; }
+        if ((temp == "gainratio") || (temp == "infogain")) {
+            treeSplitCriterion = temp;
+        } else { m->mothurOut("Not a valid tree splitting criterio. Valid tree splitting criteria are 'gainratio' and 'infogain'.");
+            m->mothurOutEndLine();
+            abort = true;
+        }
+        
+        temp = validParameter.validFile(parameters, "numtrees", false); if (temp == "not found"){      temp = "100";   }
+        m->mothurConvert(temp, numDecisionTrees);
+        
+            // parameters for pruning
+        temp = validParameter.validFile(parameters, "prune", false);
+        if (temp == "not found") { temp = "f"; }
+        doPruning = m->isTrue(temp);
+        
+        temp = validParameter.validFile(parameters, "pruneaggressiveness", false);
+        if (temp == "not found") { temp = "0.9"; }
+        m->mothurConvert(temp, pruneAggressiveness);
+        
+        temp = validParameter.validFile(parameters, "discarderrortrees", false);
+        if (temp == "not found") { temp = "f"; }
+        discardHighErrorTrees = m->isTrue(temp);
+        
+        temp = validParameter.validFile(parameters, "errorthreshold", false);
+        if (temp == "not found") { temp = "0.4"; }
+        m->mothurConvert(temp, highErrorTreeDiscardThreshold);
+        
+        temp = validParameter.validFile(parameters, "otupersplit", false);
+        if (temp == "not found") { temp = "log2"; }
+        if ((temp == "squareroot") || (temp == "log2")) {
+            optimumFeatureSubsetSelectionCriteria = temp;
+        } else { m->mothurOut("Not a valid OTU per split selection method. Valid OTU per split selection methods are 'log2' and 'squareroot'.");
+            m->mothurOutEndLine();
+            abort = true;
+        }
+        
+        temp = validParameter.validFile(parameters, "stdthreshold", false);
+        if (temp == "not found") { temp = "0.0"; }
+        m->mothurConvert(temp, featureStandardDeviationThreshold);
+                        
+            // end of pruning params
+        
          //Groups must be checked later to make sure they are valid. SharedUtilities has functions of check the validity, just make to so m->setGroups() after the checks.  If you are using these with a shared file no need to check the SharedRAbundVector class will call SharedUtilites for you, kinda nice, huh?
        string groups = validParameter.validFile(parameters, "groups", false);
        if (groups == "not found") { groups = ""; }
@@ -235,7 +268,6 @@ int ClassifySharedCommand::execute() {
          for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
          lookup = input.getSharedRAbundVectors(lastLabel);
          m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
-           
          processSharedAndDesignData(lookup);        
          
          processedLabels.insert(lookup[0]->getLabel());
@@ -339,7 +371,8 @@ void ClassifySharedCommand::processSharedAndDesignData(vector<SharedRAbundVector
              dataSet[i][j] = treatmentToIntMap[treatmentName];
          }
          
-        RandomForest randomForest(dataSet, numDecisionTrees, treeSplitCriterion);
+        RandomForest randomForest(dataSet, numDecisionTrees, treeSplitCriterion, doPruning, pruneAggressiveness, discardHighErrorTrees, highErrorTreeDiscardThreshold, optimumFeatureSubsetSelectionCriteria, featureStandardDeviationThreshold);
+        
          randomForest.populateDecisionTrees();
          randomForest.calcForrestErrorRate();
          
diff --git a/classifysharedcommand.h b/classifysharedcommand.h

index fe0637494db78aa96ecbac56c2f82b08ba2f9c6d..7a6907f47cc2030d9cab822db6c34b95c72e1504 100755 (executable)
--- a/classifysharedcommand.h
+++ b/classifysharedcommand.h
@@ -20,12 +20,11 @@ public:
    
    vector<string> setParameters();
    string getCommandName()                      { return "classify.shared";     }
-   string getCommandCategory()         { return "OTU-Based Approaches";                }
-  
+  string getCommandCategory()          { return "OTU-Based Approaches";                }  
    string getHelpString();      
-    string getOutputPattern(string);
+  string getOutputPattern(string);
    string getCitation() { return "http://www.mothur.org/wiki/Classify.shared\n"; }
-  string getDescription()              { return "description"; }
+  string getDescription()              { return "implements the random forest machine learning algorithm to identify OTUs that can be used to differentiate between various groups of samples"; }
    int execute();
    
    void help() { m->mothurOut(getHelpString()); }
@@ -35,7 +34,7 @@ private:
      string outputDir;
      vector<string> outputNames, Groups;
    
-    string sharedfile, designfile, otupersplit, splitcriteria;
+    string sharedfile, designfile;
      set<string> labels;
      bool allLines;
    
diff --git a/cluster.hpp b/cluster.hpp

index dff55e61fcf7b571da5c0b895a5f46910f58f8c6..26a01b9389778fea2e2b1067a1738c75027ad7f2 100644 (file)
--- a/cluster.hpp
+++ b/cluster.hpp
@@ -1,7 +1,7 @@
  #ifndef CLUSTER_H
  #define CLUSTER_H
  
-
+//test change
  
  #include "mothur.h"
  #include "sparsedistancematrix.h"
diff --git a/clusterfragmentscommand.cpp b/clusterfragmentscommand.cpp

index f17c11e7384330c4b9eb4d5af86f734fc0d0f21f..9c1408c9a112dcc3ad9ed579026e2528c334a102 100644 (file)
--- a/clusterfragmentscommand.cpp
+++ b/clusterfragmentscommand.cpp
@@ -181,7 +181,7 @@ ClusterFragmentsCommand::ClusterFragmentsCommand(string option) {
              countfile = validParameter.validFile(parameters, "count", true);
                         if (countfile == "not open") { abort = true; countfile = ""; }  
                         else if (countfile == "not found") { countfile = ""; }
-                       else { ct.readTable(countfile, false); m->setCountTableFile(countfile); }
+                       else { ct.readTable(countfile, true); m->setCountTableFile(countfile); }
                         
              if ((countfile != "") && (namefile != "")) { m->mothurOut("When executing a cluster.fragments command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
                         
diff --git a/commandfactory.cpp b/commandfactory.cpp

index 387ee10833323300a66a0e6d1161a64bff6fb9ee..41a67297c4199ab819ef7f0498dfd27e6debd515 100644 (file)
--- a/commandfactory.cpp
+++ b/commandfactory.cpp
@@ -144,6 +144,10 @@
  #include "getmetacommunitycommand.h"
  #include "sparcccommand.h"
  #include "makelookupcommand.h"
+#include "renameseqscommand.h"
+#include "makelefsecommand.h"
+#include "lefsecommand.h"
+#include "kruskalwalliscommand.h"
  
  /*******************************************************/
  
@@ -311,6 +315,10 @@ CommandFactory::CommandFactory(){
      commands["get.metacommunity"]   = "get.metacommunity";
      commands["sparcc"]              = "sparcc";
      commands["make.lookup"]         = "make.lookup";
+    commands["rename.seqs"]         = "rename.seqs";
+    commands["make.lefse"]          = "make.lefse";
+    commands["lefse"]               = "lefse";
+    commands["kruskal.wallis"]      = "kruskal.wallis";
      
  
  }
@@ -534,6 +542,10 @@ Command* CommandFactory::getCommand(string commandName, string optionString){
          else if(commandName == "get.metacommunity")     {      command = new GetMetaCommunityCommand(optionString);        }
          else if(commandName == "sparcc")                {      command = new SparccCommand(optionString);                  }
          else if(commandName == "make.lookup")                  {       command = new MakeLookupCommand(optionString);                          }
+        else if(commandName == "rename.seqs")                  {       command = new RenameSeqsCommand(optionString);                          }
+        else if(commandName == "make.lefse")                   {       command = new MakeLefseCommand(optionString);                           }
+        else if(commandName == "lefse")                 {      command = new LefseCommand(optionString);                   }
+        else if(commandName == "kruskal.wallis")        {      command = new KruskalWallisCommand(optionString);           }
                 else                                                                                    {       command = new NoCommand(optionString);                                          }
  
                 return command;
@@ -698,6 +710,10 @@ Command* CommandFactory::getCommand(string commandName, string optionString, str
          else if(commandName == "get.metacommunity")     {      pipecommand = new GetMetaCommunityCommand(optionString);        }
          else if(commandName == "sparcc")                {      pipecommand = new SparccCommand(optionString);                  }
          else if(commandName == "make.lookup")                  {       pipecommand = new MakeLookupCommand(optionString);                              }
+        else if(commandName == "rename.seqs")                  {       pipecommand = new RenameSeqsCommand(optionString);                              }
+        else if(commandName == "make.lefse")                   {       pipecommand = new MakeLefseCommand(optionString);                               }
+        else if(commandName == "lefse")                 {      pipecommand = new LefseCommand(optionString);                   }
+        else if(commandName == "kruskal.wallis")        {      pipecommand = new KruskalWallisCommand(optionString);           }
                 else                                                                                    {       pipecommand = new NoCommand(optionString);                                              }
  
                 return pipecommand;
@@ -848,6 +864,10 @@ Command* CommandFactory::getCommand(string commandName){
          else if(commandName == "get.metacommunity")     {      shellcommand = new GetMetaCommunityCommand();       }
          else if(commandName == "sparcc")                {      shellcommand = new SparccCommand();                 }
          else if(commandName == "make.lookup")                  {       shellcommand = new MakeLookupCommand();                         }
+        else if(commandName == "rename.seqs")                  {       shellcommand = new RenameSeqsCommand();                         }
+        else if(commandName == "make.lefse")                   {       shellcommand = new MakeLefseCommand();                          }
+        else if(commandName == "lefse")                 {      shellcommand = new LefseCommand();                  }
+        else if(commandName == "kruskal.wallis")        {      shellcommand = new KruskalWallisCommand();          }
                 else                                                                                    {       shellcommand = new NoCommand();                                         }
  
                 return shellcommand;
diff --git a/createdatabasecommand.cpp b/createdatabasecommand.cpp

index 58799e7a33e840580654d006b7a085ec02e7c5f5..eab7c8261a55e45abe3f762f86cc46e59c9822a6 100644 (file)
--- a/createdatabasecommand.cpp
+++ b/createdatabasecommand.cpp
@@ -380,8 +380,8 @@ int CreateDatabaseCommand::execute(){
                  if (countfile == "") {
                      sort(binNames.begin(), binNames.end());
                      bin = "";
-                    for (int i = 0; i < binNames.size()-1; i++) {
-                        bin += binNames[i] + ',';
+                    for (int j = 0; j < binNames.size()-1; j++) {
+                        bin += binNames[j] + ',';
                      }
                      bin += binNames[binNames.size()-1];
                      map<string, string>::iterator it = repNames.find(bin);
@@ -743,7 +743,7 @@ vector<SharedRAbundVector*> CreateDatabaseCommand::getShared(){
          return lookup;
      }
         catch(exception& e) {
-               m->errorOut(e, "CreateDatabaseCommand", "getList");
+               m->errorOut(e, "CreateDatabaseCommand", "getShared");
                 exit(1);
         }
  }
diff --git a/decisiontree.cpp b/decisiontree.cpp

index 99853f303c3562e41c43909948276dd82e0f9c6d..1fd29240c156049f61abd9e9bc1f66132275d16c 100644 (file)
--- a/decisiontree.cpp
+++ b/decisiontree.cpp
@@ -8,13 +8,18 @@
  
  #include "decisiontree.hpp"
  
-DecisionTree::DecisionTree(vector< vector<int> > baseDataSet,
-             vector<int> globalDiscardedFeatureIndices,
-             OptimumFeatureSubsetSelector optimumFeatureSubsetSelector,
-             string treeSplitCriterion) : AbstractDecisionTree(baseDataSet,
-                       globalDiscardedFeatureIndices,
-                       optimumFeatureSubsetSelector,
-                       treeSplitCriterion), variableImportanceList(numFeatures, 0){
+DecisionTree::DecisionTree(vector< vector<int> >& baseDataSet,
+                           vector<int> globalDiscardedFeatureIndices,
+                           OptimumFeatureSubsetSelector optimumFeatureSubsetSelector,
+                           string treeSplitCriterion,
+                           float featureStandardDeviationThreshold)
+            : AbstractDecisionTree(baseDataSet,
+                                   globalDiscardedFeatureIndices,
+                                   optimumFeatureSubsetSelector,
+                                   treeSplitCriterion),
+            variableImportanceList(numFeatures, 0),
+            featureStandardDeviationThreshold(featureStandardDeviationThreshold) {
+                
      try {
          m = MothurOut::getInstance();
          createBootStrappedSamples();
@@ -28,41 +33,57 @@ DecisionTree::DecisionTree(vector< vector<int> > baseDataSet,
  
  /***********************************************************************/
  
-int DecisionTree::calcTreeVariableImportanceAndError() {
+int DecisionTree::calcTreeVariableImportanceAndError(int& numCorrect, double& treeErrorRate) {
      try {
+        vector< vector<int> > randomlySampledTestData(bootstrappedTestSamples.size(), vector<int>(bootstrappedTestSamples[0].size(), 0));
          
-        int numCorrect;
-        double treeErrorRate;
-        calcTreeErrorRate(numCorrect, treeErrorRate);
+            // TODO: is is possible to further speed up the following O(N^2) by using std::copy?
+        for (int i = 0; i < bootstrappedTestSamples.size(); i++) {
+            for (int j = 0; j < bootstrappedTestSamples[i].size(); j++) {
+                randomlySampledTestData[i][j] = bootstrappedTestSamples[i][j];
+            }
+        }
          
-        if (m->control_pressed) {return 0; }
-                
          for (int i = 0; i < numFeatures; i++) {
-            if (m->control_pressed) {return 0; }
-            // NOTE: only shuffle the features, never shuffle the output vector
-            // so i = 0 and i will be alwaays <= (numFeatures - 1) as the index at numFeatures will denote
-            // the feature vector
-            vector< vector<int> > randomlySampledTestData = randomlyShuffleAttribute(bootstrappedTestSamples, i);
+            if (m->control_pressed) { return 0; }
              
-            int numCorrectAfterShuffle = 0;
-            for (int j = 0; j < randomlySampledTestData.size(); j++) {
-                if (m->control_pressed) {return 0; }
-                vector<int> shuffledSample = randomlySampledTestData[j];
-                int actualSampleOutputClass = shuffledSample[numFeatures];
-                int predictedSampleOutputClass = evaluateSample(shuffledSample);
-                if (actualSampleOutputClass == predictedSampleOutputClass) { numCorrectAfterShuffle++; }
+                // if the index is in globalDiscardedFeatureIndices (i.e, null feature) we don't want to shuffle them
+            vector<int>::iterator it = find(globalDiscardedFeatureIndices.begin(), globalDiscardedFeatureIndices.end(), i);
+            if (it == globalDiscardedFeatureIndices.end()) {        // NOT FOUND
+                // if the standard deviation is very low, we know it's not a good feature at all
+                // we can save some time here by discarding that feature
+                
+                vector<int> featureVector = testSampleFeatureVectors[i];
+                if (m->getStandardDeviation(featureVector) > featureStandardDeviationThreshold) {
+                    // NOTE: only shuffle the features, never shuffle the output vector
+                    // so i = 0 and i will be alwaays <= (numFeatures - 1) as the index at numFeatures will denote
+                    // the feature vector
+                    randomlyShuffleAttribute(bootstrappedTestSamples, i, i - 1, randomlySampledTestData);
+
+                    int numCorrectAfterShuffle = 0;
+                    for (int j = 0; j < randomlySampledTestData.size(); j++) {
+                        if (m->control_pressed) {return 0; }
+                        
+                        vector<int> shuffledSample = randomlySampledTestData[j];
+                        int actualSampleOutputClass = shuffledSample[numFeatures];
+                        int predictedSampleOutputClass = evaluateSample(shuffledSample);
+                        if (actualSampleOutputClass == predictedSampleOutputClass) { numCorrectAfterShuffle++; }
+                    }
+                    variableImportanceList[i] += (numCorrect - numCorrectAfterShuffle);
+                }
              }
-            variableImportanceList[i] += (numCorrect - numCorrectAfterShuffle);
          }
          
          // TODO: do we need to save the variableRanks in the DecisionTree, do we need it later?
-        vector< vector<int> > variableRanks;
+        vector< pair<int, int> > variableRanks;
+        
          for (int i = 0; i < variableImportanceList.size(); i++) {
              if (m->control_pressed) {return 0; }
              if (variableImportanceList[i] > 0) {
                  // TODO: is there a way to optimize the follow line's code?
-                vector<int> variableRank(2, 0);
-                variableRank[0] = i; variableRank[1] = variableImportanceList[i];
+                pair<int, int> variableRank(0, 0);
+                variableRank.first = i;
+                variableRank.second = variableImportanceList[i];
                  variableRanks.push_back(variableRank);
              }
          }
@@ -84,8 +105,10 @@ int DecisionTree::evaluateSample(vector<int> testSample) {
      try {
          RFTreeNode *node = rootNode;
          while (true) {
-            if (m->control_pressed) {return 0; }
+            if (m->control_pressed) { return 0; }
+            
              if (node->checkIsLeaf()) { return node->getOutputClass(); }
+            
              int sampleSplitFeatureValue = testSample[node->getSplitFeatureIndex()];
              if (sampleSplitFeatureValue < node->getSplitFeatureValue()) { node = node->getLeftChildNode(); }
              else { node = node->getRightChildNode(); } 
@@ -101,8 +124,8 @@ int DecisionTree::evaluateSample(vector<int> testSample) {
  /***********************************************************************/
  
  int DecisionTree::calcTreeErrorRate(int& numCorrect, double& treeErrorRate){
+    numCorrect = 0;
      try {
-        numCorrect = 0;
          for (int i = 0; i < bootstrappedTestSamples.size(); i++) {
               if (m->control_pressed) {return 0; }
              
@@ -128,35 +151,44 @@ int DecisionTree::calcTreeErrorRate(int& numCorrect, double& treeErrorRate){
  }
  
  /***********************************************************************/
-
  // TODO: optimize the algo, instead of transposing two time, we can extarct the feature,
  // shuffle it and then re-insert in the original place, thus iproving runnting time
  //This function randomize abundances for a given OTU/feature.
-vector< vector<int> > DecisionTree::randomlyShuffleAttribute(vector< vector<int> > samples, int featureIndex) {
+
+void DecisionTree::randomlyShuffleAttribute(const vector< vector<int> >& samples,
+                               const int featureIndex,
+                               const int prevFeatureIndex,
+                               vector< vector<int> >& shuffledSample) {
      try {
          // NOTE: we need (numFeatures + 1) featureVecotors, the last extra vector is actually outputVector
-        vector< vector<int> > shuffledSample = samples;
-        vector<int> featureVectors(samples.size(), 0);
          
+        // restore previously shuffled feature
+        if (prevFeatureIndex > -1) {
+            for (int j = 0; j < samples.size(); j++) {
+                if (m->control_pressed) { return; }
+                shuffledSample[j][prevFeatureIndex] = samples[j][prevFeatureIndex];
+            }
+        }
+        
+        // now do the shuffling
+        vector<int> featureVectors(samples.size(), 0);
          for (int j = 0; j < samples.size(); j++) {
-            if (m->control_pressed) { return shuffledSample; }
+            if (m->control_pressed) { return; }
              featureVectors[j] = samples[j][featureIndex];
          }
-        
          random_shuffle(featureVectors.begin(), featureVectors.end());
-
          for (int j = 0; j < samples.size(); j++) {
-            if (m->control_pressed) {return shuffledSample; }
+            if (m->control_pressed) { return; }
              shuffledSample[j][featureIndex] = featureVectors[j];
          }
-        
-        return shuffledSample;
      }
         catch(exception& e) {
-               m->errorOut(e, "DecisionTree", "randomlyShuffleAttribute");
+        m->errorOut(e, "DecisionTree", "randomlyShuffleAttribute");
                 exit(1);
-       } 
+       }
+    
  }
+
  /***********************************************************************/
  
  int DecisionTree::purgeTreeNodesDataRecursively(RFTreeNode* treeNode) {
@@ -181,10 +213,12 @@ int DecisionTree::purgeTreeNodesDataRecursively(RFTreeNode* treeNode) {
  void DecisionTree::buildDecisionTree(){
      try {
      
-    int generation = 0;
-    rootNode = new RFTreeNode(bootstrappedTrainingSamples, globalDiscardedFeatureIndices, numFeatures, numSamples, numOutputClasses, generation);
-    
-    splitRecursively(rootNode);
+        int generation = 0;
+        rootNode = new RFTreeNode(bootstrappedTrainingSamples, globalDiscardedFeatureIndices, numFeatures, numSamples, numOutputClasses, generation, nodeIdCount, featureStandardDeviationThreshold);
+        nodeIdCount++;
+        
+        splitRecursively(rootNode);
+        
          }
         catch(exception& e) {
                 m->errorOut(e, "DecisionTree", "buildDecisionTree");
@@ -210,24 +244,32 @@ int DecisionTree::splitRecursively(RFTreeNode* rootNode) {
              rootNode->setOutputClass(classifiedOutputClass);
              return 0;
          }
-        if (m->control_pressed) {return 0;}
+        if (m->control_pressed) { return 0; }
          vector<int> featureSubsetIndices = selectFeatureSubsetRandomly(globalDiscardedFeatureIndices, rootNode->getLocalDiscardedFeatureIndices());
+        
+            // TODO: need to check if the value is actually copied correctly
          rootNode->setFeatureSubsetIndices(featureSubsetIndices);
-        if (m->control_pressed) {return 0;}
+        if (m->control_pressed) { return 0; }
        
          findAndUpdateBestFeatureToSplitOn(rootNode);
          
-        if (m->control_pressed) {return 0;}
+        // update rootNode outputClass, this is needed for pruning
+        // this is only for internal nodes
+        updateOutputClassOfNode(rootNode);
+        
+        if (m->control_pressed) { return 0; }
          
          vector< vector<int> > leftChildSamples;
          vector< vector<int> > rightChildSamples;
          getSplitPopulation(rootNode, leftChildSamples, rightChildSamples);
          
-        if (m->control_pressed) {return 0;}
+        if (m->control_pressed) { return 0; }
          
          // TODO: need to write code to clear this memory
-        RFTreeNode* leftChildNode = new RFTreeNode(leftChildSamples, globalDiscardedFeatureIndices, numFeatures, (int)leftChildSamples.size(), numOutputClasses, rootNode->getGeneration() + 1);
-        RFTreeNode* rightChildNode = new RFTreeNode(rightChildSamples, globalDiscardedFeatureIndices, numFeatures, (int)rightChildSamples.size(), numOutputClasses, rootNode->getGeneration() + 1);
+        RFTreeNode* leftChildNode = new RFTreeNode(leftChildSamples, globalDiscardedFeatureIndices, numFeatures, (int)leftChildSamples.size(), numOutputClasses, rootNode->getGeneration() + 1, nodeIdCount, featureStandardDeviationThreshold);
+        nodeIdCount++;
+        RFTreeNode* rightChildNode = new RFTreeNode(rightChildSamples, globalDiscardedFeatureIndices, numFeatures, (int)rightChildSamples.size(), numOutputClasses, rootNode->getGeneration() + 1, nodeIdCount, featureStandardDeviationThreshold);
+        nodeIdCount++;
          
          rootNode->setLeftChildNode(leftChildNode);
          leftChildNode->setParentNode(rootNode);
@@ -237,7 +279,7 @@ int DecisionTree::splitRecursively(RFTreeNode* rootNode) {
          
          // TODO: This recursive split can be parrallelized later
          splitRecursively(leftChildNode);
-        if (m->control_pressed) {return 0;}
+        if (m->control_pressed) { return 0; }
          
          splitRecursively(rightChildNode);
          return 0;
@@ -254,11 +296,11 @@ int DecisionTree::findAndUpdateBestFeatureToSplitOn(RFTreeNode* node){
      try {
  
          vector< vector<int> > bootstrappedFeatureVectors = node->getBootstrappedFeatureVectors();
-        if (m->control_pressed) {return 0;}
+        if (m->control_pressed) { return 0; }
          vector<int> bootstrappedOutputVector = node->getBootstrappedOutputVector();
-        if (m->control_pressed) {return 0;}
+        if (m->control_pressed) { return 0; }
          vector<int> featureSubsetIndices = node->getFeatureSubsetIndices();
-        if (m->control_pressed) {return 0;}
+        if (m->control_pressed) { return 0; }
          
          vector<double> featureSubsetEntropies;
          vector<int> featureSubsetSplitValues;
@@ -266,7 +308,7 @@ int DecisionTree::findAndUpdateBestFeatureToSplitOn(RFTreeNode* node){
          vector<double> featureSubsetGainRatios;
          
          for (int i = 0; i < featureSubsetIndices.size(); i++) {
-            if (m->control_pressed) {return 0;}
+            if (m->control_pressed) { return 0; }
              
              int tryIndex = featureSubsetIndices[i];
                         
@@ -275,7 +317,7 @@ int DecisionTree::findAndUpdateBestFeatureToSplitOn(RFTreeNode* node){
              double featureIntrinsicValue;
              
              getMinEntropyOfFeature(bootstrappedFeatureVectors[tryIndex], bootstrappedOutputVector, featureMinEntropy, featureSplitValue, featureIntrinsicValue);
-            if (m->control_pressed) {return 0;}
+            if (m->control_pressed) { return 0; }
              
              featureSubsetEntropies.push_back(featureMinEntropy);
              featureSubsetSplitValues.push_back(featureSplitValue);
@@ -290,23 +332,33 @@ int DecisionTree::findAndUpdateBestFeatureToSplitOn(RFTreeNode* node){
          vector<double>::iterator minEntropyIterator = min_element(featureSubsetEntropies.begin(), featureSubsetEntropies.end());
          vector<double>::iterator maxGainRatioIterator = max_element(featureSubsetGainRatios.begin(), featureSubsetGainRatios.end());
          double featureMinEntropy = *minEntropyIterator;
-        //double featureMaxGainRatio = *maxGainRatioIterator;
+        
+        // TODO: kept the following line as future reference, can be useful
+        // double featureMaxGainRatio = *maxGainRatioIterator;
          
          double bestFeatureSplitEntropy = featureMinEntropy;
          int bestFeatureToSplitOnIndex = -1;
-        if (treeSplitCriterion == "gainRatio"){ 
+        if (treeSplitCriterion == "gainratio"){
              bestFeatureToSplitOnIndex = (int)(maxGainRatioIterator - featureSubsetGainRatios.begin());
              // if using 'gainRatio' measure, then featureMinEntropy must be re-updated, as the index
              // for 'featureMaxGainRatio' would be different
              bestFeatureSplitEntropy = featureSubsetEntropies[bestFeatureToSplitOnIndex];
+        } else  if ( treeSplitCriterion == "infogain"){
+            bestFeatureToSplitOnIndex = (int)(minEntropyIterator - featureSubsetEntropies.begin());
+        } else {
+                // TODO: we need an abort mechanism here
          }
-        else { bestFeatureToSplitOnIndex = (int)(minEntropyIterator - featureSubsetEntropies.begin()); }
+        
+            // TODO: is the following line needed? kept is as future reference
+        // splitInformationGain = node.ownEntropy - node.splitFeatureEntropy
          
          int bestFeatureSplitValue = featureSubsetSplitValues[bestFeatureToSplitOnIndex];
          
          node->setSplitFeatureIndex(featureSubsetIndices[bestFeatureToSplitOnIndex]);
          node->setSplitFeatureValue(bestFeatureSplitValue);
          node->setSplitFeatureEntropy(bestFeatureSplitEntropy);
+            // TODO: kept the following line as future reference
+        // node.splitInformationGain = splitInformationGain
          
          return 0;
      }
@@ -363,17 +415,17 @@ vector<int> DecisionTree::selectFeatureSubsetRandomly(vector<int> globalDiscarde
  int DecisionTree::printTree(RFTreeNode* treeNode, string caption){
      try { 
          string tabs = "";
-        for (int i = 0; i < treeNode->getGeneration(); i++) { tabs += "   "; }
+        for (int i = 0; i < treeNode->getGeneration(); i++) { tabs += "|--"; }
          //    for (int i = 0; i < treeNode->getGeneration() - 1; i++) { tabs += "|  "; }
          //    if (treeNode->getGeneration() != 0) { tabs += "|--"; }
          
          if (treeNode != NULL && treeNode->checkIsLeaf() == false){
-            m->mothurOut(tabs + caption + " [ gen: " + toString(treeNode->getGeneration()) + " ] ( " + toString(treeNode->getSplitFeatureValue()) + " < X" + toString(treeNode->getSplitFeatureIndex()) +" )\n");
+            m->mothurOut(tabs + caption + " [ gen: " + toString(treeNode->getGeneration()) + " , id: " + toString(treeNode->nodeId) + " ] ( " + toString(treeNode->getSplitFeatureValue()) + " < X" + toString(treeNode->getSplitFeatureIndex()) + " ) ( predicted: " + toString(treeNode->outputClass) + " , misclassified: " + toString(treeNode->testSampleMisclassificationCount) + " )\n");
              
-            printTree(treeNode->getLeftChildNode(), "leftChild");
-            printTree(treeNode->getRightChildNode(), "rightChild");
+            printTree(treeNode->getLeftChildNode(), "left ");
+            printTree(treeNode->getRightChildNode(), "right");
          }else {
-            m->mothurOut(tabs + caption + " [ gen: " + toString(treeNode->getGeneration()) + " ] ( classified to: " + toString(treeNode->getOutputClass()) + ", samples: " + toString(treeNode->getNumSamples()) + " )\n");
+            m->mothurOut(tabs + caption + " [ gen: " + toString(treeNode->getGeneration()) + " , id: " + toString(treeNode->nodeId) + " ] ( classified to: " + toString(treeNode->getOutputClass()) + ", samples: " + toString(treeNode->getNumSamples()) + " , misclassified: " + toString(treeNode->testSampleMisclassificationCount) + " )\n");
          }
          return 0;
      }
@@ -388,7 +440,7 @@ void DecisionTree::deleteTreeNodesRecursively(RFTreeNode* treeNode) {
          if (treeNode == NULL) { return; }
          deleteTreeNodesRecursively(treeNode->leftChildNode);
          deleteTreeNodesRecursively(treeNode->rightChildNode);
-        delete treeNode;
+        delete treeNode; treeNode = NULL;
      }
         catch(exception& e) {
                 m->errorOut(e, "DecisionTree", "deleteTreeNodesRecursively");
@@ -397,3 +449,87 @@ void DecisionTree::deleteTreeNodesRecursively(RFTreeNode* treeNode) {
  }
  /***********************************************************************/
  
+void DecisionTree::pruneTree(double pruneAggressiveness = 0.9) {
+    
+    // find out the number of misclassification by each of the nodes
+    for (int i = 0; i < bootstrappedTestSamples.size(); i++) {
+        if (m->control_pressed) { return; }
+        
+        vector<int> testSample = bootstrappedTestSamples[i];
+        updateMisclassificationCountRecursively(rootNode, testSample);
+    }
+    
+    // do the actual pruning
+    pruneRecursively(rootNode, pruneAggressiveness);
+}
+/***********************************************************************/
+
+void DecisionTree::pruneRecursively(RFTreeNode* treeNode, double pruneAggressiveness){
+    
+    if (treeNode != NULL && treeNode->checkIsLeaf() == false) {
+        if (m->control_pressed) { return; }
+        
+        pruneRecursively(treeNode->leftChildNode, pruneAggressiveness);
+        pruneRecursively(treeNode->rightChildNode, pruneAggressiveness);
+        
+        int subTreeMisclassificationCount = treeNode->leftChildNode->getTestSampleMisclassificationCount() + treeNode->rightChildNode->getTestSampleMisclassificationCount();
+        int ownMisclassificationCount = treeNode->getTestSampleMisclassificationCount();
+        
+        if (subTreeMisclassificationCount * pruneAggressiveness > ownMisclassificationCount) {
+                // TODO: need to check the effect of these two delete calls
+            delete treeNode->leftChildNode;
+            treeNode->leftChildNode = NULL;
+            
+            delete treeNode->rightChildNode;
+            treeNode->rightChildNode = NULL;
+            
+            treeNode->isLeaf = true;
+        }
+        
+    }
+}
+/***********************************************************************/
+
+void DecisionTree::updateMisclassificationCountRecursively(RFTreeNode* treeNode, vector<int> testSample) {
+    
+    int actualSampleOutputClass = testSample[numFeatures];
+    int nodePredictedOutputClass = treeNode->outputClass;
+    
+    if (actualSampleOutputClass != nodePredictedOutputClass) {
+        treeNode->testSampleMisclassificationCount++;
+        map<int, int>::iterator it = nodeMisclassificationCounts.find(treeNode->nodeId);
+        if (it == nodeMisclassificationCounts.end()) {  // NOT FOUND
+            nodeMisclassificationCounts[treeNode->nodeId] = 0;
+        }
+        nodeMisclassificationCounts[treeNode->nodeId]++;
+    }
+    
+    if (treeNode->checkIsLeaf() == false) { // NOT A LEAF
+        int sampleSplitFeatureValue = testSample[treeNode->splitFeatureIndex];
+        if (sampleSplitFeatureValue < treeNode->splitFeatureValue) {
+            updateMisclassificationCountRecursively(treeNode->leftChildNode, testSample);
+        } else {
+            updateMisclassificationCountRecursively(treeNode->rightChildNode, testSample);
+        }
+    }
+}
+
+/***********************************************************************/
+
+void DecisionTree::updateOutputClassOfNode(RFTreeNode* treeNode) {
+    vector<int> counts(numOutputClasses, 0);
+    for (int i = 0; i < treeNode->bootstrappedOutputVector.size(); i++) {
+        int bootstrappedOutput = treeNode->bootstrappedOutputVector[i];
+        counts[bootstrappedOutput]++;
+    }
+
+    vector<int>::iterator majorityVotedOutputClassCountIterator = max_element(counts.begin(), counts.end());
+    int majorityVotedOutputClassCount = *majorityVotedOutputClassCountIterator;
+    vector<int>::iterator it = find(counts.begin(), counts.end(), majorityVotedOutputClassCount);
+    int majorityVotedOutputClass = (int)(it - counts.begin());
+    treeNode->setOutputClass(majorityVotedOutputClass);
+
+}
+/***********************************************************************/
+
+
diff --git a/decisiontree.hpp b/decisiontree.hpp

index d4441ed738a3049c1ec6fce73b67a91c7b1cdfb3..e890c3214c496f5a208b68744cb18d665cd1aabd 100755 (executable)
--- a/decisiontree.hpp
+++ b/decisiontree.hpp
@@ -6,8 +6,8 @@
    //  Copyright (c) 2012 Schloss Lab. All rights reserved.
    //
  
-#ifndef rrf_fs_prototype_decisiontree_hpp
-#define rrf_fs_prototype_decisiontree_hpp
+#ifndef RF_DECISIONTREE_HPP
+#define RF_DECISIONTREE_HPP
  
  #include "macros.h"
  #include "rftreenode.hpp"
@@ -16,10 +16,14 @@
  /***********************************************************************/
  
  struct VariableRankDescendingSorter {
-  bool operator() (vector<int> first, vector<int> second){ return first[1] > second[1]; }
+  bool operator() (const pair<int, int>& firstPair, const pair<int, int>& secondPair){
+      return firstPair.second > secondPair.second;
+  }
  };
  struct VariableRankDescendingSorterDouble {
-    bool operator() (vector<double> first, vector<double> second){ return first[1] > second[1]; }
+    bool operator() (const pair<int, double>& firstPair, const pair<int, double>& secondPair){
+        return firstPair.second > secondPair.second;
+    }
  };
  /***********************************************************************/
  
@@ -29,19 +33,31 @@ class DecisionTree: public AbstractDecisionTree{
      
  public:
      
-    DecisionTree(vector< vector<int> > baseDataSet,
+    DecisionTree(vector< vector<int> >& baseDataSet,
                   vector<int> globalDiscardedFeatureIndices,
                   OptimumFeatureSubsetSelector optimumFeatureSubsetSelector,
-                 string treeSplitCriterion);
+                 string treeSplitCriterion,
+                 float featureStandardDeviationThreshold);
+    
      virtual ~DecisionTree(){ deleteTreeNodesRecursively(rootNode); }
      
-    int calcTreeVariableImportanceAndError();
+    int calcTreeVariableImportanceAndError(int& numCorrect, double& treeErrorRate);
      int evaluateSample(vector<int> testSample);
      int calcTreeErrorRate(int& numCorrect, double& treeErrorRate);
-    vector< vector<int> > randomlyShuffleAttribute(vector< vector<int> > samples, int featureIndex);  
+    
+    void randomlyShuffleAttribute(const vector< vector<int> >& samples,
+                                  const int featureIndex,
+                                  const int prevFeatureIndex,
+                                  vector< vector<int> >& shuffledSample);
+    
      void purgeDataSetsFromTree() { purgeTreeNodesDataRecursively(rootNode); }
      int purgeTreeNodesDataRecursively(RFTreeNode* treeNode);
      
+    void pruneTree(double pruneAggressiveness);
+    void pruneRecursively(RFTreeNode* treeNode, double pruneAggressiveness);
+    void updateMisclassificationCountRecursively(RFTreeNode* treeNode, vector<int> testSample);
+    void updateOutputClassOfNode(RFTreeNode* treeNode);
+    
      
  private:
      
@@ -54,6 +70,8 @@ private:
      
      vector<int> variableImportanceList;
      map<int, int> outOfBagEstimates;
+  
+    float featureStandardDeviationThreshold;
  };
  
  #endif
diff --git a/deconvolutecommand.cpp b/deconvolutecommand.cpp

index 8b73425fc5c2aefb2d7e3b4a87caf4fa8be44c40..51fa43b39f87f06a3054808acb4937e05ae2337f 100644 (file)
--- a/deconvolutecommand.cpp
+++ b/deconvolutecommand.cpp
@@ -209,7 +209,7 @@ int DeconvoluteCommand::execute() {
          }
          CountTable ct;
          if (countfile != "")  {  
-            ct.readTable(countfile, false);
+            ct.readTable(countfile, true);
              if (countfile == outCountFile){
                  //prepare filenames and open files
                  map<string, string> mvariables;
diff --git a/designmap.cpp b/designmap.cpp

new file mode 100644 (file)

index 0000000..dff9700
--- /dev/null
+++ b/designmap.cpp
@@ -0,0 +1,502 @@
+//
+//  designmap.cpp
+//  Mothur
+//
+//  Created by SarahsWork on 6/17/13.
+//  Copyright (c) 2013 Schloss Lab. All rights reserved.
+//
+
+#include "designmap.h"
+
+/************************************************************/
+DesignMap::DesignMap(string file) {
+    try {
+        m = MothurOut::getInstance();
+        defaultClass = "not found";
+        read(file);
+    }
+       catch(exception& e) {
+               m->errorOut(e, "DesignMap", "DesignMap");
+               exit(1);
+       }
+}
+/************************************************************/
+int DesignMap::read(string file) {
+    try {
+        ifstream in;
+        m->openInputFile(file, in);
+        
+        string headers = m->getline(in); m->gobble(in);
+        vector<string> columnHeaders = m->splitWhiteSpace(headers);
+        
+        namesOfCategories.clear();
+        indexCategoryMap.clear();
+        indexNameMap.clear();
+        designMap.clear();
+        map<int, string> originalGroupIndexes;
+        for (int i = 1; i < columnHeaders.size(); i++) {  namesOfCategories.push_back(columnHeaders[i]);  originalGroupIndexes[i-1] = columnHeaders[i]; }
+        if (columnHeaders.size() > 1) { defaultClass = columnHeaders[1]; }
+        
+        //sort groups to keep consistent with how we store the groups in groupmap
+        sort(namesOfCategories.begin(), namesOfCategories.end());
+        for (int i = 0; i < namesOfCategories.size(); i++) {  indexCategoryMap[namesOfCategories[i]] = i; }
+        int numCategories = namesOfCategories.size();
+        
+        bool error = false;
+        string group;
+        totalCategories.resize(numCategories);
+        int count = 0;
+        while (!in.eof()) {
+            
+            if (m->control_pressed) { break; }
+            
+            in >> group; m->gobble(in); 
+            if (m->debug) { m->mothurOut("[DEBUG]: group = " + group + "\n"); }
+            
+            //if group info, then read it
+            vector<string> categoryValues; categoryValues.resize(numCategories, "not found");
+            for (int i = 0; i < numCategories; i++) {
+                int thisIndex = indexCategoryMap[originalGroupIndexes[i]]; //find index of this category because we sort the values.
+                string temp = "not found";
+                in >> temp; categoryValues[thisIndex] = temp; m->gobble(in);
+                
+                if (m->debug) { m->mothurOut("[DEBUG]: value = " + temp + "\n"); }
+                
+                //do we have this value for this category already
+                map<string, int>::iterator it = totalCategories[thisIndex].find(temp);
+                if (it == totalCategories[thisIndex].end()) { totalCategories[thisIndex][temp] = 1; }
+                else {  totalCategories[thisIndex][temp]++; }
+            }
+                           
+            
+            map<string, int>::iterator it = indexNameMap.find(group);
+            if (it == indexNameMap.end()) {
+                indexNameMap[group] = count;
+                designMap.push_back(categoryValues);
+                count++;
+            }else {
+                error = true;
+                m->mothurOut("[ERROR]: Your design file contains more than 1 group named " + group + ", group names must be unique. Please correct."); m->mothurOutEndLine();
+            }
+        }
+        in.close();
+        
+        //sanity check
+        for (int i = 0; i < totalCategories.size(); i++) {
+            map<string, int>::iterator it = totalCategories[i].find(namesOfCategories[i]);
+            if (it != totalCategories[i].end()) { //we may have an old style design file since category name matches a value
+                m->mothurOut("\n[WARNING]: Your design file has a category and value for that category named " + namesOfCategories[i] + ". Perhaps you are using an old style design file without headers? If so, please correct."); m->mothurOutEndLine();
+            }
+        }
+        
+        if (error) { m->control_pressed = true; }
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "DesignMap", "read");
+               exit(1);
+       }
+}
+/************************************************************/
+////groupName, returns first categories value. 
+string DesignMap::get(string groupName) {
+    try {
+        string value = "not found";
+        
+        map<string, int>::iterator it2 = indexNameMap.find(groupName);
+        if (it2 == indexNameMap.end()) {
+            m->mothurOut("[ERROR]: group " + groupName + " is not in your design file. Please correct.\n"); m->control_pressed = true;
+        }else {
+            return designMap[it2->second][0];
+        }
+       
+        return value;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "DesignMap", "get");
+               exit(1);
+       }
+}
+/************************************************************/
+////categoryName, returns category values.
+vector<string> DesignMap::getValues(string catName) {
+    try {
+        vector<string> values;
+        
+        map<string, int>::iterator it2 = indexCategoryMap.find(catName);
+        if (it2 == indexCategoryMap.end()) {
+            m->mothurOut("[ERROR]: category " + catName + " is not in your design file. Please correct.\n"); m->control_pressed = true;
+        }else {
+            for (map<string, int>::iterator it = totalCategories[it2->second].begin(); it != totalCategories[it2->second].end(); it++) {
+                values.push_back(it->first);
+            }
+        }
+        
+        return values;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "DesignMap", "getValues");
+               exit(1);
+       }
+}
+
+/************************************************************/
+////groupName, category returns value. example F000132, sex -> male
+string DesignMap::get(string groupName, string categoryName) {
+    try {
+        string value = "not found";
+        map<string, int>::iterator it = indexCategoryMap.find(categoryName);
+        if (it == indexCategoryMap.end()) {
+            m->mothurOut("[ERROR]: category " + categoryName + " is not in your design file. Please correct.\n"); m->control_pressed = true;
+        }else {
+            map<string, int>::iterator it2 = indexNameMap.find(groupName);
+            if (it2 == indexNameMap.end()) {
+                m->mothurOut("[ERROR]: group " + groupName + " is not in your design file. Please correct.\n"); m->control_pressed = true;
+            }else {
+                return designMap[it2->second][it->second];
+            }
+        }
+        return value;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "DesignMap", "get");
+               exit(1);
+       }
+}
+/************************************************************/
+//add group, assumes order is correct
+int DesignMap::push_back(string group, vector<string> values) {
+    try {
+        map<string, int>::iterator it = indexNameMap.find(group);
+        if (it == indexNameMap.end()) {
+            if (values.size() != getNumCategories()) {  m->mothurOut("[ERROR]: Your design file has a " + toString(getNumCategories()) + " categories and " + group + " has " + toString(values.size()) + ", please correct."); m->mothurOutEndLine(); m->control_pressed = true;  return 0; }
+            
+            for (int i = 0; i < values.size(); i++) {
+                //do we have this value for this category already
+                map<string, int>::iterator it = totalCategories[i].find(values[i]);
+                if (it == totalCategories[i].end()) { totalCategories[i][values[i]] = 1; }
+                else {  totalCategories[i][values[i]]++; }
+            }
+            int count = indexNameMap.size();
+            indexNameMap[group] = count;
+            designMap.push_back(values);
+        }else {
+            m->mothurOut("[ERROR]: Your design file contains more than 1 group named " + group + ", group names must be unique. Please correct."); m->mothurOutEndLine(); m->control_pressed = true;
+        }
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "DesignMap", "push_back");
+               exit(1);
+       }
+}
+/************************************************************/
+//set values for group, does not need to set all values. assumes group is in table already
+int DesignMap::set(string group, map<string, string> values) {
+    try {
+        map<string, int>::iterator it = indexNameMap.find(group);
+        if (it != indexNameMap.end()) {
+            for (map<string, string>::iterator it2 = values.begin(); it2 != values.end(); it2++) {
+                
+                map<string, int>::iterator it3 = indexCategoryMap.find(it2->first); //do we have this category
+                if (it3 == indexCategoryMap.end()) {
+                     m->mothurOut("[ERROR]: Your design file does not contain a category called " + it2->first + ". Please correct."); m->mothurOutEndLine(); m->control_pressed = true;
+                }else {
+                    string oldCategory = designMap[it->second][it3->second];
+                    //adjust totals for old category
+                    int oldCount = totalCategories[it3->second][oldCategory];
+                    if (oldCount == 1) { totalCategories[it3->second].erase(oldCategory); }
+                    else {  totalCategories[it3->second][oldCategory]--; }
+                    
+                    designMap[it->second][it3->second] = it2->second; //reset value
+                    
+                    //adjust totals for new category
+                    map<string, int>::iterator it4 = totalCategories[it3->second].find(it2->second);
+                    if (it4 == totalCategories[it3->second].end()) { totalCategories[it3->second][it2->second] = 1; }
+                    else {  totalCategories[it3->second][it2->second]++; }
+                }
+            }
+        }else {
+            m->mothurOut("[ERROR]: Your design file does not contain a group named " + group + ". Please correct."); m->mothurOutEndLine(); m->control_pressed = true;
+        }
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "DesignMap", "set");
+               exit(1);
+       }
+}
+/************************************************************/
+//get number of groups belonging to a category or set of categories, with value or a set of values. Must have all categories and values. Example:
+//  map<treatment - > early, late>, <sex -> male> would return 1. Only one group is male and from early or late.
+int DesignMap::getNumUnique(map<string, vector<string> > selected) {
+    try {
+        int num = 0;
+        
+        //get indexes of categories
+        vector<int> indexes;
+        for (map<string, vector<string> >::iterator it = selected.begin(); it != selected.end(); it++) {
+            map<string, int>::iterator it2 = indexCategoryMap.find(it->first);
+            if (it2 == indexCategoryMap.end()) {
+                m->mothurOut("[ERROR]: Your design file does not contain a category named " + it->first + ". Please correct."); m->mothurOutEndLine(); m->control_pressed = true; return 0;
+            }else { indexes.push_back(it2->second); }
+        }
+        
+        for (int i = 0; i < designMap.size(); i++) {
+            bool hasAll = true; //innocent til proven guilty
+            int count = 0;
+            for (map<string, vector<string> >::iterator it = selected.begin(); it != selected.end(); it++) { //loop through each
+                //check category to see if this group meets the requirements
+                if (!m->inUsersGroups(designMap[i][indexes[count]], it->second)) { hasAll = false; it = selected.end(); }
+                count++;
+            }
+            if (hasAll) { num++; }
+        }
+        
+        return num;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "DesignMap", "getNumUnique");
+               exit(1);
+       }
+}
+/************************************************************/
+//get number of groups belonging to a category or set of categories, with value or a set of values. Must have at least one categories and values. Example:
+//  map<treatment - > early, late>, <sex -> male> would return 3. All three group have are either male or from early or late.
+int DesignMap::getNumShared(map<string, vector<string> > selected) {
+    try {
+        int num = 0;
+        
+        //get indexes of categories
+        vector<int> indexes;
+        for (map<string, vector<string> >::iterator it = selected.begin(); it != selected.end(); it++) {
+            map<string, int>::iterator it2 = indexCategoryMap.find(it->first);
+            if (it2 == indexCategoryMap.end()) {
+                m->mothurOut("[ERROR]: Your design file does not contain a category named " + it->first + ". Please correct."); m->mothurOutEndLine(); m->control_pressed = true; return 0;
+            }else { indexes.push_back(it2->second); }
+        }
+        
+        for (int i = 0; i < designMap.size(); i++) {
+            bool hasAny = false; //guilty til proven innocent
+            int count = 0;
+            for (map<string, vector<string> >::iterator it = selected.begin(); it != selected.end(); it++) { //loop through each
+                //check category to see if this group meets the requirements
+                if (m->inUsersGroups(designMap[i][indexes[count]], it->second)) { hasAny = true; it = selected.end(); }
+                count++;
+            }
+            if (hasAny) { num++; }
+        }
+
+        
+        return num;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "DesignMap", "getNumShared");
+               exit(1);
+       }
+}
+
+/************************************************************/
+//get names of groups belonging to a category or set of categories, with value or a set of values. Must have all categories and values. Example:
+//  map<treatment - > early, late>, <sex -> male> would return F000132. F000132 is the only group which is male and from early or late.
+vector<string> DesignMap::getNamesUnique(map<string, vector<string> > selected) {
+    try {
+        vector<string> names;
+        
+        //get indexes of categories
+        vector<int> indexes;
+        for (map<string, vector<string> >::iterator it = selected.begin(); it != selected.end(); it++) {
+            map<string, int>::iterator it2 = indexCategoryMap.find(it->first);
+            if (it2 == indexCategoryMap.end()) {
+                m->mothurOut("[ERROR]: Your design file does not contain a category named " + it->first + ". Please correct."); m->mothurOutEndLine(); m->control_pressed = true; return names;
+            }else { indexes.push_back(it2->second); }
+        }
+        
+        //map int to name
+        map<int, string> reverse;
+        for (map<string, int>::iterator it = indexNameMap.begin(); it != indexNameMap.end(); it++) {
+            reverse[it->second] = it->first;
+        }
+        
+        for (int i = 0; i < designMap.size(); i++) {
+            bool hasAll = true; //innocent til proven guilty
+            int count = 0;
+            for (map<string, vector<string> >::iterator it = selected.begin(); it != selected.end(); it++) { //loop through each
+                //check category to see if this group meets the requirements
+                if (!m->inUsersGroups(designMap[i][indexes[count]], it->second)) { hasAll = false; it = selected.end(); }
+                count++;
+            }
+            if (hasAll) {
+                map<int, string>::iterator it = reverse.find(i);
+                if (it == reverse.end()) {
+                    m->mothurOut("[ERROR]: should never get here, oops. Please correct."); m->mothurOutEndLine(); m->control_pressed = true; return names;
+                }else { names.push_back(it->second); }
+            }
+        }
+
+        
+        return names;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "DesignMap", "getNamesUnique");
+               exit(1);
+       }
+}
+/************************************************************/
+//get names of groups belonging to a category or set of categories, with value or a set of values. Must have all categories and values. Example:
+//  map<treatment - > early, late>, <sex -> male> would return F000132. F000132 is the only group which is male and from early or late.
+vector<string> DesignMap::getNamesShared(map<string, vector<string> > selected) {
+    try {
+        vector<string> names;
+        
+        //get indexes of categories
+        vector<int> indexes;
+        for (map<string, vector<string> >::iterator it = selected.begin(); it != selected.end(); it++) {
+            map<string, int>::iterator it2 = indexCategoryMap.find(it->first);
+            if (it2 == indexCategoryMap.end()) {
+                m->mothurOut("[ERROR]: Your design file does not contain a category named " + it->first + ". Please correct."); m->mothurOutEndLine(); m->control_pressed = true; return names;
+            }else { indexes.push_back(it2->second); }
+        }
+        
+        //map int to name
+        map<int, string> reverse;
+        for (map<string, int>::iterator it = indexNameMap.begin(); it != indexNameMap.end(); it++) {
+            reverse[it->second] = it->first;
+        }
+        
+        for (int i = 0; i < designMap.size(); i++) {
+            bool hasAny = false; 
+            int count = 0;
+            for (map<string, vector<string> >::iterator it = selected.begin(); it != selected.end(); it++) { //loop through each
+                //check category to see if this group meets the requirements
+                if (m->inUsersGroups(designMap[i][indexes[count]], it->second)) { hasAny = true; it = selected.end(); }
+                count++;
+            }
+            if (hasAny) {
+                map<int, string>::iterator it = reverse.find(i);
+                if (it == reverse.end()) {
+                    m->mothurOut("[ERROR]: should never get here, oops. Please correct."); m->mothurOutEndLine(); m->control_pressed = true; return names;
+                }else { names.push_back(it->second); }
+            }
+        }
+        
+        
+        return names;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "DesignMap", "getNamesShared");
+               exit(1);
+       }
+}
+
+/************************************************************/
+//get names of groups belonging to a category or set of categories, with value or a set of values. Must have at least one categories and values. Example:
+//  map<treatment - > early, late>, <sex -> male> would return F000132, F000142, F000138. All three group have are either male or from early or late.
+
+vector<string> DesignMap::getNames(string category, string value) {
+    try {
+        vector<string> names;
+        
+        map<string, int>::iterator it = indexCategoryMap.find(category);
+        if (it == indexCategoryMap.end()) {
+            m->mothurOut("[ERROR]: category " + category + " is not in your design file. Please correct.\n"); m->control_pressed = true;
+        }else {
+            int column = it->second;
+            
+            //map int to name
+            map<int, string> reverse;
+            for (map<string, int>::iterator it2 = indexNameMap.begin(); it2 != indexNameMap.end(); it2++) {
+                reverse[it2->second] = it2->first;
+            }
+            
+            for (int i = 0; i < designMap.size(); i++) {
+                if (designMap[i][column] == value) {
+                    map<int, string>::iterator it2 = reverse.find(i);
+                    if (it2 == reverse.end()) {
+                        m->mothurOut("[ERROR]: should never get here, oops. Please correct."); m->mothurOutEndLine(); m->control_pressed = true; return names;
+                    }else { names.push_back(it2->second); }
+                }
+            }
+        }
+        return names;
+        
+    }
+       catch(exception& e) {
+               m->errorOut(e, "DesignMap", "getNames");
+               exit(1);
+       }
+}
+
+/************************************************************/
+int DesignMap::print(ofstream& out) {
+    try {
+       
+               out << "group\t";
+        for (int i = 0; i < namesOfCategories.size(); i++) { out << namesOfCategories[i] << '\t'; }
+        out << endl;
+        
+        map<int, string> reverse; //use this to preserve order
+        for (map<string, int>::iterator it = indexNameMap.begin(); it !=indexNameMap.end(); it++) { reverse[it->second] = it->first;  }
+        
+        for (int i = 0; i < designMap.size(); i++) {
+            map<int, string>::iterator itR = reverse.find(i);
+            
+            if (itR != reverse.end()) { //will equal end if seqs were removed because remove just removes from indexNameMap
+                out << itR->second  << '\t';
+                
+                for (int j = 0; j < namesOfCategories.size(); j++) {
+                    out << designMap[i][j] << '\t';
+                }
+                out << endl;
+            }
+        }
+        out.close();
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "DesignMap", "print");
+               exit(1);
+       }
+}
+/************************************************************/
+//print specific categories
+int DesignMap::print(ofstream& out, vector<string> cats) {
+    try {
+        
+               out << "group\t";
+        for (int i = 0; i < namesOfCategories.size(); i++) { if (m->inUsersGroups(namesOfCategories[i], cats)) { out << namesOfCategories[i] << '\t'; } }
+        out << endl;
+        
+        map<int, string> reverse; //use this to preserve order
+        for (map<string, int>::iterator it = indexNameMap.begin(); it !=indexNameMap.end(); it++) { reverse[it->second] = it->first;  }
+        
+        for (int i = 0; i < designMap.size(); i++) {
+            map<int, string>::iterator itR = reverse.find(i);
+            
+            if (itR != reverse.end()) { //will equal end if seqs were removed because remove just removes from indexNameMap
+                out << itR->second  << '\t';
+                
+                for (int j = 0; j < namesOfCategories.size(); j++) {
+                    if (m->inUsersGroups(namesOfCategories[i], cats)) {
+                        out << designMap[i][j] << '\t';
+                    }
+                }
+                out << endl;
+            }
+        }
+        out.close();
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "DesignMap", "print");
+               exit(1);
+       }
+}
+
+/************************************************************/
+
diff --git a/designmap.h b/designmap.h

new file mode 100644 (file)

index 0000000..b6dec3b
--- /dev/null
+++ b/designmap.h
@@ -0,0 +1,70 @@
+//
+//  designmap.h
+//  Mothur
+//
+//  Created by SarahsWork on 6/17/13.
+//  Copyright (c) 2013 Schloss Lab. All rights reserved.
+//
+
+#ifndef __Mothur__designmap__
+#define __Mothur__designmap__
+
+#include "mothurout.h"
+
+/* This class is a representation of the design file.  
+ 
+ group    treatment    sex        age
+ F000142    Early       female    young
+ F000132    Late        male        old
+ F000138    Mid         male        old
+ 
+ 
+ */
+
+class DesignMap {
+public:
+       DesignMap() { m = MothurOut::getInstance(); defaultClass = "not found"; }
+       DesignMap(string); 
+       ~DesignMap() {}
+    
+    int read(string);
+    string get(string, string); //groupName, category returns value. example F000132, sex -> male
+    string get(string); //groupName, returns first categories value. example F000132, -> late
+    vector<string> getValues(string); //categoryName, returns values. example treatment, -> early,late,mid
+       int set(string, map<string, string>); //groupName, map<category, value>
+    int push_back(string, vector<string>); //groupName, vector<value> - assumes you put values in order of getNamesOfCategories
+       vector<string> getNamesOfCategories() {
+               sort(namesOfCategories.begin(), namesOfCategories.end());
+        return namesOfCategories;
+       }
+    int getNumCategories() { return namesOfCategories.size(); }
+       int getNum()  {  return designMap.size();  }
+    int getNumUnique(map<string, vector<string> >); //get number of groups belonging to a category or set of categories, with value or a set of values. Must have all categories and values. Example:
+    //  map<treatment - > early, late>, <sex -> male> would return 1. Only one group is male and from early or late.
+    int getNumShared(map<string, vector<string> >); //get number of groups belonging to a category or set of categories, with value or a set of values. Must have at least one categories and values. Example:
+    //  map<treatment - > early, late>, <sex -> male> would return 3. All three group have are either male or from early or late.
+
+    vector<string> getNames();
+    vector<string> getNames(string, string); //get names group from category and value.
+       vector<string> getNamesUnique(map<string, vector<string> >); //get names of groups belonging to a category or set of categories, with value or a set of values. Must have all categories and values. Example:
+    //  map<treatment - > early, late>, <sex -> male> would return F000132. F000132 is the only group which is male and from early or late.
+    vector<string> getNamesShared(map<string, vector<string> >); //get names of groups belonging to a category or set of categories, with value or a set of values. Must have at least one categories and values. Example:
+    //  map<treatment - > early, late>, <sex -> male> would return F000132, F000142, F000138. All three group have are either male or from early or late.
+
+    int print(ofstream&);
+    int print(ofstream&, vector<string>); //print certain categories
+    
+    string getDefaultClass() { return defaultClass; }
+    
+private:
+    string defaultClass;
+       vector<string> namesOfCategories;
+       MothurOut* m;
+       vector< vector<string> > designMap;
+    vector< map<string, int> > totalCategories;  //for each category, total groups assigned to it.   vector[0] early -> 1, vector[1] male -> 2
+    map<string, int> indexNameMap;
+    map<string, int> indexCategoryMap;
+};
+
+
+#endif /* defined(__Mothur__designmap__) */
diff --git a/forest.cpp b/forest.cpp

index 179ecef19d26de2a61f68776445dfdfc72132f07..3cfb5b9054cf5d43fbbb35eb7b81a6a2ba0f5833 100644 (file)
--- a/forest.cpp
+++ b/forest.cpp
@@ -10,14 +10,28 @@
  
  /***********************************************************************/
  Forest::Forest(const std::vector < std::vector<int> > dataSet,
-                                           const int numDecisionTrees,
-                                           const string treeSplitCriterion = "informationGain")
-: dataSet(dataSet),
-numDecisionTrees(numDecisionTrees),
-numSamples((int)dataSet.size()),
-numFeatures((int)(dataSet[0].size() - 1)),
-globalVariableImportanceList(numFeatures, 0),
-treeSplitCriterion(treeSplitCriterion) {
+               const int numDecisionTrees,
+               const string treeSplitCriterion = "gainratio",
+               const bool doPruning = false,
+               const float pruneAggressiveness = 0.9,
+               const bool discardHighErrorTrees = true,
+               const float highErrorTreeDiscardThreshold = 0.4,
+               const string optimumFeatureSubsetSelectionCriteria = "log2",
+               const float featureStandardDeviationThreshold = 0.0)
+      : dataSet(dataSet),
+        numDecisionTrees(numDecisionTrees),
+        numSamples((int)dataSet.size()),
+        numFeatures((int)(dataSet[0].size() - 1)),
+        globalVariableImportanceList(numFeatures, 0),
+        treeSplitCriterion(treeSplitCriterion),
+        doPruning(doPruning),
+        pruneAggressiveness(pruneAggressiveness),
+        discardHighErrorTrees(discardHighErrorTrees),
+        highErrorTreeDiscardThreshold(highErrorTreeDiscardThreshold),
+        optimumFeatureSubsetSelectionCriteria(optimumFeatureSubsetSelectionCriteria),
+        featureStandardDeviationThreshold(featureStandardDeviationThreshold)
+        {
+        
      m = MothurOut::getInstance();
      globalDiscardedFeatureIndices = getGlobalDiscardedFeatureIndices();
      // TODO: double check if the implemenatation of 'globalOutOfBagEstimates' is correct
@@ -40,7 +54,7 @@ vector<int> Forest::getGlobalDiscardedFeatureIndices() {
          for (int i = 0; i < featureVectors.size(); i++) {
              if (m->control_pressed) { return globalDiscardedFeatureIndices; }
              double standardDeviation = m->getStandardDeviation(featureVectors[i]);
-            if (standardDeviation <= 0){ globalDiscardedFeatureIndices.push_back(i); }
+            if (standardDeviation <= featureStandardDeviationThreshold){ globalDiscardedFeatureIndices.push_back(i); }
          }
          
          if (m->debug) {
diff --git a/forest.h b/forest.h

index 78f61b3646c446bd9279eb09986d7e5d4bdc10df..9e9860de61f2ec10e9bf02192856a3224ca46193 100644 (file)
--- a/forest.h
+++ b/forest.h
@@ -21,8 +21,14 @@ class Forest{
  public:
      // intialization with vectors
      Forest(const std::vector < std::vector<int> > dataSet,
-                         const int numDecisionTrees,
-                         const string);
+           const int numDecisionTrees,
+           const string treeSplitCriterion,
+           const bool doPruning,
+           const float pruneAggressiveness,
+           const bool discardHighErrorTrees,
+           const float highErrorTreeDiscardThreshold,
+           const string optimumFeatureSubsetSelectionCriteria,
+           const float featureStandardDeviationThreshold);
      virtual ~Forest(){ }
      virtual int populateDecisionTrees() = 0;
      virtual int calcForrestErrorRate() = 0;
@@ -53,6 +59,14 @@ protected:
      vector<int> globalDiscardedFeatureIndices;
      vector<double> globalVariableImportanceList;
      string treeSplitCriterion;
+  
+    bool doPruning;
+    float pruneAggressiveness;
+    bool discardHighErrorTrees;
+    float highErrorTreeDiscardThreshold;
+    string optimumFeatureSubsetSelectionCriteria;
+    float featureStandardDeviationThreshold;
+  
      // This is a map of each feature to outcome count of each classes
      // e.g. 1 => [2 7] means feature 1 has 2 outcome of 0 and 7 outcome of 1
      map<int, vector<int> > globalOutOfBagEstimates;
diff --git a/getlineagecommand.cpp b/getlineagecommand.cpp

index 25a68be75c950de1ebad3afe6955cf308ad0ce0b..005a89f2ebbee9e1b4c37567b6ce382979c90226 100644 (file)
--- a/getlineagecommand.cpp
+++ b/getlineagecommand.cpp
@@ -318,7 +318,7 @@ GetLineageCommand::GetLineageCommand(string option)  {
              
              if ((sharedfile != "") || (listfile != "")) {
                  label = validParameter.validFile(parameters, "label", false);
-                if (label == "not found") { label = ""; m->mothurOut("You did not provide a label, I will use the first label in your inputfile."); m->mothurOutEndLine(); label=""; }
+                if (label == "not found") { label = ""; m->mothurOut("[WARNING]: You did not provide a label, I will use the first label in your inputfile."); m->mothurOutEndLine(); }
              }
              
              if (countfile == "") {
diff --git a/getmetacommunitycommand.h b/getmetacommunitycommand.h

index 2264c87b9af87c5f3363f5774116ff95bb691b8f..65e5770fdc7c9bc8f4b594fd884f523e4387e40c 100644 (file)
--- a/getmetacommunitycommand.h
+++ b/getmetacommunitycommand.h
@@ -28,8 +28,8 @@ public:
      string getOutputPattern(string);
      
         string getHelpString();
-    string getCitation() { return "http://www.mothur.org/wiki/Get.metacommunity"; }
-    string getDescription()            { return "brief description"; }
+    string getCitation() { return "Holmes I, Harris K, Quince C (2012) Dirichlet Multinomial Mixtures: Generative Models for Microbial Metagenomics. PLoS ONE 7(2): e30126. doi:10.1371/journal.pone.0030126 http://www.mothur.org/wiki/Get.metacommunity"; }
+    string getDescription()            { return "Assigns samples to bins using a Dirichlet multinomial mixture model"; }
      
      int execute();
      void help() { m->mothurOut(getHelpString()); }
diff --git a/getseqscommand.cpp b/getseqscommand.cpp

index 142023e7a40881c1c6397a761ba350132490b75c..83fdf73e9af36fd9826f5a8c2efcc11655ac938e 100644 (file)
--- a/getseqscommand.cpp
+++ b/getseqscommand.cpp
@@ -45,7 +45,7 @@ string GetSeqsCommand::getHelpString(){
                 helpString += "The get.seqs command reads an .accnos file and any of the following file types: fasta, name, group, count, list, taxonomy, quality or alignreport file.\n";
                 helpString += "It outputs a file containing only the sequences in the .accnos file.\n";
                 helpString += "The get.seqs command parameters are accnos, fasta, name, group, list, taxonomy, qfile, alignreport and dups.  You must provide accnos unless you have a valid current accnos file, and at least one of the other parameters.\n";
-               helpString += "The dups parameter allows you to add the entire line from a name file if you add any name from the line. default=false. \n";
+               helpString += "The dups parameter allows you to add the entire line from a name file if you add any name from the line. default=true. \n";
                 helpString += "The get.seqs command should be in the following format: get.seqs(accnos=yourAccnos, fasta=yourFasta).\n";
                 helpString += "Example get.seqs(accnos=amazon.accnos, fasta=amazon.fasta).\n";
                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
@@ -430,9 +430,11 @@ int GetSeqsCommand::readFasta(){
              
              if (!dups) {//adjust name if needed
                  map<string, string>::iterator it = uniqueMap.find(name);
-                if (it != uniqueMap.end()) { name = it->second; }
+                if (it != uniqueMap.end()) { currSeq.setName(it->second); }
              }
                         
+            name = currSeq.getName();
+            
                         if (name != "") {
                                 //if this name is in the accnos file
                                 if (names.count(name) != 0) {
@@ -710,7 +712,7 @@ int GetSeqsCommand::readName(){
                 
                         if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName);  return 0; }
  
-                       in >> firstCol;                         
+                       in >> firstCol;                 m->gobble(in);
                         in >> secondCol;
                         
                         string hold = "";
@@ -734,6 +736,7 @@ int GetSeqsCommand::readName(){
                                 selectedCount += parsedNames.size();
                  if (m->debug) { sanity["name"].insert(firstCol); }
                         }else {
+                
                                 selectedCount += validSecond.size();
                                 
                                 //if the name in the first column is in the set then print it and any other names in second column also in set
@@ -752,6 +755,7 @@ int GetSeqsCommand::readName(){
                                 
                                 //make first name in set you come to first column and then add the remaining names to second column
                                 }else {
+                    
                                         //you want part of this row
                                         if (validSecond.size() != 0) {
                                         
diff --git a/groupmap.cpp b/groupmap.cpp

index 9d2736405f8e306dc8e5238f8bad1a26b92b88bb..9b8aa3f8abc87ef9ff93520a80b3da9169d406f4 100644 (file)
--- a/groupmap.cpp
+++ b/groupmap.cpp
@@ -405,7 +405,66 @@ int GroupMap::getNumSeqs(string group) {
                 exit(1);
         }
  }
-
+/************************************************************/
+int GroupMap::renameSeq(string oldName, string newName) {
+       try {
+               
+               map<string, string>::iterator itName;
+               
+               itName = groupmap.find(oldName);
+               
+               if (itName == groupmap.end()) {
+            m->mothurOut("[ERROR]: cannot find " + toString(oldName) + " in group file");
+            m->control_pressed = true;
+            return 0;
+        }else {
+            string group = itName->second;
+            groupmap.erase(itName);
+            groupmap[newName] = group;
+        }
+        
+        return 0;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "GroupMap", "renameSeq");
+               exit(1);
+       }
+}
+/************************************************************/
+int GroupMap::print(ofstream& out) {
+       try {
+               
+               for (map<string, string>::iterator itName = groupmap.begin(); itName != groupmap.end(); itName++) {
+            out << itName->first << '\t' << itName->second << endl;
+        }
+             
+        return 0;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "GroupMap", "print");
+               exit(1);
+       }
+}
+/************************************************************/
+int GroupMap::print(ofstream& out, vector<string> userGroups) {
+       try {
+               
+               for (map<string, string>::iterator itName = groupmap.begin(); itName != groupmap.end(); itName++) {
+            if (m->inUsersGroups(itName->second, userGroups)) {
+                out << itName->first << '\t' << itName->second << endl;
+            }
+        }
+        
+        return 0;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "GroupMap", "print");
+               exit(1);
+       }
+}
  /************************************************************/
  vector<string> GroupMap::getNamesSeqs(){
         try {
diff --git a/groupmap.h b/groupmap.h

index bdcd61e7c763cc10e31088d3292064887a8070a8..5397e02e7c158e5074a7f8e626d813561cc6d9b6 100644 (file)
--- a/groupmap.h
+++ b/groupmap.h
@@ -40,6 +40,9 @@ public:
         vector<string> getNamesSeqs(vector<string>); //get names of seqs belonging to a group or set of groups
         int getNumSeqs(string); //return the number of seqs in a given group
      int getCopy(GroupMap*);
+    int renameSeq(string, string);
+    int print(ofstream&);
+    int print(ofstream&, vector<string>); //print certain groups
      
      
      map<string, int> groupIndex;  //groupname, vectorIndex in namesOfGroups. - used by collectdisplays and libshuff commands.
diff --git a/kruskalwalliscommand.cpp b/kruskalwalliscommand.cpp

index bf4fafa950f2f180227f61ab41e1377008927c82..805d21e51d5331576185f6d28128af8dad7cbae3 100644 (file)
--- a/kruskalwalliscommand.cpp
+++ b/kruskalwalliscommand.cpp
@@ -6,14 +6,19 @@
   */
  
  #include "kruskalwalliscommand.h"
+#include "linearalgebra.h"
  
  //**********************************************************************************************************************
-vector<string> KruskalWallisCommand::setParameters(){  
+vector<string> KruskalWallisCommand::setParameters(){
         try {
+        CommandParameter pdesign("design", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(pdesign);
+        CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none","summary",false,true,true); parameters.push_back(pshared);
+        CommandParameter pclass("class", "String", "", "", "", "", "","",false,false); parameters.push_back(pclass);
+               CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel);
+               CommandParameter pclasses("classes", "String", "", "", "", "", "","",false,false); parameters.push_back(pclasses);
+        //every command must have inputdir and outputdir.  This allows mothur users to redirect input and output files.
                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
-        CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false,true); parameters.push_back(pgroups);
-        CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none","summary",false,true,true); parameters.push_back(pshared);     
                 
                 vector<string> myArray;
                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
@@ -25,12 +30,16 @@ vector<string> KruskalWallisCommand::setParameters(){
         }
  }
  //**********************************************************************************************************************
-string KruskalWallisCommand::getHelpString(){  
+string KruskalWallisCommand::getHelpString(){
         try {
                 string helpString = "";
-               helpString += "The kruskalwallis command parameter options are \n";
-        helpString += "Kruskal–Wallis one-way analysis of variance is a non-parametric method for testing whether samples originate from the same distribution.";
-               return helpString;
+               helpString += "The kruskal.wallis command allows you to ....\n";
+               helpString += "The kruskal.wallis command parameters are: shared, design, class,  label and classes.\n";
+               helpString += "The class parameter is used to indicate the which category you would like used for the Kruskal Wallis analysis. If none is provided first category is used.\n";
+        helpString += "The classes parameter is used to indicate the classes you would like to use. Clases should be inputted in the following format: classes=label<value1|value2|value3>-label<value1|value2>. For example to include groups from treatment with value early or late and age= young or old.  class=treatment<Early|Late>-age<young|old>.\n";
+        helpString += "The label parameter is used to indicate which distances in the shared file you would like to use. labels are separated by dashes.\n";
+               helpString += "The kruskal.wallis command should be in the following format: kruskal.wallis(shared=final.an.shared, design=final.design, class=treatment).\n";
+        return helpString;
         }
         catch(exception& e) {
                 m->errorOut(e, "KruskalWallisCommand", "getHelpString");
@@ -42,7 +51,7 @@ string KruskalWallisCommand::getOutputPattern(string type) {
      try {
          string pattern = "";
          
-        if (type == "summary") {  pattern = "[filename],cooccurence.summary"; } 
+        if (type == "kruskall-wallis") {  pattern = "[filename],[distance],kruskall_wallis"; }
          else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
          
          return pattern;
@@ -53,13 +62,12 @@ string KruskalWallisCommand::getOutputPattern(string type) {
      }
  }
  //**********************************************************************************************************************
-KruskalWallisCommand::KruskalWallisCommand(){  
+KruskalWallisCommand::KruskalWallisCommand(){
         try {
-               abort = true; calledHelp = true; 
+               abort = true; calledHelp = true;
                 setParameters();
          vector<string> tempOutNames;
-               outputTypes["summary"] = tempOutNames;
-
+        outputTypes["kruskall-wallis"] = tempOutNames;
         }
         catch(exception& e) {
                 m->errorOut(e, "KruskalWallisCommand", "KruskalWallisCommand");
@@ -67,68 +75,95 @@ KruskalWallisCommand::KruskalWallisCommand(){
         }
  }
  //**********************************************************************************************************************
-KruskalWallisCommand::KruskalWallisCommand(string option) {
+KruskalWallisCommand::KruskalWallisCommand(string option)  {
         try {
-               abort = false; calledHelp = false;   
-                               
+               abort = false; calledHelp = false;
+        allLines = 1;
+               
                 //allow user to run help
                 if(option == "help") { help(); abort = true; calledHelp = true; }
                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
                 
                 else {
+                       //valid paramters for this command
                         vector<string> myArray = setParameters();
                         
                         OptionParser parser(option);
                         map<string,string> parameters = parser.getParameters();
-                       map<string,string>::iterator it;
                         
                         ValidParameters validParameter;
-                       
+                       map<string,string>::iterator it;
                         //check to make sure all parameters are valid for command
-                       for (it = parameters.begin(); it != parameters.end(); it++) { 
+                       for (it = parameters.begin(); it != parameters.end(); it++) {
                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
                         }
+                       
+                       vector<string> tempOutNames;
+            outputTypes["kruskall-wallis"] = tempOutNames;
              
-            //get shared file
-                       sharedfile = validParameter.validFile(parameters, "shared", true);
-                       if (sharedfile == "not open") { sharedfile = ""; abort = true; }        
-                       else if (sharedfile == "not found") { 
-                               //if there is a current shared file, use it
-                               sharedfile = m->getSharedFile(); 
-                               if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); }
-                               else {  m->mothurOut("You have no current sharedfile and the shared parameter is required."); m->mothurOutEndLine(); abort = true; }
-                       }else { m->setSharedFile(sharedfile); }
-            
-            //if the user changes the output directory command factory will send this info to us in the output parameter 
-                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(sharedfile);             }
-                    
-            groups = validParameter.validFile(parameters, "groups", false);   
-            if (groups == "not found") { groups = "";   }
-            else { 
-            m->splitAtDash(groups, Groups); 
-            }   
-            m->setGroups(Groups);
-                               
-                       //if the user changes the input directory command factory will send this info to us in the output parameter 
-                       string inputDir = validParameter.validFile(parameters, "inputdir", false);              
+                       //if the user changes the input directory command factory will send this info to us in the output parameter
+                       string inputDir = validParameter.validFile(parameters, "inputdir", false);
                         if (inputDir == "not found"){   inputDir = "";          }
                         else {
-                               string path;
-                               it = parameters.find("shared");
+                
+                string path;
+                               it = parameters.find("design");
+                               //user has given a template file
+                               if(it != parameters.end()){
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["desing"] = inputDir + it->second;           }
+                               }
+                               
+                it = parameters.find("shared");
                                 //user has given a template file
-                               if(it != parameters.end()){ 
+                               if(it != parameters.end()){
                                         path = m->hasPath(it->second);
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["shared"] = inputDir + it->second;           }
                                 }
+            }
+            
+            //get shared file, it is required
+                       sharedfile = validParameter.validFile(parameters, "shared", true);
+                       if (sharedfile == "not open") { sharedfile = ""; abort = true; }
+                       else if (sharedfile == "not found") {
+                               //if there is a current shared file, use it
+                               sharedfile = m->getSharedFile();
+                               if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); }
+                               else {  m->mothurOut("You have no current sharedfile and the shared parameter is required."); m->mothurOutEndLine(); abort = true; }
+                       }else { m->setSharedFile(sharedfile); }
+            
+            //get shared file, it is required
+                       designfile = validParameter.validFile(parameters, "design", true);
+                       if (designfile == "not open") { designfile = ""; abort = true; }
+                       else if (designfile == "not found") {
+                               //if there is a current shared file, use it
+                               designfile = m->getDesignFile();
+                               if (designfile != "") { m->mothurOut("Using " + designfile + " as input file for the design parameter."); m->mothurOutEndLine(); }
+                               else {  m->mothurOut("You have no current design file and the design parameter is required."); m->mothurOutEndLine(); abort = true; }
+                       }else { m->setDesignFile(designfile); }
+            
+            //if the user changes the output directory command factory will send this info to us in the output parameter
+                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){
+                               outputDir = m->hasPath(sharedfile); //if user entered a file with a path then preserve it
                         }
-               
-            vector<string> tempOutNames;
-            outputTypes["summary"] = tempOutNames;
-
-
+            
+            string label = validParameter.validFile(parameters, "label", false);
+                       if (label == "not found") { label = ""; }
+                       else {
+                               if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
+                               else { allLines = 1;  }
+                       }
+            
+            mclass = validParameter.validFile(parameters, "class", false);
+                       if (mclass == "not found") { mclass = ""; }
+            
+            classes = validParameter.validFile(parameters, "classes", false);
+                       if (classes == "not found") { classes = ""; }
+            
                 }
-
+               
         }
         catch(exception& e) {
                 m->errorOut(e, "KruskalWallisCommand", "KruskalWallisCommand");
@@ -136,128 +171,165 @@ KruskalWallisCommand::KruskalWallisCommand(string option) {
         }
  }
  //**********************************************************************************************************************
+
  int KruskalWallisCommand::execute(){
         try {
+               
                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
          
-        InputData* input = new InputData(sharedfile, "sharedfile");
-        vector<SharedRAbundVector*> lookup = input->getSharedRAbundVectors();
-               string lastLabel = lookup[0]->getLabel();
-        
-       
-               //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
-               set<string> processedLabels;
-               set<string> userLabels = labels;
-
-        ofstream out;
-        map<string,string> variables;
-        variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile));
-               string outputFileName = getOutputFileName("summary",variables);
-        m->openOutputFile(outputFileName, out);
-        outputNames.push_back(outputFileName);  outputTypes["summary"].push_back(outputFileName);
-        out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
-        out << "H\tpvalue\n";
-        
-        //math goes here
+        DesignMap designMap(designfile);
+        //if user set classes set groups=those classes
+        if (classes != "") {
+            map<string, vector<string> > thisClasses = m->parseClasses(classes);
+            vector<string> groups = designMap.getNamesUnique(thisClasses);
+            if (groups.size() != 0) { m->setGroups(groups); }
+            else { m->mothurOut("[ERROR]: no groups meet your classes requirement, quitting.\n"); return 0; }
+        }
          
-        int N = m->getNumGroups();
-        double H;
-        double tmp = 0.0;
-        vector<groupRank> vec;
-        vector<string> groups = m->getGroups();
-        string group;
-        int count;
-        double sum;
-                
-        //merge all groups into a vector
+        //if user did not select class use first column
+        if (mclass == "") {  mclass = designMap.getDefaultClass(); m->mothurOut("\nYou did not provide a class, using " + mclass +".\n\n"); }
          
+        InputData input(sharedfile, "sharedfile");
+        vector<SharedRAbundVector*> lookup = input.getSharedRAbundVectors();
+        string lastLabel = lookup[0]->getLabel();
          
+        //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
+        set<string> processedLabels;
+        set<string> userLabels = labels;
          
-        //rank function here
-        assignRank(vec);
          
-        //populate counts and ranSums vectors
-        for (int i=0;i<N;i++) {
-            count = 0;
-            sum = 0;
-            group = groups[i];
-            for(int j;j<vec.size();j++) {
-                if (vec[j].group == group) {
-                    count++;
-                    sum = sum + vec[j].rank;
-                }
+        //as long as you are not at the end of the file or done wih the lines you want
+        while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
+            
+            if (m->control_pressed) { for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }  return 0; }
+            
+            if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){
+                
+                m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+                
+                process(lookup, designMap);
+                
+                processedLabels.insert(lookup[0]->getLabel());
+                userLabels.erase(lookup[0]->getLabel());
              }
-            counts[i] = count;
-            rankSums[i] = sum;
+            
+            if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+                string saveLabel = lookup[0]->getLabel();
+                
+                for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
+                lookup = input.getSharedRAbundVectors(lastLabel);
+                m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+                
+                process(lookup, designMap);
+                
+                processedLabels.insert(lookup[0]->getLabel());
+                userLabels.erase(lookup[0]->getLabel());
+                
+                //restore real lastlabel to save below
+                lookup[0]->setLabel(saveLabel);
+            }
+            
+            lastLabel = lookup[0]->getLabel();
+            //prevent memory leak
+            for (int i = 0; i < lookup.size(); i++) {  delete lookup[i]; lookup[i] = NULL; }
+            
+            if (m->control_pressed) { return 0; }
+            
+            //get next line to process
+            lookup = input.getSharedRAbundVectors();
          }
          
-        //test statistic
-        for (int i=0;i<N;i++) { tmp = tmp + (pow(rankSums[i],2) / counts[i]); }
-        
-        H = (12 / (N*(N+1))) * tmp - (3*(N+1));
-        
-        //ss = tmp - pow(accumulate(rankSums.begin(), rankSums.end(), 0), 2);
+        if (m->control_pressed) {  return 0; }
          
-        //H = ss / ( (N * (N + 1))/12 );
-        
-        //correction for ties?
+        //output error messages about any remaining user labels
+        set<string>::iterator it;
+        bool needToRun = false;
+        for (it = userLabels.begin(); it != userLabels.end(); it++) {
+            m->mothurOut("Your file does not include the label " + *it);
+            if (processedLabels.count(lastLabel) != 1) {
+                m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
+                needToRun = true;
+            }else {
+                m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
+            }
+        }
          
-        //p-value calculation
+        //run last label if you need to
+        if (needToRun == true)  {
+            for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }
+            lookup = input.getSharedRAbundVectors(lastLabel);
+            
+            m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+            process(lookup, designMap);
+            
+            for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
+        }
          
-               return 0;
-       }
+               
+        //output files created by command
+               m->mothurOutEndLine();
+               m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+               for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
+               m->mothurOutEndLine();
+        return 0;
+               
+    }
         catch(exception& e) {
                 m->errorOut(e, "KruskalWallisCommand", "execute");
                 exit(1);
         }
  }
  //**********************************************************************************************************************
-void KruskalWallisCommand::assignRank(vector<groupRank> &vec) {
-    try {
-        double rank = 1;
-        double numRanks, avgRank, j;
-        vector<groupRank>::iterator it, oldit;
-
-        sort (vec.begin(), vec.end(), comparevalue);
-
-        it = vec.begin();
-
-        while ( it != vec.end() ) {
-            j = rank;
-            oldit = it;
-            if (!equalvalue(*it, *(it+1))) {
-                (*it).rank = rank; 
-                rank = rank+1; 
-                it++; }
-            else {
-                while(equalrank(*it, *(it+1))) {
-                    j = j + (j+1);
-                    rank++;
-                    it++;
-                }
-                numRanks = double (distance(oldit, it));
-                avgRank = j / numRanks;
-                while(oldit != it) {
-                    (*oldit).rank = avgRank;
-                    oldit++;
-                }
-            }
  
+int KruskalWallisCommand::process(vector<SharedRAbundVector*>& lookup, DesignMap& designMap) {
+       try {
+        map<string, string> variables;
+        variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile));
+        variables["[distance]"] = lookup[0]->getLabel();
+               string outputFileName = getOutputFileName("kruskall-wallis",variables);
+        
+               ofstream out;
+               m->openOutputFile(outputFileName, out);
+               outputNames.push_back(outputFileName); outputTypes["kruskall-wallis"].push_back(outputFileName);
+        out << "OTULabel\tKW\tPvalue\n";
+        
+        int numBins = lookup[0]->getNumBins();
+        //sanity check to make sure each treatment has a group in the shared file
+        set<string> treatments;
+        for (int j = 0; j < lookup.size(); j++) {
+            string group = lookup[j]->getGroup();
+            string treatment = designMap.get(group, mclass); //get value for this group in this category
+            treatments.insert(treatment);
          }
+        if (treatments.size() < 2) { m->mothurOut("[ERROR]: need at least 2 things to classes to compare, quitting.\n"); m->control_pressed = true; }
          
-
+        LinearAlgebra linear;
+        for (int i = 0; i < numBins; i++) {
+            if (m->control_pressed) { break; }
+            
+            vector<spearmanRank> values;
+            for (int j = 0; j < lookup.size(); j++) {
+                string group = lookup[j]->getGroup();
+                string treatment = designMap.get(group, mclass); //get value for this group in this category
+                spearmanRank temp(treatment, lookup[j]->getAbundance(i));
+                values.push_back(temp);
+            }
+            
+            double pValue = 0.0;
+            double H = linear.calcKruskalWallis(values, pValue);
+            
+            //output H and signifigance
+            out << m->currentBinLabels[i] << '\t' << H << '\t' << pValue << endl;
+        }
+        out.close();
+                
+        return 0;
      }
-    catch(exception& e) {
-               m->errorOut(e, "KruskalWallisCommand", "getRank");
+       catch(exception& e) {
+               m->errorOut(e, "KruskalWallisCommand", "process");
                 exit(1);
         }
-    
-}
-//**********************************************************************************************************************
-void KruskalWallisCommand::assignValue(vector<groupRank> &vec) {
-    
  }
  //**********************************************************************************************************************
-//**********************************************************************************************************************
-//**********************************************************************************************************************
+
  
diff --git a/kruskalwalliscommand.h b/kruskalwalliscommand.h

index feefebd58fdb5650f41e9642a63e6dc68d011a24..a82d208cf15ea424413fdae4eee7c666868f4b35 100644 (file)
--- a/kruskalwalliscommand.h
+++ b/kruskalwalliscommand.h
@@ -10,8 +10,7 @@
  
  #include "command.hpp"
  #include "inputdata.h"
-#include "sharedrabundvector.h"
-
+#include "designmap.h"
  
  class KruskalWallisCommand : public Command {
     
@@ -22,11 +21,11 @@ public:
         ~KruskalWallisCommand(){}
         
         vector<string> setParameters();
-       string getCommandName()                 { return "kruskalwallis";                       }
+       string getCommandName()                 { return "kruskal.wallis";                      }
         string getCommandCategory()             { return "Hypothesis Testing";  }
      string getOutputPattern(string);   
         string getHelpString(); 
-       string getCitation() { return "http://www.mothur.org/wiki/kruskalwallis"; }
+       string getCitation() { return "http://www.mothur.org/wiki/Kruskal.wallis"; }
         string getDescription()         { return "Non-parametric method for testing whether samples originate from the same distribution."; }
      
      struct groupRank {
@@ -42,22 +41,12 @@ public:
      
      
  private:
-    string outputDir, sharedfile, groups;
-    bool abort;
+    bool abort, allLines;
+    string outputDir, sharedfile, designfile, mclass, classes;
+    vector<string> outputNames;
      set<string> labels;
-    vector<string> outputNames, Groups;
-    vector<int> counts;
-    vector<double> rankSums;
-    vector<double> rankMeans;
-    
-  
-        
-    static bool comparevalue(const groupRank &a, const groupRank &b) { return a.value < b.value; }
-    static bool equalvalue(const groupRank &a, const groupRank &b) { return a.value == b.value; }
-    static bool comparerank(const groupRank &a, const groupRank &b) { return a.rank < b.rank; }
-    static bool equalrank(const groupRank &a, const groupRank &b) { return a.rank == b.rank; }
-    static bool equalgroup(const groupRank &a, const groupRank &b) { return a.group == b.group; }
      
+    int process(vector<SharedRAbundVector*>&, DesignMap&);    
  };
  
  #endif /* KRUSKALWALLISCOMMAND_H */
diff --git a/lefsecommand.cpp b/lefsecommand.cpp

new file mode 100644 (file)

index 0000000..e894d57
--- /dev/null
+++ b/lefsecommand.cpp
@@ -0,0 +1,489 @@
+//
+//  lefsecommand.cpp
+//  Mothur
+//
+//  Created by SarahsWork on 6/12/13.
+//  Copyright (c) 2013 Schloss Lab. All rights reserved.
+//
+
+#include "lefsecommand.h"
+#include "linearalgebra.h"
+
+//**********************************************************************************************************************
+vector<string> LefseCommand::setParameters(){
+       try {
+        CommandParameter pdesign("design", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(pdesign);
+        CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none","summary",false,true,true); parameters.push_back(pshared);
+        CommandParameter pclass("class", "String", "", "", "", "", "","",false,false); parameters.push_back(pclass);
+        CommandParameter psubclass("subclass", "String", "", "", "", "", "","",false,false); parameters.push_back(psubclass);
+               CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel);
+               CommandParameter pclasses("classes", "String", "", "", "", "", "","",false,false); parameters.push_back(pclasses);
+        CommandParameter palpha("anova_alpha", "Number", "", "0.05", "", "", "","",false,false); parameters.push_back(palpha);
+        CommandParameter pwalpha("wilcoxon_alpha", "Number", "", "0.05", "", "", "","",false,false); parameters.push_back(pwalpha);
+        //every command must have inputdir and outputdir.  This allows mothur users to redirect input and output files.
+               CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
+               CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
+               
+               vector<string> myArray;
+               for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
+               return myArray;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "LefseCommand", "setParameters");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+string LefseCommand::getHelpString(){
+       try {
+               string helpString = "";
+               helpString += "The lefse command allows you to ....\n";
+               helpString += "The lefse command parameters are: shared, design, class, subclass, label, classes, wilcoxon_alpha, anova_alpha.\n";
+               helpString += "The class parameter is used to indicate the which category you would like used for the Kruskal Wallis analysis. If none is provided first category is used.\n";
+        helpString += "The subclass parameter is used to indicate the .....If none is provided second category is used, or if only one category subclass is ignored. \n";
+        helpString += "The classes parameter is used to indicate the classes you would like to use. Clases should be inputted in the following format: classes=label<value1|value2|value3>-label<value1|value2>. For example to include groups from treatment with value early or late and age= young or old.  class=treatment<Early|Late>-age<young|old>.\n";
+        helpString += "The label parameter is used to indicate which distances in the shared file you would like to use. labels are separated by dashes.\n";
+               helpString += "The lefse command should be in the following format: lefse(shared=final.an.shared, design=final.design, class=treatment, subclass=age).\n";
+        return helpString;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "LefseCommand", "getHelpString");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+string LefseCommand::getOutputPattern(string type) {
+    try {
+        string pattern = "";
+        
+        if (type == "summary") {  pattern = "[filename],[distance],lefse_summary"; }
+        else if (type == "kruskall-wallis") {  pattern = "[filename],[distance],kruskall_wallis"; }
+        else if (type == "wilcoxon") {  pattern = "[filename],[distance],wilcoxon"; }
+        else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
+        
+        return pattern;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "LefseCommand", "getOutputPattern");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+LefseCommand::LefseCommand(){
+       try {
+               abort = true; calledHelp = true;
+               setParameters();
+        vector<string> tempOutNames;
+               outputTypes["summary"] = tempOutNames;
+        outputTypes["kruskall-wallis"] = tempOutNames;
+        outputTypes["wilcoxon"] = tempOutNames;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "LefseCommand", "LefseCommand");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+LefseCommand::LefseCommand(string option)  {
+       try {
+               abort = false; calledHelp = false;
+        allLines = 1;
+               
+               //allow user to run help
+               if(option == "help") { help(); abort = true; calledHelp = true; }
+               else if(option == "citation") { citation(); abort = true; calledHelp = true;}
+               
+               else {
+                       //valid paramters for this command
+                       vector<string> myArray = setParameters();
+                       
+                       OptionParser parser(option);
+                       map<string,string> parameters = parser.getParameters();
+                       
+                       ValidParameters validParameter;
+                       map<string,string>::iterator it;
+                       //check to make sure all parameters are valid for command
+                       for (it = parameters.begin(); it != parameters.end(); it++) {
+                               if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
+                       }
+                       
+                       vector<string> tempOutNames;
+            outputTypes["summary"] = tempOutNames;
+            outputTypes["kruskall-wallis"] = tempOutNames;
+            outputTypes["wilcoxon"] = tempOutNames;
+            
+                       //if the user changes the input directory command factory will send this info to us in the output parameter
+                       string inputDir = validParameter.validFile(parameters, "inputdir", false);
+                       if (inputDir == "not found"){   inputDir = "";          }
+                       else {
+                
+                string path;
+                               it = parameters.find("design");
+                               //user has given a template file
+                               if(it != parameters.end()){
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["desing"] = inputDir + it->second;           }
+                               }
+                               
+                it = parameters.find("shared");
+                               //user has given a template file
+                               if(it != parameters.end()){
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["shared"] = inputDir + it->second;           }
+                               }
+            }
+                    
+            //get shared file, it is required
+                       sharedfile = validParameter.validFile(parameters, "shared", true);
+                       if (sharedfile == "not open") { sharedfile = ""; abort = true; }
+                       else if (sharedfile == "not found") {
+                               //if there is a current shared file, use it
+                               sharedfile = m->getSharedFile();
+                               if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); }
+                               else {  m->mothurOut("You have no current sharedfile and the shared parameter is required."); m->mothurOutEndLine(); abort = true; }
+                       }else { m->setSharedFile(sharedfile); }
+            
+            //get shared file, it is required
+                       designfile = validParameter.validFile(parameters, "design", true);
+                       if (designfile == "not open") { designfile = ""; abort = true; }
+                       else if (designfile == "not found") {
+                               //if there is a current shared file, use it
+                               designfile = m->getDesignFile();
+                               if (designfile != "") { m->mothurOut("Using " + designfile + " as input file for the design parameter."); m->mothurOutEndLine(); }
+                               else {  m->mothurOut("You have no current design file and the design parameter is required."); m->mothurOutEndLine(); abort = true; }
+                       }else { m->setDesignFile(designfile); }
+            
+            //if the user changes the output directory command factory will send this info to us in the output parameter
+                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){
+                               outputDir = m->hasPath(sharedfile); //if user entered a file with a path then preserve it
+                       }
+            
+            string label = validParameter.validFile(parameters, "label", false);
+                       if (label == "not found") { label = ""; }
+                       else {
+                               if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
+                               else { allLines = 1;  }
+                       }
+            
+            mclass = validParameter.validFile(parameters, "class", false);
+                       if (mclass == "not found") { mclass = ""; }
+                       
+            subclass = validParameter.validFile(parameters, "subclass", false);
+                       if (subclass == "not found") { subclass = ""; }
+            
+            classes = validParameter.validFile(parameters, "classes", false);
+                       if (classes == "not found") { classes = ""; }
+            
+            string temp = validParameter.validFile(parameters, "anova_alpha", false);
+                       if (temp == "not found") { temp = "0.05"; }
+                       m->mothurConvert(temp, anovaAlpha);
+            
+            temp = validParameter.validFile(parameters, "wilcoxon_alpha", false);
+                       if (temp == "not found") { temp = "0.05"; }
+                       m->mothurConvert(temp, wilcoxonAlpha);
+
+               }
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "LefseCommand", "LefseCommand");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+
+int LefseCommand::execute(){
+       try {
+               
+               if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
+        
+        DesignMap designMap(designfile);
+        //if user set classes set groups=those classes
+        if (classes != "") {
+            map<string, vector<string> > thisClasses = m->parseClasses(classes);
+            vector<string> groups = designMap.getNamesUnique(thisClasses);
+            if (groups.size() != 0) { m->setGroups(groups); }
+            else { m->mothurOut("[ERROR]: no groups meet your classes requirement, quitting.\n"); return 0; }
+        }
+        
+        //if user did not select class use first column
+        if (mclass == "") {  mclass = designMap.getDefaultClass(); m->mothurOut("\nYou did not provide a class, using " + mclass +".\n\n"); }
+        
+        InputData input(sharedfile, "sharedfile");
+        vector<SharedRAbundVector*> lookup = input.getSharedRAbundVectors();
+        string lastLabel = lookup[0]->getLabel();
+        
+        //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
+        set<string> processedLabels;
+        set<string> userLabels = labels;
+        
+        
+        //as long as you are not at the end of the file or done wih the lines you want
+        while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
+            
+            if (m->control_pressed) { for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }  return 0; }
+            
+            if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){
+                
+                m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+                
+                process(lookup, designMap);
+                
+                processedLabels.insert(lookup[0]->getLabel());
+                userLabels.erase(lookup[0]->getLabel());
+            }
+            
+            if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+                string saveLabel = lookup[0]->getLabel();
+                
+                for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
+                lookup = input.getSharedRAbundVectors(lastLabel);
+                m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+                
+                process(lookup, designMap);
+                
+                processedLabels.insert(lookup[0]->getLabel());
+                userLabels.erase(lookup[0]->getLabel());
+                
+                //restore real lastlabel to save below
+                lookup[0]->setLabel(saveLabel);
+            }
+            
+            lastLabel = lookup[0]->getLabel();
+            //prevent memory leak
+            for (int i = 0; i < lookup.size(); i++) {  delete lookup[i]; lookup[i] = NULL; }
+            
+            if (m->control_pressed) { return 0; }
+            
+            //get next line to process
+            lookup = input.getSharedRAbundVectors();
+        }
+        
+        if (m->control_pressed) {  return 0; }
+        
+        //output error messages about any remaining user labels
+        set<string>::iterator it;
+        bool needToRun = false;
+        for (it = userLabels.begin(); it != userLabels.end(); it++) {
+            m->mothurOut("Your file does not include the label " + *it);
+            if (processedLabels.count(lastLabel) != 1) {
+                m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
+                needToRun = true;
+            }else {
+                m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
+            }
+        }
+        
+        //run last label if you need to
+        if (needToRun == true)  {
+            for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }
+            lookup = input.getSharedRAbundVectors(lastLabel);
+            
+            m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+            process(lookup, designMap);
+            
+            for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
+        }
+                
+               
+        //output files created by command
+               m->mothurOutEndLine();
+               m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+               for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
+               m->mothurOutEndLine();
+        return 0;
+               
+    }
+       catch(exception& e) {
+               m->errorOut(e, "LefseCommand", "execute");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+
+int LefseCommand::process(vector<SharedRAbundVector*>& lookup, DesignMap& designMap) {
+       try {
+        //run kruskal wallis on each otu
+        vector<int> significantOtuLabels = runKruskalWallis(lookup, designMap);
+        
+        //check for subclass
+        if (subclass != "") {  significantOtuLabels = runWilcoxon(lookup, designMap, significantOtuLabels);  }
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "LefseCommand", "process");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+
+vector<int> LefseCommand::runKruskalWallis(vector<SharedRAbundVector*>& lookup, DesignMap& designMap) {
+       try {
+        map<string, string> variables;
+        variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile));
+        variables["[distance]"] = lookup[0]->getLabel();
+               string outputFileName = getOutputFileName("kruskall-wallis",variables);
+        
+               ofstream out;
+               m->openOutputFile(outputFileName, out);
+               outputNames.push_back(outputFileName); outputTypes["kruskall-wallis"].push_back(outputFileName);
+        out << "OTULabel\tKW\tPvalue\n";
+        
+        vector<int> significantOtuLabels;
+        int numBins = lookup[0]->getNumBins();
+        //sanity check to make sure each treatment has a group in the shared file
+        set<string> treatments;
+        for (int j = 0; j < lookup.size(); j++) {
+            string group = lookup[j]->getGroup();
+            string treatment = designMap.get(group, mclass); //get value for this group in this category
+            treatments.insert(treatment);
+        }
+        if (treatments.size() < 2) { m->mothurOut("[ERROR]: need at least 2 things to classes to compare, quitting.\n"); m->control_pressed = true; }
+        
+        LinearAlgebra linear;
+        for (int i = 0; i < numBins; i++) {
+            if (m->control_pressed) { break; }
+            
+            vector<spearmanRank> values;
+            for (int j = 0; j < lookup.size(); j++) {
+                string group = lookup[j]->getGroup();
+                string treatment = designMap.get(group, mclass); //get value for this group in this category
+                spearmanRank temp(treatment, lookup[j]->getAbundance(i));
+                values.push_back(temp);
+            }
+            
+            double pValue = 0.0;
+            double H = linear.calcKruskalWallis(values, pValue);
+            
+            //output H and signifigance
+            out << m->currentBinLabels[i] << '\t' << H << '\t' << pValue << endl;
+            
+            if (pValue < anovaAlpha) {  significantOtuLabels.push_back(i);  }
+        }
+        out.close();
+        
+        return significantOtuLabels;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "LefseCommand", "runKruskalWallis");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+
+vector<int> LefseCommand::runWilcoxon(vector<SharedRAbundVector*>& lookup, DesignMap& designMap, vector<int> bins) {
+    try {
+        vector<int> significantOtuLabels;
+        //if it exists and meets the following requirements run Wilcoxon
+        /*
+         1. Subclass members all belong to same main class
+         2. Number of groups in each subclass is the same
+         3. anything else??
+         
+         */
+        vector<string> subclasses;
+        map<string, string> subclass2Class;
+        map<string, int> subclassCounts;
+        map<string, vector<int> > subClass2GroupIndex; //maps subclass name to vector of indexes in lookup from that subclass. old -> 1,2,3 means groups in location 1,2,3 of lookup are from old.  Saves time below.
+        bool error = false;
+        for (int j = 0; j < lookup.size(); j++) {
+            string group = lookup[j]->getGroup();
+            string treatment = designMap.get(group, mclass); //get value for this group in this category
+            string thisSub = designMap.get(group, subclass);
+            map<string, string>::iterator it = subclass2Class.find(thisSub);
+            if (it == subclass2Class.end()) {
+                subclass2Class[thisSub] = treatment;
+                subclassCounts[thisSub] = 1;
+                vector<int> temp; temp.push_back(j);
+                subClass2GroupIndex[thisSub] = temp;
+            }
+            else {
+                subclassCounts[thisSub]++;
+                subClass2GroupIndex[thisSub].push_back(j);
+                if (it->second != treatment) {
+                    error = true;
+                    m->mothurOut("[ERROR]: subclass " + thisSub + " has members in " + it->second + " and " + treatment + ". Subclass members must be from the same class. Ignoring wilcoxon.\n");
+                }
+            }
+        }
+        
+        if (error) { return significantOtuLabels; }
+        else { //check counts to make sure subclasses are the same size
+            set<int> counts;
+            for (map<string, int>::iterator it = subclassCounts.begin(); it != subclassCounts.end(); it++) { counts.insert(it->second); }
+            if (counts.size() > 1) { m->mothurOut("[ERROR]: subclasses must be the same size. Ignoring wilcoxon.\n");
+                return significantOtuLabels;  }
+        }
+        
+        int numBins = lookup[0]->getNumBins();
+        vector<compGroup> comp;
+        //find comparisons and fill comp
+        map<string, int>::iterator itB;
+        for(map<string, int>::iterator it=subclassCounts.begin();it!=subclassCounts.end();it++){
+            itB = it;itB++;
+            for(itB;itB!=subclassCounts.end();itB++){
+                compGroup temp(it->first,itB->first);
+                comp.push_back(temp);
+            }                  
+        }
+
+        int numComp = comp.size();
+        if (numComp < 2) {  m->mothurOut("[ERROR]: Need at least 2 subclasses, Ignoring Wilcoxon.\n");
+            return significantOtuLabels;  }
+        
+        map<string, string> variables;
+        variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile));
+        variables["[distance]"] = lookup[0]->getLabel();
+        string outputFileName = getOutputFileName("wilcoxon",variables);
+        
+        ofstream out;
+        m->openOutputFile(outputFileName, out);
+        outputNames.push_back(outputFileName); outputTypes["wilcoxon"].push_back(outputFileName);
+        out << "OTULabel\tComparision\tWilcoxon\tPvalue\n";
+        
+        LinearAlgebra linear;
+        for (int i = 0; i < numBins; i++) {
+            if (m->control_pressed) { break; }
+            
+            if (m->inUsersGroups(i, bins)) { //flagged in Kruskal Wallis
+                
+                bool sig = false;
+                //for each subclass comparision
+                for (int j = 0; j < numComp; j++) {
+                    //fill x and y with this comparisons data
+                    vector<double> x; vector<double> y;
+                    
+                    //fill x and y
+                    vector<int> xIndexes = subClass2GroupIndex[comp[j].group1]; //indexes in lookup for this subclass
+                    for (int k = 0; k < xIndexes.size(); k++) { x.push_back(lookup[xIndexes[k]]->getAbundance(i)); }
+                    
+                    vector<int> yIndexes = subClass2GroupIndex[comp[j].group2]; //indexes in lookup for this subclass
+                    for (int k = 0; k < yIndexes.size(); k++) { y.push_back(lookup[yIndexes[k]]->getAbundance(i)); }
+                    
+                    double pValue = 0.0;
+                    double H = linear.calcWilcoxon(x, y, pValue);
+            
+                    //output H and signifigance
+                    out << m->currentBinLabels[i] << '\t' << comp[j].getCombo() << '\t' << H << '\t' << pValue << endl;
+                    
+                    //set sig - not sure how yet
+                }
+                if (sig) {  significantOtuLabels.push_back(i);  }
+            }
+        }
+        out.close();
+        
+        return significantOtuLabels;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "LefseCommand", "runWilcoxon");
+        exit(1);
+    }
+}
+
+//**********************************************************************************************************************
+
+
diff --git a/lefsecommand.h b/lefsecommand.h

new file mode 100644 (file)

index 0000000..c9c4092
--- /dev/null
+++ b/lefsecommand.h
@@ -0,0 +1,68 @@
+//
+//  lefsecommand.h
+//  Mothur
+//
+//  Created by SarahsWork on 6/12/13.
+//  Copyright (c) 2013 Schloss Lab. All rights reserved.
+//
+
+#ifndef __Mothur__lefsecommand__
+#define __Mothur__lefsecommand__
+
+#include "command.hpp"
+
+/* 
+ Columns = groups, rows are OTUs, class = design
+ 
+ From http://huttenhower.sph.harvard.edu/galaxy/root?tool_id=lefse_upload
+ Input data consist of a collection of m samples (columns) each made up of n numerical features (rows, typically normalized per-sample, red representing high values and green low). These samples are labeled with a class (taking two or more possible values) that represents the main biological hypothesis under investigation; they may also have one or more subclass labels reflecting within-class groupings.
+ 
+ Step 1: the Kruskall-Wallis test analyzes all features, testing whether the values in different classes are differentially distributed. Features violating the null hypothesis are further analyzed in Step 2.
+ Step 2: the pairwise Wilcoxon test checks whether all pairwise comparisons between subclasses within different classes significantly agree with the class level trend.
+ Step 3: the resulting subset of vectors is used to build a Linear Discriminant Analysis model from which the relative difference among classes is used to rank the features. The final output thus consists of a list of features that are discriminative with respect to the classes, consistent with the subclass grouping within classes, and ranked according to the effect size with which they differentiate classes.
+*/
+
+
+#include "command.hpp"
+#include "inputdata.h"
+#include "designmap.h"
+
+/**************************************************************************************************/
+
+class LefseCommand : public Command {
+public:
+    LefseCommand(string);
+    LefseCommand();
+    ~LefseCommand(){}
+    
+    vector<string> setParameters();
+    string getCommandName()                    { return "lefse";                       }
+    string getCommandCategory()                { return "OTU-Based Approaches";                }
+    
+    string getOutputPattern(string);
+       string getHelpString();
+    string getCitation() { return "http://www.mothur.org/wiki/Lefse"; }
+    string getDescription()            { return "brief description"; }
+    
+    int execute();
+    void help() { m->mothurOut(getHelpString()); }
+    
+private:
+    bool abort, allLines;
+    string outputDir, sharedfile, designfile, mclass, subclass, classes;
+    vector<string> outputNames;
+    set<string> labels;
+    float anovaAlpha, wilcoxonAlpha;
+    
+    int process(vector<SharedRAbundVector*>&, DesignMap&);
+    vector<int> runKruskalWallis(vector<SharedRAbundVector*>&, DesignMap&);
+    vector<int> runWilcoxon(vector<SharedRAbundVector*>&, DesignMap&, vector<int>);
+
+};
+
+/**************************************************************************************************/
+
+
+
+
+#endif /* defined(__Mothur__lefsecommand__) */
diff --git a/linearalgebra.cpp b/linearalgebra.cpp

index e1b9470fdb218d94fd9f5fe9f36ccb23ac31ea8a..e4eaf481b843bd0ef27c41e77bd726b8adac2397 100644 (file)
--- a/linearalgebra.cpp
+++ b/linearalgebra.cpp
@@ -805,7 +805,80 @@ double LinearAlgebra::calcSpearman(vector< vector<double> >& euclidDists, vector
                 exit(1);
         }
  }
-
+/*********************************************************************************************************************************/
+double LinearAlgebra::calcKruskalWallis(vector<spearmanRank>& values, double& pValue){
+       try {
+        double H;
+        set<string> treatments;
+        
+        //rank values
+        sort(values.begin(), values.end(), compareSpearman);
+        vector<spearmanRank*> ties;
+        int rankTotal = 0;
+        vector<int> TIES;
+        for (int j = 0; j < values.size(); j++) {
+            treatments.insert(values[j].name);
+            rankTotal += (j+1);
+            ties.push_back(&(values[j]));
+            
+            if (j != values.size()-1) { // you are not the last so you can look ahead
+                if (values[j].score != values[j+1].score) { // you are done with ties, rank them and continue
+                    if (ties.size() > 1) { TIES.push_back(ties.size()); }
+                    for (int k = 0; k < ties.size(); k++) {
+                        double thisrank = rankTotal / (double) ties.size();
+                        (*ties[k]).score = thisrank;
+                    }
+                    ties.clear();
+                    rankTotal = 0;
+                }
+            }else { // you are the last one
+                if (ties.size() > 1) { TIES.push_back(ties.size()); }
+                for (int k = 0; k < ties.size(); k++) {
+                    double thisrank = rankTotal / (double) ties.size();
+                    (*ties[k]).score = thisrank;
+                }
+            }
+        }
+        
+        
+        // H = 12/(N*(N+1)) * (sum Ti^2/n) - 3(N+1)
+        map<string, double> sums;
+        map<string, double> counts;
+        for (set<string>::iterator it = treatments.begin(); it != treatments.end(); it++) { sums[*it] = 0.0; counts[*it] = 0; }
+        
+        for (int j = 0; j < values.size(); j++) {
+            sums[values[j].name] += values[j].score;
+            counts[values[j].name]+= 1.0;
+        }
+        
+        double middleTerm = 0.0;
+        for (set<string>::iterator it = treatments.begin(); it != treatments.end(); it++) {
+            middleTerm += ((sums[*it]*sums[*it])/counts[*it]);
+        }
+        
+        double firstTerm = 12 / (double) (values.size()*(values.size()+1));
+        double lastTerm = 3 * (values.size()+1);
+        
+        H = firstTerm * middleTerm - lastTerm;
+        
+        //adjust for ties
+        if (TIES.size() != 0) {
+            double sum = 0.0;
+            for (int j = 0; j < TIES.size(); j++) { sum += ((TIES[j]*TIES[j]*TIES[j])-TIES[j]); }
+            double result = 1.0 - (sum / (double) ((values.size()*values.size()*values.size())-values.size()));
+            H /= result;
+        }
+        
+        //Numerical Recipes pg221
+        pValue = 1.0 - (gammp(((treatments.size()-1)/(double)2.0), H/2.0));
+        
+        return H;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "LinearAlgebra", "calcKruskalWallis");
+               exit(1);
+       }
+}
  /*********************************************************************************************************************************/
  //assumes both matrices are square and the same size
  double LinearAlgebra::calcKendall(vector< vector<double> >& euclidDists, vector< vector<double> >& userDists){
@@ -1228,7 +1301,73 @@ double LinearAlgebra::calcKendallSig(double n, double r){
                 exit(1);
         }
  }
+/*********************************************************************************************************************************/
+double LinearAlgebra::calcWilcoxon(vector<double>& x, vector<double>& y, double& sig){
+       try {
+               if (x.size() != y.size()) { m->mothurOut("[ERROR]: vector size mismatch."); m->mothurOutEndLine(); return 0.0; }
+               
+               double W = 0.0;
+        sig = 0.0;
+        
+        vector<double> signPairs;
+        vector<spearmanRank> absV;
+        for (int i = 0; i < x.size(); i++) {
+            if (m->control_pressed) { return W; }
+            double temp = y[i]-x[i];
+            double signV = sign(temp);
+            if (signV != 0) { // exclude zeros
+                spearmanRank member(toString(i), abs(temp));
+                absV.push_back(member);
+                signPairs.push_back(signV);
+            }
+        }
+        
+        //rank absV
+        //sort xscores
+               sort(absV.begin(), absV.end(), compareSpearman);
+               
+               //convert scores to ranks of x
+               vector<spearmanRank*> ties;
+               int rankTotal = 0;
+               for (int j = 0; j < absV.size(); j++) {
+            if (m->control_pressed) { return W; }
+                       rankTotal += (j+1);
+                       ties.push_back(&(absV[j]));
+            
+                       if (j != absV.size()-1) { // you are not the last so you can look ahead
+                               if (absV[j].score != absV[j+1].score) { // you are done with ties, rank them and continue
+                                       for (int k = 0; k < ties.size(); k++) {
+                                               float thisrank = rankTotal / (float) ties.size();
+                                               (*ties[k]).score = thisrank;
+                                       }
+                                       ties.clear();
+                                       rankTotal = 0;
+                               }
+                       }else { // you are the last one
+                               for (int k = 0; k < ties.size(); k++) {
+                                       float thisrank = rankTotal / (float) ties.size();
+                                       (*ties[k]).score = thisrank;
+                               }
+                       }
+               }
  
+        //sum ranks times sign
+        for (int i = 0; i < absV.size(); i++) {
+            if (m->control_pressed) { return W; }
+            W += (absV[i].score*signPairs[i]);
+        }
+        W = abs(W);
+        
+        //find zScore
+        cout << "still need to find sig!!" << endl;
+        
+               return W;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "LinearAlgebra", "calcWilcoxon");
+               exit(1);
+       }
+}
  /*********************************************************************************************************************************/
  double LinearAlgebra::calcSpearman(vector<double>& x, vector<double>& y, double& sig){
         try {
diff --git a/linearalgebra.h b/linearalgebra.h

index e0933ee52564915d35bcf6e9f91f73c2cea80d2d..5819ea85d8deefba15ca1c45ccf03efd21c9321a 100644 (file)
--- a/linearalgebra.h
+++ b/linearalgebra.h
@@ -29,6 +29,8 @@ public:
         double calcPearson(vector<vector<double> >&, vector<vector<double> >&);
         double calcSpearman(vector<vector<double> >&, vector<vector<double> >&);
         double calcKendall(vector<vector<double> >&, vector<vector<double> >&);
+    double calcKruskalWallis(vector<spearmanRank>&, double&);
+    double calcWilcoxon(vector<double>&, vector<double>&, double&);
         
         double calcPearson(vector<double>&, vector<double>&, double&);
         double calcSpearman(vector<double>&, vector<double>&, double&);
@@ -41,7 +43,7 @@ public:
      vector<double> solveEquations(vector<vector<double> >, vector<double>);
      vector<float> solveEquations(vector<vector<float> >, vector<float>);
      vector<vector<double> > getInverse(vector<vector<double> >);
-
+    
      
  private:
         MothurOut* m;
@@ -50,11 +52,11 @@ private:
      double betacf(const double, const double, const double);
      double betai(const double, const double, const double);
      double gammln(const double);
-    double gammp(const double, const double);
      double gammq(const double, const double);
      double gser(double&, const double, const double, double&);
      double gcf(double&, const double, const double, double&);
      double erfcc(double);
+    double gammp(const double, const double);
      
      double ran0(int&); //for testing 
      double ran1(int&); //for testing
diff --git a/macros.h b/macros.h

index f95acbed9864506e08041765107339125e7fb273..a9fc627d6ae774e7e151effbc5d700ae72bbe6f6 100755 (executable)
--- a/macros.h
+++ b/macros.h
@@ -6,8 +6,8 @@
  //  Copyright (c) 2012 Schloss Lab. All rights reserved.
  //
  
-#ifndef rrf_fs_prototype_macros_h
-#define rrf_fs_prototype_macros_h
+#ifndef RF_MACROS_H
+#define RF_MACROS_H
  
  #include "mothurout.h" 
  
diff --git a/makecontigscommand.cpp b/makecontigscommand.cpp

index 85b6a8fdcbacf1ca1ee5b471b2142480ecf5da48..f888fbe4928a9526d34e9b8f25200fb32a329f6f 100644 (file)
--- a/makecontigscommand.cpp
+++ b/makecontigscommand.cpp
@@ -1541,6 +1541,7 @@ vector< vector<string> > MakeContigsCommand::readFileNames(string filename){
              }
              m->gobble(in);
              
+            if (m->debug) { m->mothurOut("[DEBUG]: group = " + group + ", forward = " + forward + ", reverse = " + reverse + ".\n"); }
              
              //check to make sure both are able to be opened
              ifstream in2;
@@ -1676,6 +1677,8 @@ bool MakeContigsCommand::getOligos(vector<vector<string> >& fastaFileNames, stri
                      }
                      //roligo = reverseOligo(roligo);
                      
+                    if (m->debug) { m->mothurOut("[DEBUG]: reading - " + roligo + ".\n"); }
+                    
                      group = "";
                      
                                         // get rest of line in case there is a primer name
@@ -1687,6 +1690,8 @@ bool MakeContigsCommand::getOligos(vector<vector<string> >& fastaFileNames, stri
                                         } 
                      
                      oligosPair newPrimer(foligo, roligo);
+                    
+                    if (m->debug) { m->mothurOut("[DEBUG]: primer pair " + newPrimer.forward + " " + newPrimer.reverse + ", and group = " + group + ".\n"); }
                                         
                                         //check for repeat barcodes
                      string tempPair = foligo+roligo;
diff --git a/makefile b/makefile

index 3ce1686327126b12c45afb51599b2e5a066e19aa..163db6bd0c547f2b73055ab5b4498dbd82ebfea3 100644 (file)
--- a/makefile
+++ b/makefile
@@ -15,8 +15,8 @@ USEREADLINE ?= yes
  CYGWIN_BUILD ?= no
  USECOMPRESSION ?= no
  MOTHUR_FILES="\"Enter_your_default_path_here\""
-RELEASE_DATE = "\"2/12/2013\""
-VERSION = "\"1.29.2\""
+RELEASE_DATE = "\"5/29/2013\""
+VERSION = "\"1.31.1\""
  FORTAN_COMPILER = gfortran
  FORTRAN_FLAGS = 
  
diff --git a/makelefsecommand.cpp b/makelefsecommand.cpp

new file mode 100644 (file)

index 0000000..dd44249
--- /dev/null
+++ b/makelefsecommand.cpp
@@ -0,0 +1,516 @@
+//
+//  makelefse.cpp
+//  Mothur
+//
+//  Created by SarahsWork on 6/3/13.
+//  Copyright (c) 2013 Schloss Lab. All rights reserved.
+//
+
+#include "makelefsecommand.h"
+#include "designmap.h"
+
+//**********************************************************************************************************************
+vector<string> MakeLefseCommand::setParameters(){
+       try {
+        CommandParameter pshared("shared", "InputTypes", "", "", "SharedRel", "SharedRel", "none","lefse",false,false,true); parameters.push_back(pshared);
+               CommandParameter prelabund("relabund", "InputTypes", "", "", "SharedRel", "SharedRel", "none","lefse",false,false,true); parameters.push_back(prelabund);
+        CommandParameter pconstaxonomy("constaxonomy", "InputTypes", "", "", "none", "none", "none","",false,false,false); parameters.push_back(pconstaxonomy);
+        CommandParameter pdesign("design", "InputTypes", "", "", "none", "none", "none","",false,false, true); parameters.push_back(pdesign);
+        CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups);
+        CommandParameter pscale("scale", "Multiple", "totalgroup-totalotu-averagegroup-averageotu", "totalgroup", "", "", "","",false,false); parameters.push_back(pscale);
+               CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel);
+        CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
+               CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
+               
+               vector<string> myArray;
+               for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
+               return myArray;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "MakeLefseCommand", "setParameters");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+string MakeLefseCommand::getHelpString(){
+       try {
+               string helpString = "";
+               helpString += "The make.lefse command allows you to create a lefse formatted input file from mothur's output files.\n";
+               helpString += "The make.lefse command parameters are: shared, relabund, constaxonomy, design, scale, groups and label.  The shared or relabund are required.\n";
+               helpString += "The shared parameter is used to input your shared file, http://www.wiki.mothur.org/wiki/Shared_file.\n";
+        helpString += "The relabund parameter is used to input your relabund file, http://www.wiki.mothur.org/wiki/Relabund_file.\n";
+        helpString += "The design parameter is used to input your design file, http://www.wiki.mothur.org/wiki/Design_File.\n";
+        helpString += "The constaxonomy parameter is used to input your taxonomy file. http://www.wiki.mothur.org/wiki/Constaxonomy_file. The contaxonomy file is the taxonomy file outputted by classify.otu(list=yourListfile, taxonomy=yourTaxonomyFile). Be SURE that the you are the constaxonomy file distance matches the shared file distance.  ie, for *.0.03.cons.taxonomy set label=0.03. Mothur is smart enough to handle shared files that have been subsampled. \n";
+        helpString += "The scale parameter allows you to select what scale you would like to use to convert your shared file abundances to relative abundances. Choices are totalgroup, totalotu, averagegroup, averageotu, default is totalgroup.\n";
+               helpString += "The label parameter allows you to select what distance level you would like used, if none is given the first distance is used.\n";
+               helpString += "The make.lefse command should be in the following format: make.lefse(list=yourListFile, taxonomy=outputFromClassify.seqsCommand, name=yourNameFile)\n";
+               helpString += "make.lefse(shared=final.an.list, taxonomy=final.an.taxonomy, name=final.names)\n";
+               return helpString;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "MakeLefseCommand", "getHelpString");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+string MakeLefseCommand::getOutputPattern(string type) {
+    try {
+        string pattern = "";
+        
+        if (type == "lefse") {  pattern = "[filename],[distance],lefse"; }
+        else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
+        
+        return pattern;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "MakeLefseCommand", "getOutputPattern");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+MakeLefseCommand::MakeLefseCommand(){
+       try {
+               abort = true; calledHelp = true;
+               setParameters();
+        vector<string> tempOutNames;
+               outputTypes["lefse"] = tempOutNames; 
+                       }
+       catch(exception& e) {
+               m->errorOut(e, "MakeLefseCommand", "MakeLefseCommand");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+MakeLefseCommand::MakeLefseCommand(string option)  {
+       try {
+               abort = false; calledHelp = false;
+               
+               //allow user to run help
+               if(option == "help") { help(); abort = true; calledHelp = true; }
+               else if(option == "citation") { citation(); abort = true; calledHelp = true;}
+               
+               else {
+                       //valid paramters for this command
+                       vector<string> myArray = setParameters();
+                       
+                       OptionParser parser(option);
+                       map<string,string> parameters = parser.getParameters();
+                       
+                       ValidParameters validParameter;
+                       map<string,string>::iterator it;
+                       //check to make sure all parameters are valid for command
+                       for (it = parameters.begin(); it != parameters.end(); it++) {
+                               if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
+                       }
+                       
+                       vector<string> tempOutNames;
+            outputTypes["lefse"] = tempOutNames;
+            
+                       //if the user changes the input directory command factory will send this info to us in the output parameter
+                       string inputDir = validParameter.validFile(parameters, "inputdir", false);
+                       if (inputDir == "not found"){   inputDir = "";          }
+                       else {
+                string path;
+                               it = parameters.find("shared");
+                               //user has given a template file
+                               if(it != parameters.end()){
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["shared"] = inputDir + it->second;           }
+                               }
+                               
+                               it = parameters.find("design");
+                               //user has given a template file
+                               if(it != parameters.end()){
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["design"] = inputDir + it->second;           }
+                               }
+                               
+                               it = parameters.find("constaxonomy");
+                               //user has given a template file
+                               if(it != parameters.end()){
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["constaxonomy"] = inputDir + it->second;             }
+                               }
+                
+                it = parameters.find("relabund");
+                               //user has given a template file
+                               if(it != parameters.end()){
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["relabund"] = inputDir + it->second;         }
+                               }
+            }
+                    
+                       //check for parameters
+                       designfile = validParameter.validFile(parameters, "design", true);
+                       if (designfile == "not open") { abort = true; }
+                       else if (designfile == "not found") { designfile = ""; }
+                       else { m->setDesignFile(designfile); }
+            
+            sharedfile = validParameter.validFile(parameters, "shared", true);
+                       if (sharedfile == "not open") { abort = true; }
+                       else if (sharedfile == "not found") { sharedfile = ""; }
+                       else {  m->setSharedFile(sharedfile); }
+                       
+                       relabundfile = validParameter.validFile(parameters, "relabund", true);
+                       if (relabundfile == "not open") { abort = true; }
+                       else if (relabundfile == "not found") { relabundfile = ""; }
+                       else {  m->setRelAbundFile(relabundfile); }
+            
+            constaxonomyfile = validParameter.validFile(parameters, "constaxonomy", true);
+                       if (constaxonomyfile == "not open") { constaxonomyfile = ""; abort = true; }
+                       else if (constaxonomyfile == "not found") {  constaxonomyfile = "";  }
+                       
+                       label = validParameter.validFile(parameters, "label", false);
+                       if (label == "not found") { label = ""; m->mothurOut("You did not provide a label, I will use the first label in your inputfile."); m->mothurOutEndLine(); label=""; }
+            
+            string groups = validParameter.validFile(parameters, "groups", false);
+                       if (groups == "not found") { groups = "";  }
+                       else {
+                               m->splitAtDash(groups, Groups);
+                               m->setGroups(Groups);
+                       }
+
+
+            if ((relabundfile == "") && (sharedfile == "")) {
+                               //is there are current file available for either of these?
+                               //give priority to shared, then relabund
+                               sharedfile = m->getSharedFile();
+                               if (sharedfile != "") {  m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); }
+                               else {
+                                       relabundfile = m->getRelAbundFile();
+                                       if (relabundfile != "") {   m->mothurOut("Using " + relabundfile + " as input file for the relabund parameter."); m->mothurOutEndLine(); }
+                                       else {
+                                               m->mothurOut("No valid current files. You must provide a shared or relabund."); m->mothurOutEndLine();
+                                               abort = true;
+                                       }
+                               }
+                       }
+                       
+                       if ((relabundfile != "") && (sharedfile != "")) { m->mothurOut("[ERROR]: You may not use both a shared and relabund file."); m->mothurOutEndLine(); abort = true;  }
+                       
+            //if the user changes the output directory command factory will send this info to us in the output parameter
+                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){
+                               outputDir = "";                         }
+            
+            scale = validParameter.validFile(parameters, "scale", false);                              if (scale == "not found") { scale = "totalgroup"; }
+                       
+                       if ((scale != "totalgroup") && (scale != "totalotu") && (scale != "averagegroup") && (scale != "averageotu")) {
+                               m->mothurOut(scale + " is not a valid scaling option for the get.relabund command. Choices are totalgroup, totalotu, averagegroup, averageotu."); m->mothurOutEndLine(); abort = true;
+                       }
+        }
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "MakeLefseCommand", "MakeLefseCommand");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+
+int MakeLefseCommand::execute(){
+       try {
+               
+               if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
+      
+        map<string, consTax2> consTax;
+        if (constaxonomyfile != "") {  m->readConsTax(constaxonomyfile, consTax);  }
+        
+        if (m->control_pressed) { return 0; }
+        
+        if (sharedfile != "") {
+            inputFile = sharedfile;
+            vector<SharedRAbundFloatVector*> lookup = getSharedRelabund();
+            runRelabund(consTax, lookup);
+        }else {
+            inputFile = relabundfile;
+            vector<SharedRAbundFloatVector*> lookup = getRelabund();
+            runRelabund(consTax, lookup);
+        }
+        
+        if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {       m->mothurRemove(outputNames[i]); } return 0; }
+               
+        //output files created by command
+               m->mothurOutEndLine();
+               m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+               for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
+               m->mothurOutEndLine();
+        return 0;
+               
+    }
+       catch(exception& e) {
+               m->errorOut(e, "MakeLefseCommand", "execute");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+int MakeLefseCommand::runRelabund(map<string, consTax2>& consTax, vector<SharedRAbundFloatVector*>& lookup){
+       try {
+        if (outputDir == "") { outputDir = m->hasPath(inputFile); }
+        map<string, string> variables;
+        variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(inputFile));
+        variables["[distance]"] = lookup[0]->getLabel();
+               string outputFile = getOutputFileName("lefse",variables);
+               outputNames.push_back(outputFile); outputTypes["lefse"].push_back(outputFile);
+        
+        ofstream out;
+        m->openOutputFile(outputFile, out);
+
+        DesignMap* designMap = NULL;
+        if (designfile != "") {
+            designMap = new DesignMap(designfile);
+            vector<string> categories = designMap->getNamesOfCategories();
+            
+            for (int j = 0; j < categories.size(); j++) {
+                out << categories[j] << "\t";
+                for (int i = 0; i < lookup.size(); i++) {
+                    if (m->control_pressed) { out.close(); delete designMap; return 0; }
+                    string value = designMap->get(lookup[i]->getGroup(), categories[j]);
+                    if (value == "not found") {
+                        m->mothurOut("[ERROR]: " + lookup[i]->getGroup() + " is not in your design file, please correct.\n"); m->control_pressed = true;
+                    }else { out << value << '\t'; }
+                }
+                out << endl;
+            }
+        }
+        
+        out << "group\t";
+        for (int i = 0; i < lookup.size(); i++) {  out << lookup[i]->getGroup() << '\t'; }
+        out << endl;
+        
+        for (int i = 0; i < lookup[0]->getNumBins(); i++) { //process each otu
+            if (m->control_pressed) { break; }
+            string nameOfOtu = m->currentBinLabels[i];
+            if (constaxonomyfile != "") { //try to find the otuName in consTax to replace with consensus taxonomy
+                map<string, consTax2>::iterator it = consTax.find(nameOfOtu);
+                if (it != consTax.end()) {
+                    nameOfOtu = it->second.taxonomy;
+                    //add sanity check abundances here??
+                    string fixedName = "";
+                    //remove confidences and change ; to |
+                    m->removeConfidences(nameOfOtu);
+                    for (int j = 0; j < nameOfOtu.length()-1; j++) {
+                        if (nameOfOtu[j] == ';') { fixedName += "_" + m->currentBinLabels[i] + '|'; }
+                        else { fixedName += nameOfOtu[j]; }
+                    }
+                    nameOfOtu = fixedName;
+                }else {
+                    m->mothurOut("[ERROR]: can't find " + nameOfOtu + " in constaxonomy file. Do the distances match, did you forget to use the label parameter?\n"); m->control_pressed = true;
+                }
+                
+            }
+            //print name
+            out << nameOfOtu << '\t';
+            
+            //print out relabunds for each otu
+            for (int j = 0; j < lookup.size(); j++) { out << lookup[j]->getAbundance(i) << '\t'; }
+            out << endl;
+        }
+        out.close();
+
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "MakeLefseCommand", "execute");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+vector<SharedRAbundFloatVector*> MakeLefseCommand::getSharedRelabund(){
+       try {
+               InputData input(sharedfile, "sharedfile");
+               vector<SharedRAbundVector*> templookup = input.getSharedRAbundVectors();
+               string lastLabel = templookup[0]->getLabel();
+        vector<SharedRAbundFloatVector*> lookup;
+               
+               if (label == "") { label = lastLabel;  }
+               else {
+            //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
+            set<string> labels; labels.insert(label);
+            set<string> processedLabels;
+            set<string> userLabels = labels;
+            
+            //as long as you are not at the end of the file or done wih the lines you want
+            while((templookup[0] != NULL) && (userLabels.size() != 0)) {
+                if (m->control_pressed) {  for (int i = 0; i < templookup.size(); i++) {  delete templookup[i];  } return lookup;  }
+                
+                if(labels.count(templookup[0]->getLabel()) == 1){
+                    processedLabels.insert(templookup[0]->getLabel());
+                    userLabels.erase(templookup[0]->getLabel());
+                    break;
+                }
+                
+                if ((m->anyLabelsToProcess(templookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+                    string saveLabel = templookup[0]->getLabel();
+                    
+                    for (int i = 0; i < templookup.size(); i++) {  delete templookup[i];  }
+                    templookup = input.getSharedRAbundVectors(lastLabel);
+                    
+                    processedLabels.insert(templookup[0]->getLabel());
+                    userLabels.erase(templookup[0]->getLabel());
+                    
+                    //restore real lastlabel to save below
+                    templookup[0]->setLabel(saveLabel);
+                    break;
+                }
+                
+                lastLabel = templookup[0]->getLabel();
+                
+                //get next line to process
+                //prevent memory leak
+                for (int i = 0; i < templookup.size(); i++) {  delete templookup[i];  }
+                templookup = input.getSharedRAbundVectors();
+            }
+            
+            
+            if (m->control_pressed) { for (int i = 0; i < templookup.size(); i++) {  delete templookup[i];  } return lookup;  }
+            
+            //output error messages about any remaining user labels
+            set<string>::iterator it;
+            bool needToRun = false;
+            for (it = userLabels.begin(); it != userLabels.end(); it++) {
+                m->mothurOut("Your file does not include the label " + *it);
+                if (processedLabels.count(lastLabel) != 1) {
+                    m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
+                    needToRun = true;
+                }else {
+                    m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
+                }
+            }
+            
+            //run last label if you need to
+            if (needToRun == true)  {
+                for (int i = 0; i < templookup.size(); i++) {  if (templookup[i] != NULL) {    delete templookup[i];   } }
+                templookup = input.getSharedRAbundVectors(lastLabel);
+            }
+               }
+        
+        for (int i = 0; i < templookup.size(); i++) {
+            SharedRAbundFloatVector* temp = new SharedRAbundFloatVector();
+            temp->setLabel(templookup[i]->getLabel());
+            temp->setGroup(templookup[i]->getGroup());
+            lookup.push_back(temp);
+        }
+        
+        //convert to relabund
+        for (int i = 0; i < templookup.size(); i++) {
+                       for (int j = 0; j < templookup[i]->getNumBins(); j++) {
+                
+                               if (m->control_pressed) { for (int k = 0; k < templookup.size(); k++) {  delete templookup[k];  } return lookup; }
+                
+                               int abund = templookup[i]->getAbundance(j);
+                               float relabund = 0.0;
+                               
+                               if (scale == "totalgroup") {
+                                       relabund = abund / (float) templookup[i]->getNumSeqs();
+                               }else if (scale == "totalotu") {
+                                       //calc the total in this otu
+                                       int totalOtu = 0;
+                                       for (int l = 0; l < templookup.size(); l++) {  totalOtu += templookup[l]->getAbundance(j); }
+                                       relabund = abund / (float) totalOtu;
+                               }else if (scale == "averagegroup") {
+                                       relabund = abund / (float) (templookup[i]->getNumSeqs() / (float) templookup[i]->getNumBins());
+                               }else if (scale == "averageotu") {
+                                       //calc the total in this otu
+                                       int totalOtu = 0;
+                                       for (int l = 0; l < templookup.size(); l++) {  totalOtu += templookup[l]->getAbundance(j); }
+                                       float averageOtu = totalOtu / (float) templookup.size();
+                                       
+                                       relabund = abund / (float) averageOtu;
+                               }else{ m->mothurOut(scale + " is not a valid scaling option."); m->mothurOutEndLine(); m->control_pressed = true;  }
+                               
+                               lookup[i]->push_back(relabund, lookup[i]->getGroup());
+                       }
+        }
+
+        for (int k = 0; k < templookup.size(); k++) {  delete templookup[k];  }
+        
+               return lookup;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "MakeLefseCommand", "getSharedRelabund");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+vector<SharedRAbundFloatVector*> MakeLefseCommand::getRelabund(){
+       try {
+               InputData input(relabundfile, "relabund");
+               vector<SharedRAbundFloatVector*> lookupFloat = input.getSharedRAbundFloatVectors();
+               string lastLabel = lookupFloat[0]->getLabel();
+               
+               if (label == "") { label = lastLabel; return lookupFloat; }
+               
+               //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
+               set<string> labels; labels.insert(label);
+               set<string> processedLabels;
+               set<string> userLabels = labels;
+               
+               //as long as you are not at the end of the file or done wih the lines you want
+               while((lookupFloat[0] != NULL) && (userLabels.size() != 0)) {
+                       
+                       if (m->control_pressed) {  return lookupFloat;  }
+                       
+                       if(labels.count(lookupFloat[0]->getLabel()) == 1){
+                               processedLabels.insert(lookupFloat[0]->getLabel());
+                               userLabels.erase(lookupFloat[0]->getLabel());
+                               break;
+                       }
+                       
+                       if ((m->anyLabelsToProcess(lookupFloat[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+                               string saveLabel = lookupFloat[0]->getLabel();
+                               
+                               for (int i = 0; i < lookupFloat.size(); i++) {  delete lookupFloat[i];  }
+                               lookupFloat = input.getSharedRAbundFloatVectors(lastLabel);
+                               
+                               processedLabels.insert(lookupFloat[0]->getLabel());
+                               userLabels.erase(lookupFloat[0]->getLabel());
+                               
+                               //restore real lastlabel to save below
+                               lookupFloat[0]->setLabel(saveLabel);
+                               break;
+                       }
+                       
+                       lastLabel = lookupFloat[0]->getLabel();
+                       
+                       //get next line to process
+                       //prevent memory leak
+                       for (int i = 0; i < lookupFloat.size(); i++) {  delete lookupFloat[i];  }
+                       lookupFloat = input.getSharedRAbundFloatVectors();
+               }
+               
+               
+               if (m->control_pressed) { return lookupFloat;  }
+               
+               //output error messages about any remaining user labels
+               set<string>::iterator it;
+               bool needToRun = false;
+               for (it = userLabels.begin(); it != userLabels.end(); it++) {
+                       m->mothurOut("Your file does not include the label " + *it);
+                       if (processedLabels.count(lastLabel) != 1) {
+                               m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
+                               needToRun = true;
+                       }else {
+                               m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
+                       }
+               }
+               
+               //run last label if you need to
+               if (needToRun == true)  {
+                       for (int i = 0; i < lookupFloat.size(); i++) {  if (lookupFloat[i] != NULL) {   delete lookupFloat[i];  } }
+                       lookupFloat = input.getSharedRAbundFloatVectors(lastLabel);
+               }
+               
+               return lookupFloat;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "MakeLefseCommand", "getRelabund");
+        exit(1);
+       }
+}
+
+//**********************************************************************************************************************
+
diff --git a/makelefsecommand.h b/makelefsecommand.h

new file mode 100644 (file)

index 0000000..8270635
--- /dev/null
+++ b/makelefsecommand.h
@@ -0,0 +1,55 @@
+//
+//  makelefse.h
+//  Mothur
+//
+//  Created by SarahsWork on 6/3/13.
+//  Copyright (c) 2013 Schloss Lab. All rights reserved.
+//
+
+#ifndef __Mothur__makelefse__
+#define __Mothur__makelefse__
+
+#include "mothurout.h"
+#include "command.hpp"
+#include "inputdata.h"
+#include "sharedutilities.h"
+#include "phylosummary.h"
+
+/**************************************************************************************************/
+
+class MakeLefseCommand : public Command {
+public:
+    MakeLefseCommand(string);
+    MakeLefseCommand();
+    ~MakeLefseCommand(){}
+    
+    vector<string> setParameters();
+    string getCommandName()                    { return "make.lefse";                  }
+    string getCommandCategory()                { return "General";             }
+    
+    string getOutputPattern(string);
+       string getHelpString();
+    string getCitation() { return "http://huttenhower.sph.harvard.edu/galaxy/root?tool_id=lefse_upload http://www.mothur.org/wiki/Make.lefse"; }
+    string getDescription()            { return "creates LEfSe input file"; }
+    
+    int execute();
+    void help() { m->mothurOut(getHelpString()); }
+    
+private:
+    bool abort, allLines, otulabel, hasGroupInfo;
+    string outputDir;
+    vector<string> outputNames, Groups;
+    string sharedfile, designfile, constaxonomyfile, relabundfile, scale, label, inputFile;
+    
+    int runRelabund(map<string, consTax2>&, vector<SharedRAbundFloatVector*>&);
+    
+    vector<SharedRAbundFloatVector*> getRelabund();
+    vector<SharedRAbundFloatVector*> getSharedRelabund();
+};
+
+/**************************************************************************************************/
+
+
+
+
+#endif /* defined(__Mothur__makelefse__) */
diff --git a/makelookupcommand.cpp b/makelookupcommand.cpp

index dd5bacfbc12aed0444fce46e3e836e4c2aea22d8..5b280f54fa297dd8401c964df08d7673cce0c6d9 100644 (file)
--- a/makelookupcommand.cpp
+++ b/makelookupcommand.cpp
@@ -197,7 +197,7 @@ int MakeLookupCommand::execute(){
          
          double gapOpening = 10;
          int maxHomoP = 101;
-        vector<vector<double> > penaltyMatrix(maxHomoP);
+        vector<vector<double> > penaltyMatrix; penaltyMatrix.resize(maxHomoP);
          for(int i=0;i<maxHomoP;i++){
              penaltyMatrix[i].resize(maxHomoP, 5);
              penaltyMatrix[i][i] = 0;
@@ -210,26 +210,31 @@ int MakeLookupCommand::execute(){
          
          while(!refFASTA.eof()){
              if (m->control_pressed) { refFASTA.close(); return 0; }
+            
              Sequence seq(refFASTA);  m->gobble(refFASTA);
              
+            if (m->debug) { m->mothurOut("[DEBUG]: seq = " + seq.getName() + ".\n"); }
+            
              string fullSequence = keySequence + barcodeSequence + seq.getAligned(); //  *   concatenate the keySequence, barcodeSequence, and
              //      referenceSequences
              refFlowgrams[seq.getName()] = convertSeqToFlow(fullSequence, flowOrder); //  *   translate concatenated sequences into flowgram
          }
          refFASTA.close();
          
-        vector<vector<double> > lookupTable(1000);
+        vector<vector<double> > lookupTable; lookupTable.resize(1000);
          for(int i=0;i<1000;i++){
              lookupTable[i].resize(11, 0);
          }
          
-        
+        if (m->debug) { m->mothurOut("[DEBUG]: here .\n"); }
          //Loop through each sequence in the flow file and the error summary file.
          ifstream flowFile;
          m->openInputFile(flowFileName, flowFile);
          int numFlows;
          flowFile >> numFlows;
          
+        if (m->debug) { m->mothurOut("[DEBUG]: numflows = " + toString(numFlows) +  ".\n"); }
+        
          ifstream errorFile;
          m->openInputFile(errorFileName, errorFile);
          m->getline(errorFile); //grab headers
@@ -238,7 +243,7 @@ int MakeLookupCommand::execute(){
          string chimera;
          float intensity;
          
-        vector<double> std(11, 0);
+        vector<double> std;  std.resize(11, 0);
          
          while(errorFile && flowFile){
              
@@ -260,26 +265,37 @@ int MakeLookupCommand::execute(){
                  flowFile >> flowQuery >> dummy;
                  if(flowQuery != errorQuery){    cout << flowQuery << " != " << errorQuery << endl;  }
                  
-                vector<double> refFlow = refFlowgrams[referenceName];       //  * compare sequence to its closest reference
-                
-                vector<double> flowgram(numFlows);
-                
-                for(int i=0;i<numFlows;i++){
-                    flowFile >> intensity;
-                    flowgram[i] = intensity;// (int)round(100 * intensity);
-                }
-                m->gobble(flowFile);
-                
-                alignFlowGrams(flowgram, refFlow, gapOpening, penaltyMatrix, flowOrder);
-                
-                if (m->control_pressed) { errorFile.close(); flowFile.close(); return 0; }
-                
-                for(int i=0;i<flowgram.size();i++){
-                    int count = (int)round(100*flowgram[i]);
-                    if(count > 1000){count = 999;}
-                    if(abs(flowgram[i]-refFlow[i])<=0.50){
-                        lookupTable[count][int(refFlow[i])]++;               //  * build table
-                        std[int(refFlow[i])] += (100*refFlow[i]-count)*(100*refFlow[i]-count);
+                map<string, vector<double> >::iterator it = refFlowgrams.find(referenceName);       //  * compare sequence to its closest reference
+                if (it == refFlowgrams.end()) {
+                    m->mothurOut("[WARNING]: missing reference flow " + referenceName + ", ignoring flow " + flowQuery + ".\n");
+                    m->getline(flowFile); m->gobble(flowFile);
+                }else {
+                    vector<double> refFlow = it->second;
+                    vector<double> flowgram; flowgram.resize(numFlows);
+                    
+                    if (m->debug) { m->mothurOut("[DEBUG]: flowQuery = " + flowQuery +  ".\t" + "refName " + referenceName+  ".\n"); }
+                    
+                    for(int i=0;i<numFlows;i++){
+                        flowFile >> intensity;
+                        flowgram[i] = intensity;// (int)round(100 * intensity);
+                    }
+                    m->gobble(flowFile);
+                    
+                    if (m->debug) { m->mothurOut("[DEBUG]: before align.\n"); }
+                    
+                    alignFlowGrams(flowgram, refFlow, gapOpening, penaltyMatrix, flowOrder);
+                    
+                    if (m->debug) { m->mothurOut("[DEBUG]: after align.\n"); }
+                    
+                    if (m->control_pressed) { errorFile.close(); flowFile.close(); return 0; }
+                    
+                    for(int i=0;i<flowgram.size();i++){
+                        int count = (int)round(100*flowgram[i]);
+                        if(count > 1000){count = 999;}
+                        if(abs(flowgram[i]-refFlow[i])<=0.50){
+                            lookupTable[count][int(refFlow[i])]++;               //  * build table
+                            std[int(refFlow[i])] += (100*refFlow[i]-count)*(100*refFlow[i]-count);
+                        }
                      }
                  }
                  
@@ -289,8 +305,9 @@ int MakeLookupCommand::execute(){
          }
          errorFile.close(); flowFile.close();
          
+        
          //get probabilities
-        vector<int> counts(11, 0);
+        vector<int> counts; counts.resize(11, 0);
          int totalCount = 0;
          for(int i=0;i<1000;i++){
              for(int j=0;j<11;j++){
@@ -337,7 +354,7 @@ int MakeLookupCommand::execute(){
          
          
          //calculate the probability of each homopolymer length
-        vector<double> negLogHomoProb(11, 0.00);  //bring back
+        vector<double> negLogHomoProb; negLogHomoProb.resize(11, 0.00);  //bring back
          for(int i=0;i<N;i++){
              negLogHomoProb[i] = -log(counts[i] / (double)totalCount);
          }
@@ -438,8 +455,8 @@ int MakeLookupCommand::alignFlowGrams(vector<double>& flowgram, vector<double>&
          
              //cout << numQueryFlows << '\t' << numRefFlows << endl;
          
-        vector<vector<double> > scoreMatrix(numQueryFlows+1);
-        vector<vector<char> > directMatrix(numQueryFlows+1);
+        vector<vector<double> > scoreMatrix; scoreMatrix.resize(numQueryFlows+1);
+        vector<vector<char> > directMatrix; directMatrix.resize(numQueryFlows+1);
          
          for(int i=0;i<=numQueryFlows;i++){
              if (m->control_pressed) { return 0; }
@@ -458,6 +475,7 @@ int MakeLookupCommand::alignFlowGrams(vector<double>& flowgram, vector<double>&
              directMatrix[0][i] = 'l';
          }
          
+        
          for(int i=1;i<=numQueryFlows;i++){
              for(int j=1;j<=numRefFlows;j++){
                  if (m->control_pressed) { return 0; }
@@ -498,7 +516,7 @@ int MakeLookupCommand::alignFlowGrams(vector<double>& flowgram, vector<double>&
          
          int minColumnIndex = numRefFlows;
          double minColumnScore = scoreMatrix[numQueryFlows][numRefFlows];
-        for(int i=0;i<numQueryFlows;i++){
+        for(int i=0;i<numRefFlows;i++){
              if (m->control_pressed) { return 0; }
              if(scoreMatrix[numQueryFlows][i] < minColumnScore){
                  minColumnScore = scoreMatrix[numQueryFlows][i];
@@ -512,7 +530,7 @@ int MakeLookupCommand::alignFlowGrams(vector<double>& flowgram, vector<double>&
          
          vector<double> newFlowgram;
          vector<double> newRefFlowgram;
-        
+       
          while(i > 0 && j > 0){
              if (m->control_pressed) { return 0; }
              if(directMatrix[i][j] == 'd'){
diff --git a/makelookupcommand.h b/makelookupcommand.h

index 743e756544d29624d699acd7f50764403976c834..ca274e057532e5a3bbc0839c92cdcee8984e61e1 100644 (file)
--- a/makelookupcommand.h
+++ b/makelookupcommand.h
@@ -26,8 +26,8 @@ public:
      
      string getOutputPattern(string);
         string getHelpString();
-    string getCitation() { return "http://www.mothur.org/wiki/Make.lookup"; }
-    string getDescription()            { return "create custom lookup files for use with shhh.flows"; }
+    string getCitation() { return "Quince, C., A. Lanzén, T. P. Curtis, R. J. Davenport, N. Hall, I. M. Head, L. F. Read, and W. T. Sloan. 2009. Accurate determination of microbial diversity from 454 pyrosequencing data. Nat Methods 6:639-41. http://www.mothur.org/wiki/Make.lookup"; }
+    string getDescription()            { return "Creates a lookup file for use with shhh.flows using user-supplied mock community data and flow grams"; }
      
      int execute();
      void help() { m->mothurOut(getHelpString()); }
diff --git a/mgclustercommand.cpp b/mgclustercommand.cpp

index e76dd4860b03826db116323af7e42528a591c526..e3287f7f9bb19a4079c28e92fe57cf99e06216d5 100644 (file)
--- a/mgclustercommand.cpp
+++ b/mgclustercommand.cpp
@@ -228,7 +228,13 @@ int MGClusterCommand::execute(){
                 if (namefile != "") {
                         nameMap = new NameAssignment(namefile);
                         nameMap->readMap();
-               }else{ nameMap= new NameAssignment(); }
+               }else if (countfile != "") {
+            ct = new CountTable();
+            ct->readTable(countfile, false);
+            nameMap= new NameAssignment();
+            vector<string> tempNames = ct->getNamesOfSeqs();
+            for (int i = 0; i < tempNames.size(); i++) {  nameMap->push_back(tempNames[i]);  }
+        }else{ nameMap= new NameAssignment(); }
                 
                 string fileroot = outputDir + m->getRootName(m->getSimpleName(blastfile));
                 string tag = "";
@@ -245,9 +251,6 @@ int MGClusterCommand::execute(){
          RAbundVector* rabund = NULL;
          
          if(countfile != "") {
-            //map<string, int> nameMapCounts = m->readNames(namefile);
-            ct = new CountTable();
-            ct->readTable(countfile, false);
              rabund = new RAbundVector();
              createRabund(ct, list, rabund);
          }else {
diff --git a/mothur.h b/mothur.h

index 98b34c31c58c88b614412be4e9ba6874ec1cbe36..32f4778a7c1ee19db25eeb4e1eafaf74fabcd617 100644 (file)
--- a/mothur.h
+++ b/mothur.h
@@ -127,6 +127,21 @@ struct PDistCell{
         PDistCell() :  index(0), dist(0) {};
         PDistCell(ull c, float d) :  index(c), dist(d) {}
  };
+/***********************************************************************/
+struct consTax{
+       string name;
+    string taxonomy;
+    int abundance;
+       consTax() :  name(""), taxonomy("unknown"), abundance(0) {};
+       consTax(string n, string t, int a) :  name(n), taxonomy(t), abundance(a) {}
+};
+/***********************************************************************/
+struct consTax2{
+    string taxonomy;
+    int abundance;
+       consTax2() :  taxonomy("unknown"), abundance(0) {};
+       consTax2(string t, int a) :  taxonomy(t), abundance(a) {}
+};
  /************************************************************/
  struct clusterNode {
         int numSeq;
@@ -158,6 +173,15 @@ struct seqPriorityNode {
         seqPriorityNode(int n, string s, string nm) : numIdentical(n), seq(s), name(nm) {}
         ~seqPriorityNode() {}
  };
+/************************************************************/
+struct compGroup {
+       string group1;
+       string group2;
+       compGroup() {}
+       compGroup(string s, string nm) : group1(s), group2(nm) {}
+    string getCombo() { return group1+"-"+group2; }
+       ~compGroup() {}
+};
  /***************************************************************/
  struct spearmanRank {
         string name;
@@ -194,7 +218,15 @@ inline bool compareDistLinePairs(distlinePair left, distlinePair right){
  //sorts lowest to highest
  inline bool compareSequenceDistance(seqDist left, seqDist right){
         return (left.dist < right.dist);        
-} 
+}
+//********************************************************************************************************************
+//returns sign of double
+inline double sign(double temp){
+       //find sign
+    if (temp > 0)       { return 1.0;   }
+    else if (temp < 0)  { return -1.0;  }
+    return 0;
+}
  /***********************************************************************/
  
  // snagged from http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.2
diff --git a/mothurout.cpp b/mothurout.cpp

index 8a3b6313072a158f611e1a69a84d0a9713add18c..1f1c96b6f203e4f1fa8b1dca2e018c735caa6b9f 100644 (file)
--- a/mothurout.cpp
+++ b/mothurout.cpp
@@ -796,6 +796,39 @@ bool MothurOut::dirCheck(string& dirName){
         }       
      
  }
+//**********************************************************************************************************************
+
+map<string, vector<string> > MothurOut::parseClasses(string classes){
+       try {
+        map<string, vector<string> > parts;
+        
+        //treatment<Early|Late>-age<young|old>
+        vector<string> pieces; splitAtDash(classes, pieces); // -> treatment<Early|Late>, age<young|old>
+        
+        for (int i = 0; i < pieces.size(); i++) {
+            string category = ""; string value = "";
+            bool foundOpen = false;
+            for (int j = 0; j < pieces[i].length(); j++) {
+                if (control_pressed) { return parts; }
+                
+                if (pieces[i][j] == '<')        { foundOpen = true;         }
+                else if (pieces[i][j] == '>')   { j += pieces[i].length();  }
+                else {
+                    if (!foundOpen) { category += pieces[i][j]; }
+                    else { value += pieces[i][j]; }
+                }
+            }
+            vector<string> values; splitAtChar(value, values, '|');
+            parts[category] = values;
+        }
+        
+        return parts;
+    }
+       catch(exception& e) {
+               errorOut(e, "MothurOut", "parseClasses");
+               exit(1);
+       }
+}
  /***********************************************************************/
  
  string MothurOut::hasPath(string longName){
@@ -1360,6 +1393,67 @@ vector<unsigned long long> MothurOut::setFilePosFasta(string filename, int& num)
                 exit(1);
         }
  }
+//**********************************************************************************************************************
+vector<consTax> MothurOut::readConsTax(string inputfile){
+       try {
+               
+        vector<consTax> taxes;
+        
+        ifstream in;
+        openInputFile(inputfile, in);
+        
+        //read headers
+        getline(in);
+        
+        while (!in.eof()) {
+            
+            if (control_pressed) { break; }
+            
+            string otu = ""; string tax = "unknown";
+            int size = 0;
+            
+            in >> otu >> size >> tax; gobble(in);
+            consTax temp(otu, tax, size);
+            taxes.push_back(temp);
+        }
+        in.close();
+        
+        return taxes;
+    }
+       catch(exception& e) {
+               errorOut(e, "MothurOut", "readConsTax");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+int MothurOut::readConsTax(string inputfile, map<string, consTax2>& taxes){
+       try {
+        ifstream in;
+        openInputFile(inputfile, in);
+        
+        //read headers
+        getline(in);
+        
+        while (!in.eof()) {
+            
+            if (control_pressed) { break; }
+            
+            string otu = ""; string tax = "unknown";
+            int size = 0;
+            
+            in >> otu >> size >> tax; gobble(in);
+            consTax2 temp(tax, size);
+            taxes[otu] = temp;
+        }
+        in.close();
+        
+        return 0;
+    }
+       catch(exception& e) {
+               errorOut(e, "MothurOut", "readConsTax");
+               exit(1);
+       }
+}
  /**************************************************************************************************/
  vector<unsigned long long> MothurOut::setFilePosEachLine(string filename, int& num) {
         try {
@@ -2348,7 +2442,9 @@ set<string> MothurOut::readAccnos(string accnosfile){
              in.read(buffer, 4096);
              vector<string> pieces = splitWhiteSpace(rest, buffer, in.gcount());
              
-            for (int i = 0; i < pieces.size(); i++) {  checkName(pieces[i]); names.insert(pieces[i]);  }
+            for (int i = 0; i < pieces.size(); i++) {  checkName(pieces[i]);
+                names.insert(pieces[i]);
+            }
          }
                 in.close();     
                 
diff --git a/mothurout.h b/mothurout.h

index f99eac1917cbe2ce5a1ce278c66d27d1df71737e..77b1b6d4ec1efb758973d332aa8e4df00619f58b 100644 (file)
--- a/mothurout.h
+++ b/mothurout.h
@@ -107,6 +107,8 @@ class MothurOut {
          map<string, int> readNames(string);
          map<string, int> readNames(string, unsigned long int&);
          int readTax(string, map<string, string>&);
+        vector<consTax> readConsTax(string);
+        int readConsTax(string, map<string, consTax2>&);
          int readNames(string, map<string, string>&, map<string, int>&);
                 int readNames(string, map<string, string>&);
          int readNames(string, map<string, string>&, bool);
@@ -151,6 +153,7 @@ class MothurOut {
          string makeList(vector<string>&);
          bool isSubset(vector<string>, vector<string>); //bigSet, subset
          int checkName(string&);
+        map<string, vector<string> > parseClasses(string);
                 
                 //math operation
                 int factorial(int num);
diff --git a/otuassociationcommand.cpp b/otuassociationcommand.cpp

index 23cac294cad4f1b25caa0e4f4d55de40f1fb2876..9ccdeeaba28b54a899fd90dada402030e0d87c3f 100644 (file)
--- a/otuassociationcommand.cpp
+++ b/otuassociationcommand.cpp
@@ -572,6 +572,7 @@ int OTUAssociationCommand::readMetadata(){
                 //save names of columns you are reading
                 while (!iss.eof()) {
                         iss >> columnLabel; m->gobble(iss);
+            if (m->debug) { m->mothurOut("[DEBUG]: metadata column Label = " + columnLabel + "\n"); }
                         metadataLabels.push_back(columnLabel);
                 }
                 int count = metadataLabels.size();
@@ -583,6 +584,7 @@ int OTUAssociationCommand::readMetadata(){
                         
                         string group = "";
                         in >> group; m->gobble(in);
+            if (m->debug) { m->mothurOut("[DEBUG]: metadata group = " + group + "\n"); }
              
                         SharedRAbundFloatVector* tempLookup = new SharedRAbundFloatVector();
                         tempLookup->setGroup(group);
@@ -590,7 +592,8 @@ int OTUAssociationCommand::readMetadata(){
                         
                         for (int i = 0; i < count; i++) {
                                 float temp = 0.0;
-                               in >> temp; 
+                               in >> temp;
+                if (m->debug) { m->mothurOut("[DEBUG]: metadata value = " + toString(temp) + "\n"); }
                                 tempLookup->push_back(temp, group);
                         }
                         
diff --git a/phylosummary.cpp b/phylosummary.cpp

index 9717087d72a55a6bc28ce2d77f02545a7eb5b612..a9d501e2190a9f68ed79edc8586b4dba2cb03342 100644 (file)
--- a/phylosummary.cpp
+++ b/phylosummary.cpp
@@ -454,7 +454,61 @@ void PhyloSummary::print(ofstream& out){
                 exit(1);
         }
  }
+/**************************************************************************************************/
  
+void PhyloSummary::print(ofstream& out, bool relabund){
+       try {
+               
+               if (ignore) { assignRank(0); }
+       
+               int totalChildrenInTree = 0;
+               map<string, int>::iterator itGroup;
+               
+               map<string,int>::iterator it;
+               for(it=tree[0].children.begin();it!=tree[0].children.end();it++){
+                       if (tree[it->second].total != 0)  {
+                               totalChildrenInTree++;
+                               tree[0].total += tree[it->second].total;
+                               
+                               if (groupmap != NULL) {
+                    vector<string> mGroups = groupmap->getNamesOfGroups();
+                                       for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; }
+                               }else if ( ct != NULL) {
+                    vector<string> mGroups = ct->getNamesOfGroups();
+                    if (ct->hasGroupInfo()) { for (int i = 0; i < mGroups.size(); i++) { tree[0].groupCount[mGroups[i]] += tree[it->second].groupCount[mGroups[i]]; } }
+                }
+                       }
+               }
+               
+               //print root
+               out << tree[0].name << "\t" << "1.0000" << "\t"; //root relative abundance is 1, everyone classifies to root
+               
+               /*
+               if (groupmap != NULL) {
+                       for (int i = 0; i < mGroups.size(); i++) {  out << tree[0].groupCount[mGroups[i]] << '\t'; }
+        }else if ( ct != NULL) {
+            if (ct->hasGroupInfo()) { for (int i = 0; i < mGroups.size(); i++) {  out << tree[0].groupCount[mGroups[i]] << '\t'; } }
+        }*/
+        
+        if (groupmap != NULL) {
+            vector<string> mGroups = groupmap->getNamesOfGroups();
+                       for (int i = 0; i < mGroups.size(); i++) {  out << "1.0000" << '\t'; }
+        }else if ( ct != NULL) {
+            vector<string> mGroups = ct->getNamesOfGroups();
+            if (ct->hasGroupInfo()) { for (int i = 0; i < mGroups.size(); i++) {  out << "1.0000" << '\t'; } }
+        }
+        
+               out << endl;
+               
+               //print rest
+               print(0, out, relabund);
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "PhyloSummary", "print");
+               exit(1);
+       }
+}
  /**************************************************************************************************/
  
  void PhyloSummary::print(int i, ofstream& out){
@@ -463,14 +517,14 @@ void PhyloSummary::print(int i, ofstream& out){
                 for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
                         
                         if (tree[it->second].total != 0)  {
-                       
+                
                                 int totalChildrenInTree = 0;
-               
+                
                                 map<string,int>::iterator it2;
-                               for(it2=tree[it->second].children.begin();it2!=tree[it->second].children.end();it2++){   
+                               for(it2=tree[it->second].children.begin();it2!=tree[it->second].children.end();it2++){
                                         if (tree[it2->second].total != 0)  {   totalChildrenInTree++; }
                                 }
-                       
+                
                                 out << tree[it->second].level << "\t" << tree[it->second].rank << "\t" << tree[it->second].name << "\t" << totalChildrenInTree << "\t" << tree[it->second].total << "\t";
                                 
                                 map<string, int>::iterator itGroup;
@@ -479,11 +533,11 @@ void PhyloSummary::print(int i, ofstream& out){
                                         //      out << itGroup->second << '\t';
                                         //}
                                         vector<string> mGroups = groupmap->getNamesOfGroups();
-                                       for (int i = 0; i < mGroups.size(); i++) {  out << tree[it->second].groupCount[mGroups[i]] << '\t'; } 
+                                       for (int i = 0; i < mGroups.size(); i++) {  out << tree[it->second].groupCount[mGroups[i]] << '\t'; }
                                 }else if (ct != NULL) {
                      if (ct->hasGroupInfo()) {
                          vector<string> mGroups = ct->getNamesOfGroups();
-                        for (int i = 0; i < mGroups.size(); i++) {  out << tree[it->second].groupCount[mGroups[i]] << '\t'; } 
+                        for (int i = 0; i < mGroups.size(); i++) {  out << tree[it->second].groupCount[mGroups[i]] << '\t'; }
                      }
                  }
                                 out << endl;
@@ -498,6 +552,64 @@ void PhyloSummary::print(int i, ofstream& out){
                 exit(1);
         }
  }
+
+/**************************************************************************************************/
+
+void PhyloSummary::print(int i, ofstream& out, bool relabund){
+       try {
+               map<string,int>::iterator it;
+               for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
+                       
+                       if (tree[it->second].total != 0)  {
+                       
+                               int totalChildrenInTree = 0;
+               
+                               map<string,int>::iterator it2;
+                               for(it2=tree[it->second].children.begin();it2!=tree[it->second].children.end();it2++){   
+                                       if (tree[it2->second].total != 0)  {   totalChildrenInTree++; }
+                               }
+                
+                string nodeName = "";
+                int thisNode = it->second;
+                while (tree[thisNode].rank != "0") { //while you are not at top
+                    if (m->control_pressed) { break; }
+                    nodeName = tree[thisNode].name + "|" + nodeName;
+                    thisNode = tree[thisNode].parent;
+                }
+                if (nodeName != "") { nodeName = nodeName.substr(0, nodeName.length()-1); }
+                
+                               out << nodeName << "\t" << (tree[it->second].total / (float)tree[i].total) << "\t";
+                               
+                               map<string, int>::iterator itGroup;
+                               if (groupmap != NULL) {
+                                       vector<string> mGroups = groupmap->getNamesOfGroups();
+                                       for (int j = 0; j < mGroups.size(); j++) {
+                        if (tree[i].groupCount[mGroups[j]] == 0) {
+                            out << 0 << '\t';
+                        }else { out << (tree[it->second].groupCount[mGroups[j]] / (float)tree[i].groupCount[mGroups[j]]) << '\t'; }
+                    }
+                               }else if (ct != NULL) {
+                    if (ct->hasGroupInfo()) {
+                        vector<string> mGroups = ct->getNamesOfGroups();
+                        for (int j = 0; j < mGroups.size(); j++) {
+                            if (tree[i].groupCount[mGroups[j]] == 0) {
+                                out << 0 << '\t';
+                            }else { out << (tree[it->second].groupCount[mGroups[j]] / (float)tree[i].groupCount[mGroups[j]]) << '\t'; }
+                        }
+                    }
+                }
+                               out << endl;
+                               
+                       }
+                       
+                       print(it->second, out, relabund);
+               }
+       }
+       catch(exception& e) {
+               m->errorOut(e, "PhyloSummary", "print");
+               exit(1);
+       }
+}
  /**************************************************************************************************/
  void PhyloSummary::readTreeStruct(ifstream& in){
         try {
diff --git a/phylosummary.h b/phylosummary.h

index 65a467483ddab1ae58757629644162dc83bab254..bd31173a48647e1bf2ed7b2f7ec95d25e0bfe388 100644 (file)
--- a/phylosummary.h
+++ b/phylosummary.h
@@ -43,12 +43,14 @@ public:
         int addSeqToTree(string, string);
         int addSeqToTree(string, map<string, bool>);
         void print(ofstream&);
+    void print(ofstream&, bool);
         int getMaxLevel() { return maxLevel; }
         
  private:
         string getNextTaxon(string&);
         vector<rawTaxNode> tree;
         void print(int, ofstream&);
+    void print(int, ofstream&, bool);
         void assignRank(int);
         void readTreeStruct(ifstream&);
         GroupMap* groupmap;
diff --git a/randomforest.cpp b/randomforest.cpp

index bd96cd2f7177633e3d21181c95e7ff2c07682eb2..2ae0eb595c1c8d5f989c44a5e561a0ff3dac2d2b 100644 (file)
--- a/randomforest.cpp
+++ b/randomforest.cpp
@@ -10,8 +10,16 @@
  
  /***********************************************************************/
  
-RandomForest::RandomForest(const vector <vector<int> > dataSet,const int numDecisionTrees,
-             const string treeSplitCriterion = "informationGain") : Forest(dataSet, numDecisionTrees, treeSplitCriterion) {
+RandomForest::RandomForest(const vector <vector<int> > dataSet,
+                           const int numDecisionTrees,
+                           const string treeSplitCriterion = "gainratio",
+                           const bool doPruning = false,
+                           const float pruneAggressiveness = 0.9,
+                           const bool discardHighErrorTrees = true,
+                           const float highErrorTreeDiscardThreshold = 0.4,
+                           const string optimumFeatureSubsetSelectionCriteria = "log2",
+                           const float featureStandardDeviationThreshold = 0.0)
+            : Forest(dataSet, numDecisionTrees, treeSplitCriterion, doPruning, pruneAggressiveness, discardHighErrorTrees, highErrorTreeDiscardThreshold, optimumFeatureSubsetSelectionCriteria, featureStandardDeviationThreshold) {
      m = MothurOut::getInstance();
  }
  
@@ -48,45 +56,50 @@ int RandomForest::calcForrestErrorRate() {
  }
  
  /***********************************************************************/
-// DONE
  int RandomForest::calcForrestVariableImportance(string filename) {
      try {
      
-    // TODO: need to add try/catch operators to fix this
-    // follow the link: http://en.wikipedia.org/wiki/Dynamic_cast
+        // follow the link: http://en.wikipedia.org/wiki/Dynamic_cast
          //if you are going to dynamically cast, aren't you undoing the advantage of abstraction. Why abstract at all?
          //could cause maintenance issues later if other types of Abstract decison trees are created that cannot be cast as a decision tree.
-    for (int i = 0; i < decisionTrees.size(); i++) {
-        if (m->control_pressed) { return 0; }
-        
-        DecisionTree* decisionTree = dynamic_cast<DecisionTree*>(decisionTrees[i]);
+        for (int i = 0; i < decisionTrees.size(); i++) {
+            if (m->control_pressed) { return 0; }
+            
+            DecisionTree* decisionTree = dynamic_cast<DecisionTree*>(decisionTrees[i]);
+            
+            for (int j = 0; j < numFeatures; j++) {
+                globalVariableImportanceList[j] += (double)decisionTree->variableImportanceList[j];
+            }
+        }
          
-        for (int j = 0; j < numFeatures; j++) {
-            globalVariableImportanceList[j] += (double)decisionTree->variableImportanceList[j];
+        for (int i = 0;  i < numFeatures; i++) {
+            globalVariableImportanceList[i] /= (double)numDecisionTrees;
          }
-    }
-    
-    for (int i = 0;  i < numFeatures; i++) {
-        cout << "[" << i << ',' << globalVariableImportanceList[i] << "], ";
-        globalVariableImportanceList[i] /= (double)numDecisionTrees;
-    }
-    
-    vector< vector<double> > globalVariableRanks;
-    for (int i = 0; i < globalVariableImportanceList.size(); i++) {
-        if (globalVariableImportanceList[i] > 0) {
-            vector<double> globalVariableRank(2, 0);
-            globalVariableRank[0] = i; globalVariableRank[1] = globalVariableImportanceList[i];
-            globalVariableRanks.push_back(globalVariableRank);
+        
+        vector< pair<int, double> > globalVariableRanks;
+        for (int i = 0; i < globalVariableImportanceList.size(); i++) {
+            //cout << "[" << i << ',' << globalVariableImportanceList[i] << "], ";
+            if (globalVariableImportanceList[i] > 0) {
+                pair<int, double> globalVariableRank(0, 0.0);
+                globalVariableRank.first = i;
+                globalVariableRank.second = globalVariableImportanceList[i];
+                globalVariableRanks.push_back(globalVariableRank);
+            }
          }
-    }
-    
-    VariableRankDescendingSorterDouble variableRankDescendingSorter;
-    sort(globalVariableRanks.begin(), globalVariableRanks.end(), variableRankDescendingSorter);
+        
+//        for (int i = 0; i < globalVariableRanks.size(); i++) {
+//            cout << m->currentBinLabels[(int)globalVariableRanks[i][0]] << '\t' << globalVariableImportanceList[globalVariableRanks[i][0]] << endl;
+//        }
+
+        
+        VariableRankDescendingSorterDouble variableRankDescendingSorter;
+        sort(globalVariableRanks.begin(), globalVariableRanks.end(), variableRankDescendingSorter);
+        
          ofstream out;
          m->openOutputFile(filename, out);
          out <<"OTU\tRank\n";
          for (int i = 0; i < globalVariableRanks.size(); i++) {
-            out << m->currentBinLabels[(int)globalVariableRanks[i][0]] << '\t' << globalVariableImportanceList[globalVariableRanks[i][0]] << endl;
+            out << m->currentBinLabels[(int)globalVariableRanks[i].first] << '\t' << globalVariableImportanceList[globalVariableRanks[i].first] << endl;
          }
          out.close();
          return 0;
@@ -97,27 +110,89 @@ int RandomForest::calcForrestVariableImportance(string filename) {
         }  
  }
  /***********************************************************************/
-// DONE
  int RandomForest::populateDecisionTrees() {
      try {
          
+        vector<double> errorRateImprovements;
+        
          for (int i = 0; i < numDecisionTrees; i++) {
+          
              if (m->control_pressed) { return 0; }
-            if (((i+1) % 10) == 0) {  m->mothurOut("Creating " + toString(i+1) + " (th) Decision tree\n");  }
+            if (((i+1) % 100) == 0) {  m->mothurOut("Creating " + toString(i+1) + " (th) Decision tree\n");  }
+          
              // TODO: need to first fix if we are going to use pointer based system or anything else
-            DecisionTree* decisionTree = new DecisionTree(dataSet, globalDiscardedFeatureIndices, OptimumFeatureSubsetSelector("log2"), treeSplitCriterion);
-            decisionTree->calcTreeVariableImportanceAndError();
-            if (m->control_pressed) { return 0; }
-            updateGlobalOutOfBagEstimates(decisionTree);
-            if (m->control_pressed) { return 0; }
-            decisionTree->purgeDataSetsFromTree();
-            if (m->control_pressed) { return 0; }
-            decisionTrees.push_back(decisionTree);
+            DecisionTree* decisionTree = new DecisionTree(dataSet, globalDiscardedFeatureIndices, OptimumFeatureSubsetSelector(optimumFeatureSubsetSelectionCriteria), treeSplitCriterion, featureStandardDeviationThreshold);
+          
+            if (m->debug && doPruning) {
+                m->mothurOut("Before pruning\n");
+                decisionTree->printTree(decisionTree->rootNode, "ROOT");
+            }
+            
+            int numCorrect;
+            double treeErrorRate;
+            
+            decisionTree->calcTreeErrorRate(numCorrect, treeErrorRate);
+            double prePrunedErrorRate = treeErrorRate;
+            
+            if (m->debug) {
+                m->mothurOut("treeErrorRate: " + toString(treeErrorRate) + " numCorrect: " + toString(numCorrect) + "\n");
+            }
+            
+            if (doPruning) {
+                decisionTree->pruneTree(pruneAggressiveness);
+                if (m->debug) {
+                    m->mothurOut("After pruning\n");
+                    decisionTree->printTree(decisionTree->rootNode, "ROOT");
+                }
+                decisionTree->calcTreeErrorRate(numCorrect, treeErrorRate);
+            }
+            double postPrunedErrorRate = treeErrorRate;
+            
+          
+            decisionTree->calcTreeVariableImportanceAndError(numCorrect, treeErrorRate);
+            double errorRateImprovement = (prePrunedErrorRate - postPrunedErrorRate) / prePrunedErrorRate;
+
+            if (m->debug) {
+                m->mothurOut("treeErrorRate: " + toString(treeErrorRate) + " numCorrect: " + toString(numCorrect) + "\n");
+                if (doPruning) {
+                    m->mothurOut("errorRateImprovement: " + toString(errorRateImprovement) + "\n");
+                }
+            }
+            
+            
+            if (discardHighErrorTrees) {
+                if (treeErrorRate < highErrorTreeDiscardThreshold) {
+                    updateGlobalOutOfBagEstimates(decisionTree);
+                    decisionTree->purgeDataSetsFromTree();
+                    decisionTrees.push_back(decisionTree);
+                    if (doPruning) {
+                        errorRateImprovements.push_back(errorRateImprovement);
+                    }
+                } else {
+                    delete decisionTree;
+                }
+            } else {
+                updateGlobalOutOfBagEstimates(decisionTree);
+                decisionTree->purgeDataSetsFromTree();
+                decisionTrees.push_back(decisionTree);
+                if (doPruning) {
+                    errorRateImprovements.push_back(errorRateImprovement);
+                }
+            }          
+        }
+        
+        double avgErrorRateImprovement = -1.0;
+        if (errorRateImprovements.size() > 0) {
+            avgErrorRateImprovement = accumulate(errorRateImprovements.begin(), errorRateImprovements.end(), 0.0);
+//            cout << "Total " << avgErrorRateImprovement << " size " << errorRateImprovements.size() << endl;
+            avgErrorRateImprovement /= errorRateImprovements.size();
          }
          
-        if (m->debug) {
-            // m->mothurOut("globalOutOfBagEstimates = " + toStringVectorMap(globalOutOfBagEstimates)+ "\n");
+        if (m->debug && doPruning) {
+            m->mothurOut("avgErrorRateImprovement:" + toString(avgErrorRateImprovement) + "\n");
          }
+        // m->mothurOut("globalOutOfBagEstimates = " + toStringVectorMap(globalOutOfBagEstimates)+ "\n");
+
          
          return 0;
      }
diff --git a/randomforest.hpp b/randomforest.hpp

index 30eb43842f8cb280e1e4e95919909885f2702d28..d0ac1ec063d46d4ff1367e87cf322870f7e49a9c 100644 (file)
--- a/randomforest.hpp
+++ b/randomforest.hpp
@@ -6,8 +6,8 @@
  //  Copyright (c) 2012 Schloss Lab. All rights reserved.
  //
  
-#ifndef rrf_fs_prototype_randomforest_hpp
-#define rrf_fs_prototype_randomforest_hpp
+#ifndef RF_RANDOMFOREST_HPP
+#define RF_RANDOMFOREST_HPP
  
  #include "macros.h"
  #include "forest.h"
@@ -17,20 +17,27 @@ class RandomForest: public Forest {
      
  public:
      
-    // DONE
-    RandomForest(const vector <vector<int> > dataSet,const int numDecisionTrees, const string);
+    RandomForest(const vector <vector<int> > dataSet,
+                 const int numDecisionTrees,
+                 const string treeSplitCriterion,
+                 const bool doPruning,
+                 const float pruneAggressiveness,
+                 const bool discardHighErrorTrees,
+                 const float highErrorTreeDiscardThreshold,
+                 const string optimumFeatureSubsetSelectionCriteria,
+                 const float featureStandardDeviationThreshold);
      
      
      //NOTE:: if you are going to dynamically cast, aren't you undoing the advantage of abstraction. Why abstract at all?
      //could cause maintenance issues later if other types of Abstract decison trees are created that cannot be cast as a decision tree.
-//    virtual ~RandomForest() {
-//        for (vector<AbstractDecisionTree*>::iterator it = decisionTrees.begin(); it != decisionTrees.end(); it++) {
-//            // we know that this is decision tree, so we can do a dynamic_case<DecisionTree*> here
-//            DecisionTree* decisionTree = dynamic_cast<DecisionTree*>(*it);
-//            // calling the destructor by deleting
-//            delete decisionTree;
-//        }
-//    }
+    virtual ~RandomForest() {
+        for (vector<AbstractDecisionTree*>::iterator it = decisionTrees.begin(); it != decisionTrees.end(); it++) {
+            // we know that this is decision tree, so we can do a dynamic_case<DecisionTree*> here
+            DecisionTree* decisionTree = dynamic_cast<DecisionTree*>(*it);
+            // calling the destructor by deleting
+            delete decisionTree;
+        }
+    }
      
      int calcForrestErrorRate();
      int calcForrestVariableImportance(string);
diff --git a/regularizedrandomforest.cpp b/regularizedrandomforest.cpp

index 9fa19aba00ceb68353f32936c21962e14791a173..0a33f4e3ba6ed2a79a301c2db5c8254ae153784c 100644 (file)
--- a/regularizedrandomforest.cpp
+++ b/regularizedrandomforest.cpp
@@ -8,8 +8,14 @@
  
  #include "regularizedrandomforest.h"
  
-RegularizedRandomForest::RegularizedRandomForest(const vector <vector<int> > dataSet,const int numDecisionTrees,
-                           const string treeSplitCriterion = "informationGain") : Forest(dataSet, numDecisionTrees, treeSplitCriterion) {
+RegularizedRandomForest::RegularizedRandomForest(const vector <vector<int> > dataSet,
+                                                 const int numDecisionTrees,
+                                                 const string treeSplitCriterion = "gainratio")
+                        // TODO: update ctor according to basic RandomForest Class
+                      : Forest(dataSet,
+                               numDecisionTrees,
+                               treeSplitCriterion,
+                               false, 0.9, true, 0.4, "log2", 0.0) {
      m = MothurOut::getInstance();
  }
  
diff --git a/renameseqscommand.cpp b/renameseqscommand.cpp

new file mode 100644 (file)

index 0000000..449454b
--- /dev/null
+++ b/renameseqscommand.cpp
@@ -0,0 +1,343 @@
+//
+//  renameseqscommand.cpp
+//  Mothur
+//
+//  Created by SarahsWork on 5/28/13.
+//  Copyright (c) 2013 Schloss Lab. All rights reserved.
+//
+
+#include "renameseqscommand.h"
+#include "sequence.hpp"
+#include "groupmap.h"
+
+//**********************************************************************************************************************
+vector<string> RenameSeqsCommand::setParameters(){
+       try {
+               CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,true,true); parameters.push_back(pfasta);
+        CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","name",false,false,true); parameters.push_back(pname);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","group",false,false,true); parameters.push_back(pgroup);
+               CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
+               CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
+               
+               vector<string> myArray;
+               for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
+               return myArray;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "RenameSeqsCommand", "setParameters");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+string RenameSeqsCommand::getHelpString(){
+       try {
+               string helpString = "";
+               helpString += "The rename.seqs command reads a fastafile and groupfile with an optional namefile, and creates files with the sequence names concatenated with the group. For example if a line in the group file is 'seq1   group1', the new sequence name will be seq1_group1.\n";
+               helpString += "The rename.seqs command parameters are fasta, name and group. Fasta and group are required, unless a current file is available for both.\n";
+               helpString += "The rename.seqs command should be in the following format: \n";
+               helpString += "rename.seqs(fasta=yourFastaFile, group=yourGroupFile) \n";
+               helpString += "Example rename.seqs(fasta=abrecovery.unique.fasta, group=abrecovery.group).\n";
+               helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
+               return helpString;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "RenameSeqsCommand", "getHelpString");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+string RenameSeqsCommand::getOutputPattern(string type) {
+    try {
+        string pattern = "";
+        
+        if (type == "fasta")        {  pattern = "[filename],renamed,[extension]"; }
+        else if (type == "name")    {  pattern = "[filename],renamed,[extension]"; }
+        else if (type == "group")   {  pattern = "[filename],renamed,[extension]"; }
+        else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
+        
+        return pattern;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "RenameSeqsCommand", "getOutputPattern");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+RenameSeqsCommand::RenameSeqsCommand(){
+       try {
+               abort = true; calledHelp = true;
+               setParameters();
+               vector<string> tempOutNames;
+               outputTypes["fasta"] = tempOutNames;
+        outputTypes["name"] = tempOutNames;
+        outputTypes["group"] = tempOutNames;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "RenameSeqsCommand", "RenameSeqsCommand");
+               exit(1);
+       }
+}
+/**************************************************************************************/
+RenameSeqsCommand::RenameSeqsCommand(string option)  {
+       try {
+               abort = false; calledHelp = false;
+               
+               //allow user to run help
+               if(option == "help") { help(); abort = true; calledHelp = true; }
+               else if(option == "citation") { citation(); abort = true; calledHelp = true;}
+               
+               else {
+                       vector<string> myArray = setParameters();
+                       
+                       OptionParser parser(option);
+                       map<string,string> parameters = parser.getParameters();
+                       
+                       ValidParameters validParameter;
+                       map<string, string>::iterator it;
+            
+                       //check to make sure all parameters are valid for command
+                       for (it = parameters.begin(); it != parameters.end(); it++) {
+                               if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
+                       }
+                       
+                       vector<string> tempOutNames;
+            outputTypes["fasta"] = tempOutNames;
+            outputTypes["name"] = tempOutNames;
+            outputTypes["group"] = tempOutNames;
+            
+                       //if the user changes the input directory command factory will send this info to us in the output parameter
+                       string inputDir = validParameter.validFile(parameters, "inputdir", false);
+                       if (inputDir == "not found"){   inputDir = "";          }
+                       else {
+                               string path;
+                               it = parameters.find("fasta");
+                               //user has given a template file
+                               if(it != parameters.end()){
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
+                               }
+                               
+                               it = parameters.find("name");
+                               //user has given a template file
+                               if(it != parameters.end()){
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["name"] = inputDir + it->second;             }
+                               }
+                
+                it = parameters.find("group");
+                               //user has given a template file
+                               if(it != parameters.end()){
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["group"] = inputDir + it->second;            }
+                               }
+
+                       }
+            
+                       
+                       //check for required parameters
+                       fastaFile = validParameter.validFile(parameters, "fasta", true);
+                       if (fastaFile == "not open") { abort = true; }
+                       else if (fastaFile == "not found") {
+                               fastaFile = m->getFastaFile();
+                               if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
+                               else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
+                       }else { m->setFastaFile(fastaFile); }
+                       
+                       //if the user changes the output directory command factory will send this info to us in the output parameter
+                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){
+                               outputDir = "";
+                               outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it
+                       }
+                       
+            groupfile = validParameter.validFile(parameters, "group", true);
+            if (groupfile == "not open") { abort = true; }
+                       else if (groupfile == "not found") {
+                               groupfile = m->getGroupFile();
+                               if (groupfile != "") { m->mothurOut("Using " + groupfile + " as input file for the group parameter."); m->mothurOutEndLine(); }
+                               else {  m->mothurOut("You have no current groupfile and the group parameter is required."); m->mothurOutEndLine(); abort = true; }
+                       }else { m->setGroupFile(groupfile); }
+                       
+                       //if the user changes the output directory command factory will send this info to us in the output parameter
+                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){ outputDir = ""; }
+
+                       nameFile = validParameter.validFile(parameters, "name", true);
+                       if (nameFile == "not open") { abort = true; }
+                       else if (nameFile == "not found"){ nameFile =""; }
+                       else { m->setNameFile(nameFile); }
+            
+            if (nameFile == "") {
+                vector<string> files; files.push_back(fastaFile);
+                parser.getNameFile(files);
+            }
+
+               }
+        
+       }
+       catch(exception& e) {
+               m->errorOut(e, "RenameSeqsCommand", "RenameSeqsCommand");
+               exit(1);
+       }
+}
+/**************************************************************************************/
+int RenameSeqsCommand::execute() {
+       try {
+               
+               if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
+        
+        GroupMap groupMap(groupfile);
+        groupMap.readMap();
+                
+               //prepare filenames and open files
+        string thisOutputDir = outputDir;
+               if (outputDir == "") {  thisOutputDir += m->hasPath(fastaFile);  }
+               string outFastaFile = thisOutputDir + m->getRootName(m->getSimpleName(fastaFile));
+        map<string, string> variables;
+        variables["[filename]"] = outFastaFile;
+        variables["[extension]"] = m->getExtension(fastaFile);
+        outFastaFile = getOutputFileName("fasta", variables);
+        outputNames.push_back(outFastaFile); outputTypes["fasta"].push_back(outFastaFile);
+        
+        ofstream outFasta;
+               m->openOutputFile(outFastaFile, outFasta);
+        
+        ifstream in;
+        m->openInputFile(fastaFile, in);
+        
+        while (!in.eof()) {
+            if (m->control_pressed) { break; }
+            
+            Sequence seq(in); m->gobble(in);
+            string group = groupMap.getGroup(seq.getName());
+            if (group == "not found") {  m->mothurOut("[ERROR]: " + seq.getName() + " is not in your group file, please correct.\n"); m->control_pressed = true; }
+            else {
+                string newName = seq.getName() + "_" + group;
+                seq.setName(newName);
+                seq.printSequence(outFasta);
+            }
+            
+        }
+        in.close();
+        
+        if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]);  } return 0; }
+        
+        bool notDone = true;
+        if (nameFile != "") {
+            thisOutputDir = outputDir;
+            if (outputDir == "") {  thisOutputDir += m->hasPath(nameFile);  }
+            string outNameFile = thisOutputDir + m->getRootName(m->getSimpleName(nameFile));
+            variables["[filename]"] = outNameFile;
+            variables["[extension]"] = m->getExtension(nameFile);
+            outNameFile = getOutputFileName("group", variables);
+            outputNames.push_back(outNameFile); outputTypes["name"].push_back(outNameFile);
+            
+            ofstream outName;
+            m->openOutputFile(outNameFile, outName);
+            
+            map<string, vector<string> > nameMap;
+            m->readNames(nameFile, nameMap);
+            
+            //process name file changing names
+            for (map<string, vector<string> >::iterator it = nameMap.begin(); it != nameMap.end(); it++) {
+                for (int i = 0; i < (it->second).size()-1; i++) {
+                    if (m->control_pressed) { break; }
+                    string group = groupMap.getGroup((it->second)[i]);
+                    if (group == "not found") {  m->mothurOut("[ERROR]: " + (it->second)[i] + " is not in your group file, please correct.\n"); m->control_pressed = true;  }
+                    else {
+                        string newName = (it->second)[i] + "_" + group;
+                        groupMap.renameSeq((it->second)[i], newName); //change in group file
+                        (it->second)[i] = newName; //change in namefile
+                    }
+                    if (i == 0) {  outName << (it->second)[i] << '\t' << (it->second)[i] << ','; }
+                    else { outName << (it->second)[i] << ','; }
+                }
+                
+                //print last one
+                if ((it->second).size() == 1) {
+                    string group = groupMap.getGroup((it->second)[0]);
+                    if (group == "not found") {  m->mothurOut("[ERROR]: " + (it->second)[0] + " is not in your group file, please correct.\n"); m->control_pressed = true;  }
+                    else {
+                        string newName = (it->second)[0] + "_" + group;
+                        groupMap.renameSeq((it->second)[0], newName); //change in group file
+                        (it->second)[0] = newName; //change in namefile
+
+                        outName << (it->second)[0] << '\t' << (it->second)[0] << endl;
+                    }
+                }
+                else {
+                    string group = groupMap.getGroup((it->second)[(it->second).size()-1]);
+                    if (group == "not found") {  m->mothurOut("[ERROR]: " + (it->second)[(it->second).size()-1] + " is not in your group file, please correct.\n"); m->control_pressed = true;  }
+                    else {
+                        string newName = (it->second)[(it->second).size()-1] + "_" + group;
+                        groupMap.renameSeq((it->second)[(it->second).size()-1], newName); //change in group file
+                        (it->second)[(it->second).size()-1] = newName; //change in namefile
+
+                        outName << (it->second)[(it->second).size()-1] << endl;
+                    }
+                }
+            }
+            notDone = false;
+            outName.close();
+        }
+        
+        if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]);  } return 0; }
+        
+        if (notDone) {
+            vector<string> seqs = groupMap.getNamesSeqs();
+            for (int i = 0; i < seqs.size(); i++) {
+                if (m->control_pressed) { break; }
+                string group = groupMap.getGroup(seqs[i]);
+                string newName = seqs[i] + "_" + group;
+                groupMap.renameSeq(seqs[i], newName);
+            }
+        }
+        if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]);  } return 0; }
+        
+        thisOutputDir = outputDir;
+        if (outputDir == "") {  thisOutputDir += m->hasPath(groupfile);  }
+               string outGroupFile = thisOutputDir + m->getRootName(m->getSimpleName(groupfile));
+        variables["[filename]"] = outGroupFile;
+        variables["[extension]"] = m->getExtension(groupfile);
+        outGroupFile = getOutputFileName("group", variables);
+        outputNames.push_back(outGroupFile); outputTypes["group"].push_back(outGroupFile);
+        
+        ofstream outGroup;
+               m->openOutputFile(outGroupFile, outGroup);
+        groupMap.print(outGroup);
+        outGroup.close();
+        
+        if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]);  } return 0; }
+
+        m->mothurOutEndLine();
+        m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+        for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
+        m->mothurOutEndLine();
+        
+        //set fasta file as new current fastafile
+        string current = "";
+        itTypes = outputTypes.find("fasta");
+        if (itTypes != outputTypes.end()) {
+            if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
+        }
+        
+        itTypes = outputTypes.find("name");
+        if (itTypes != outputTypes.end()) {
+            if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
+        }
+        
+        itTypes = outputTypes.find("group");
+        if (itTypes != outputTypes.end()) {
+            if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
+        }
+                               
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "RenameSeqsCommand", "execute");
+               exit(1);
+       }
+}
+/**************************************************************************************/
+
diff --git a/renameseqscommand.h b/renameseqscommand.h

new file mode 100644 (file)

index 0000000..a492404
--- /dev/null
+++ b/renameseqscommand.h
@@ -0,0 +1,46 @@
+//
+//  renameseqscommand.h
+//  Mothur
+//
+//  Created by SarahsWork on 5/28/13.
+//  Copyright (c) 2013 Schloss Lab. All rights reserved.
+//
+
+#ifndef Mothur_renameseqscommand_h
+#define Mothur_renameseqscommand_h
+
+#include "command.hpp"
+
+class RenameSeqsCommand : public Command {
+    
+public:
+       RenameSeqsCommand(string);
+       RenameSeqsCommand();
+       ~RenameSeqsCommand() {}
+       
+       vector<string> setParameters();
+       string getCommandName()                 { return "rename.seqs";         }
+       string getCommandCategory()             { return "Sequence Processing";         }
+       
+       string getHelpString();
+    string getOutputPattern(string);
+       string getCitation() { return "http://www.mothur.org/wiki/Rename.seqs"; }
+       string getDescription()         { return "rename sequences"; }
+    
+       
+       int execute();
+       void help() { m->mothurOut(getHelpString()); }
+       
+       
+private:
+    
+       string fastaFile, nameFile, groupfile, outputDir;
+       vector<string> outputNames;
+       bool abort;
+       
+       map<string, string> nameMap;
+};
+
+
+
+#endif
diff --git a/rftreenode.cpp b/rftreenode.cpp

index 170cfb16f10d5c87b98efb55969ec733eae7d826..acfae544aa7660215d71643fc741d9a65cb2dfd1 100644 (file)
--- a/rftreenode.cpp
+++ b/rftreenode.cpp
@@ -10,29 +10,35 @@
  
  /***********************************************************************/
  RFTreeNode::RFTreeNode(vector< vector<int> > bootstrappedTrainingSamples,
-           vector<int> globalDiscardedFeatureIndices,
-           int numFeatures,
-           int numSamples,
-           int numOutputClasses,
-           int generation)
+                       vector<int> globalDiscardedFeatureIndices,
+                       int numFeatures,
+                       int numSamples,
+                       int numOutputClasses,
+                       int generation,
+                       int nodeId,
+                       float featureStandardDeviationThreshold)
  
-: bootstrappedTrainingSamples(bootstrappedTrainingSamples),
-globalDiscardedFeatureIndices(globalDiscardedFeatureIndices),
-numFeatures(numFeatures),
-numSamples(numSamples),
-numOutputClasses(numOutputClasses),
-generation(generation),
-isLeaf(false),
-outputClass(-1),
-splitFeatureIndex(-1),
-splitFeatureValue(-1),
-splitFeatureEntropy(-1.0),
-ownEntropy(-1.0),
-bootstrappedFeatureVectors(numFeatures, vector<int>(numSamples, 0)),
-bootstrappedOutputVector(numSamples, 0),
-leftChildNode(NULL),
-rightChildNode(NULL),
-parentNode(NULL) {
+            : bootstrappedTrainingSamples(bootstrappedTrainingSamples),
+            globalDiscardedFeatureIndices(globalDiscardedFeatureIndices),
+            numFeatures(numFeatures),
+            numSamples(numSamples),
+            numOutputClasses(numOutputClasses),
+            generation(generation),
+            isLeaf(false),
+            outputClass(-1),
+            nodeId(nodeId),
+            testSampleMisclassificationCount(0),
+            splitFeatureIndex(-1),
+            splitFeatureValue(-1),
+            splitFeatureEntropy(-1.0),
+            ownEntropy(-1.0),
+            featureStandardDeviationThreshold(featureStandardDeviationThreshold),
+            bootstrappedFeatureVectors(numFeatures, vector<int>(numSamples, 0)),
+            bootstrappedOutputVector(numSamples, 0),
+            leftChildNode(NULL),
+            rightChildNode(NULL),
+            parentNode(NULL) {
+                
      m = MothurOut::getInstance();
      
      for (int i = 0; i < numSamples; i++) {    // just doing a simple transpose of the matrix
@@ -40,7 +46,8 @@ parentNode(NULL) {
          for (int j = 0; j < numFeatures; j++) { bootstrappedFeatureVectors[j][i] = bootstrappedTrainingSamples[i][j]; }
      }
      
-    for (int i = 0; i < numSamples; i++) { if (m->control_pressed) { break; } bootstrappedOutputVector[i] = bootstrappedTrainingSamples[i][numFeatures]; }
+    for (int i = 0; i < numSamples; i++) { if (m->control_pressed) { break; }
+        bootstrappedOutputVector[i] = bootstrappedTrainingSamples[i][numFeatures]; }
      
      createLocalDiscardedFeatureList();
      updateNodeEntropy();
@@ -48,13 +55,14 @@ parentNode(NULL) {
  /***********************************************************************/
  int RFTreeNode::createLocalDiscardedFeatureList(){
      try {
-
+        
          for (int i = 0; i < numFeatures; i++) {
+                // TODO: need to check if bootstrappedFeatureVectors == numFeatures, in python code we are using bootstrappedFeatureVectors instead of numFeatures
              if (m->control_pressed) { return 0; } 
              vector<int>::iterator it = find(globalDiscardedFeatureIndices.begin(), globalDiscardedFeatureIndices.end(), i);
-            if (it == globalDiscardedFeatureIndices.end()){                           // NOT FOUND
+            if (it == globalDiscardedFeatureIndices.end()) {                           // NOT FOUND
                  double standardDeviation = m->getStandardDeviation(bootstrappedFeatureVectors[i]);  
-                if (standardDeviation <= 0){ localDiscardedFeatureIndices.push_back(i); }
+                if (standardDeviation <= featureStandardDeviationThreshold) { localDiscardedFeatureIndices.push_back(i); }
              }
          }
          
@@ -70,7 +78,9 @@ int RFTreeNode::updateNodeEntropy() {
      try {
          
          vector<int> classCounts(numOutputClasses, 0);
-        for (int i = 0; i < bootstrappedOutputVector.size(); i++) { classCounts[bootstrappedOutputVector[i]]++; }
+        for (int i = 0; i < bootstrappedOutputVector.size(); i++) {
+            classCounts[bootstrappedOutputVector[i]]++;
+        }
          int totalClassCounts = accumulate(classCounts.begin(), classCounts.end(), 0);
          double nodeEntropy = 0.0;
          for (int i = 0; i < classCounts.size(); i++) {
diff --git a/rftreenode.hpp b/rftreenode.hpp

index 8987ebcec58b04d8eaaff87e86f514887a4bfd17..d53cc2b614db5b1e9e8d8626127cd58d14bb0c0e 100755 (executable)
--- a/rftreenode.hpp
+++ b/rftreenode.hpp
@@ -6,8 +6,8 @@
  //  Copyright (c) 2012 Schloss Lab. All rights reserved.
  //
  
-#ifndef rrf_fs_prototype_treenode_hpp
-#define rrf_fs_prototype_treenode_hpp
+#ifndef RF_RFTREENODE_HPP
+#define RF_RFTREENODE_HPP
  
  #include "mothurout.h"
  #include "macros.h"
@@ -16,7 +16,14 @@ class RFTreeNode{
      
  public:
      
-    RFTreeNode(vector< vector<int> > bootstrappedTrainingSamples, vector<int> globalDiscardedFeatureIndices, int numFeatures, int numSamples, int numOutputClasses, int generation);
+    RFTreeNode(vector< vector<int> > bootstrappedTrainingSamples,
+               vector<int> globalDiscardedFeatureIndices,
+               int numFeatures,
+               int numSamples,
+               int numOutputClasses,
+               int generation,
+               int nodeId,
+               float featureStandardDeviationThreshold = 0.0);
      
      virtual ~RFTreeNode(){}
      
@@ -41,6 +48,7 @@ public:
      const vector<int>& getBootstrappedOutputVector() { return bootstrappedOutputVector; }
      const vector<int>& getFeatureSubsetIndices() { return featureSubsetIndices; }
      const double getOwnEntropy() { return ownEntropy; }
+    const int getTestSampleMisclassificationCount() { return testSampleMisclassificationCount; }
      
      // setters
      void setIsLeaf(bool isLeaf) { this->isLeaf = isLeaf; }
@@ -77,6 +85,10 @@ private:
      double splitFeatureEntropy;
      double ownEntropy;
      
+    int nodeId;
+    float featureStandardDeviationThreshold;
+    int testSampleMisclassificationCount;
+    
      RFTreeNode* leftChildNode;
      RFTreeNode* rightChildNode;
      RFTreeNode* parentNode;
diff --git a/screenseqscommand.cpp b/screenseqscommand.cpp

index 051aa99434f6bb21a8d3b3b54cb2cc671eab98d2..e7b2c6acedf523f8249c8159ed186a6b8d5b8892 100644 (file)
--- a/screenseqscommand.cpp
+++ b/screenseqscommand.cpp
@@ -817,7 +817,7 @@ int ScreenSeqsCommand::screenAlignReport(map<string, string>& badSeqNames){
              m->mothurRemove(outSummary+".temp");
          }
          
-        if (numFastaSeqs != count) {  m->mothurOut("[ERROR]: found " + toString(numFastaSeqs) + " sequences in your fasta file, and " + toString(count) + " sequences in your contigs report file, quitting.\n"); m->control_pressed = true; }
+        if (numFastaSeqs != count) {  m->mothurOut("[ERROR]: found " + toString(numFastaSeqs) + " sequences in your fasta file, and " + toString(count) + " sequences in your align report file, quitting.\n"); m->control_pressed = true; }
          
          
          return count;
@@ -1820,7 +1820,6 @@ int ScreenSeqsCommand::getSummary(vector<unsigned long long>& positions){
                 vector<int> longHomoPolymer;
          vector<int> numNs;
                 
-        vector<unsigned long long> positions;
  #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
                 positions = m->divideFile(fastafile, processors);
                 for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }   
@@ -1845,8 +1844,9 @@ int ScreenSeqsCommand::getSummary(vector<unsigned long long>& positions){
                 int pid;
                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
                 
-               if (pid == 0) { 
-                       driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, numNs, fastafile, lines[0]);
+               if (pid == 0) {
+            linePair tempLine(0, positions[positions.size()-1]);
+                       driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, numNs, fastafile, tempLine);
  #else
                 int numSeqs = 0;
                 //#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
diff --git a/setdircommand.cpp b/setdircommand.cpp

index dc4c9ac3baa8dcaafadc9e825ceb598ab4c1cabf..42e49a1be039ce16f6ea412ca8b9de15fea106f1 100644 (file)
--- a/setdircommand.cpp
+++ b/setdircommand.cpp
@@ -37,7 +37,7 @@ string SetDirectoryCommand::getHelpString(){
                 helpString += "The set.dir command can also be used to specify the directory where your input files are located, the directory must exist.\n";
                 helpString += "The set.dir command can also be used to override or set the default location mothur will look for files if it is unable to find them, the directory must exist.\n";
          helpString += "The set.dir command can also be used to run mothur in debug mode.\n";
-        helpString += "The set.dir command can also be used to set the modifynames parameter. Default=t, meaning if your sequence names contain ':' change them to '_' to aviod issues while making trees.  modifynames=F will leave sequence names as they are.\n";
+        helpString += "The set.dir command can also be used to set the modifynames parameter. Default=t, meaning if your sequence names contain ':' change them to '_' to avoid issues while making trees.  modifynames=F will leave sequence names as they are.\n";
                 helpString += "The set.dir command parameters are input, output, tempdefault and debug and one is required.\n";
          helpString += "To run mothur in debug mode set debug=true. Default debug=false.\n";
                 helpString += "To return the output to the same directory as the input files you may enter: output=clear.\n";
@@ -121,67 +121,67 @@ int SetDirectoryCommand::execute(){
                 
                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
                 
-        if (debugOnly) { return 0; }
-        
-               commandFactory = CommandFactory::getInstance();
-               
-               m->mothurOut("Mothur's directories:"); m->mothurOutEndLine();
-               
-               //redirect output
-               if ((output == "clear") || (output == "")) {  output = "";  commandFactory->setOutputDirectory(output);  }
-               else if (output == "default") { 
-                       string exepath = m->argv;
-                       output = exepath.substr(0, (exepath.find_last_of('m')));
-                       
-                       m->mothurOut("outputDir=" + output); m->mothurOutEndLine();  
-                       commandFactory->setOutputDirectory(output);
-               }else {
-            if (m->dirCheck(output)) {
-                m->mothurOut("outputDir=" + output); m->mothurOutEndLine();  
-                               commandFactory->setOutputDirectory(output);
+        if (debugOnly) {  }
+        else {
+            commandFactory = CommandFactory::getInstance();
+            
+            m->mothurOut("Mothur's directories:"); m->mothurOutEndLine();
+            
+            //redirect output
+            if ((output == "clear") || (output == "")) {  output = "";  commandFactory->setOutputDirectory(output);  }
+            else if (output == "default") {
+                string exepath = m->argv;
+                output = exepath.substr(0, (exepath.find_last_of('m')));
+                
+                m->mothurOut("outputDir=" + output); m->mothurOutEndLine();
+                commandFactory->setOutputDirectory(output);
+            }else {
+                if (m->dirCheck(output)) {
+                    m->mothurOut("outputDir=" + output); m->mothurOutEndLine();
+                    commandFactory->setOutputDirectory(output);
+                }
              }
-               }
-               
-               //redirect input
-               if ((input == "clear") || (input == "")) {  input = "";  commandFactory->setInputDirectory(input);  }
-               else if (input == "default") { 
-                       string exepath = m->argv;
-                       input = exepath.substr(0, (exepath.find_last_of('m')));
-                       
-                       m->mothurOut("inputDir=" + input); m->mothurOutEndLine();  
-                       commandFactory->setInputDirectory(input);
-               }else {
-            if (m->dirCheck(input)) {
-                m->mothurOut("inputDir=" + input); m->mothurOutEndLine();  
-                               commandFactory->setInputDirectory(input); 
+            
+            //redirect input
+            if ((input == "clear") || (input == "")) {  input = "";  commandFactory->setInputDirectory(input);  }
+            else if (input == "default") {
+                string exepath = m->argv;
+                input = exepath.substr(0, (exepath.find_last_of('m')));
+                
+                m->mothurOut("inputDir=" + input); m->mothurOutEndLine();
+                commandFactory->setInputDirectory(input);
+            }else {
+                if (m->dirCheck(input)) {
+                    m->mothurOut("inputDir=" + input); m->mothurOutEndLine();
+                    commandFactory->setInputDirectory(input);
+                }
              }
-        }
-               
-               //set default
-               if (tempdefault == "clear") {  
-                       #ifdef MOTHUR_FILES
-                               string temp = MOTHUR_FILES; 
-                               m->mothurOut("tempDefault=" + temp); m->mothurOutEndLine();  
+            
+            //set default
+            if (tempdefault == "clear") {
+#ifdef MOTHUR_FILES
+                               string temp = MOTHUR_FILES;
+                               m->mothurOut("tempDefault=" + temp); m->mothurOutEndLine();
                                 m->setDefaultPath(temp);
-                       #else
-                               string temp = ""; 
-                               m->mothurOut("No default directory defined at compile time."); m->mothurOutEndLine();  
+#else
+                               string temp = "";
+                               m->mothurOut("No default directory defined at compile time."); m->mothurOutEndLine();
                                 m->setDefaultPath(temp);
-                       #endif
-               }else if (tempdefault == "") {  //do nothing
-               }else if (tempdefault == "default") { 
-                       string exepath = m->argv;
-                       tempdefault = exepath.substr(0, (exepath.find_last_of('m')));
-                       
-                       m->mothurOut("tempDefault=" + tempdefault); m->mothurOutEndLine();  
-                       m->setDefaultPath(tempdefault);
-               }else {
-            if (m->dirCheck(tempdefault)) {
+#endif
+            }else if (tempdefault == "") {  //do nothing
+            }else if (tempdefault == "default") {
+                string exepath = m->argv;
+                tempdefault = exepath.substr(0, (exepath.find_last_of('m')));
+                
                  m->mothurOut("tempDefault=" + tempdefault); m->mothurOutEndLine();  
-                               m->setDefaultPath(tempdefault); 
+                m->setDefaultPath(tempdefault);
+            }else {
+                if (m->dirCheck(tempdefault)) {
+                    m->mothurOut("tempDefault=" + tempdefault); m->mothurOutEndLine();  
+                    m->setDefaultPath(tempdefault); 
+                }
              }
          }
-
                 return 0;
         }
         catch(exception& e) {
diff --git a/setlogfilecommand.cpp b/setlogfilecommand.cpp

index 128c5671ed55712d2293e0b3c3959da17a7e9f05..41c8e743122399563b8367fda6a9d4a94f9bda86 100644 (file)
--- a/setlogfilecommand.cpp
+++ b/setlogfilecommand.cpp
@@ -85,9 +85,14 @@ int SetLogFileCommand::execute(){
                 
                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
                 
-               commandFactory = CommandFactory::getInstance();
-               
-               commandFactory->setLogfileName(name, append);
+        commandFactory = CommandFactory::getInstance();
+        
+        string directory = m->hasPath(name);
+        if (directory == "") {
+            commandFactory->setLogfileName(name, append);
+        }else if (m->dirCheck(directory)) {
+             commandFactory->setLogfileName(name, append);
+        }
                 
                 return 0;
         }
diff --git a/sffmultiplecommand.cpp b/sffmultiplecommand.cpp

index 41002e7754dd97c57e6ab075b5e7c26380a40104..00f23ed24cc83e8eddf66f328a1fe3ce765e122d 100644 (file)
--- a/sffmultiplecommand.cpp
+++ b/sffmultiplecommand.cpp
@@ -236,11 +236,11 @@ SffMultipleCommand::SffMultipleCommand(string option)  {
              if (temp.length() > 1) {  m->mothurOut("[ERROR]: " + temp + " is not a valid option for order. order options are A, B, or I. A = TACG, B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC, and I = TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC.\n");  abort=true;
              }
              else {
-                if (toupper(temp[0]) == 'A') {  flowOrder = "TACG";   }
+                if (toupper(temp[0]) == 'A') {  flowOrder = "A";   }
                  else if(toupper(temp[0]) == 'B'){
-                    flowOrder = "TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC";   }
+                    flowOrder = "B";   }
                  else if(toupper(temp[0]) == 'I'){
-                    flowOrder = "TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC";   }
+                    flowOrder = "I";   }
                  else {
                      m->mothurOut("[ERROR]: " + temp + " is not a valid option for order. order options are A, B, or I. A = TACG, B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC, and I = TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC.\n");  abort=true;
                  }
@@ -378,9 +378,9 @@ int SffMultipleCommand::execute(){
          string fileroot = outputDir + m->getRootName(m->getSimpleName(filename));
          map<string, string> variables; 
                 variables["[filename]"] = fileroot;
-        string fasta = fileroot + getOutputFileName("fasta",variables);
-        string name = fileroot + getOutputFileName("name",variables);
-        string group = fileroot + getOutputFileName("group",variables);
+        string fasta = getOutputFileName("fasta",variables);
+        string name = getOutputFileName("name",variables);
+        string group = getOutputFileName("group",variables);
          
          if (m->control_pressed) { return 0; }
          
diff --git a/sparcccommand.h b/sparcccommand.h

index 8add3eded2682f6e7568670e3c155d0fa40d1356..73e9f8a593072a34ba7f1f5b4efd6f5d81e972fc 100644 (file)
--- a/sparcccommand.h
+++ b/sparcccommand.h
@@ -29,7 +29,7 @@ public:
      //commmand category choices: Sequence Processing, OTU-Based Approaches, Hypothesis Testing, Phylotype Analysis, General, Clustering and Hidden
         string getHelpString();
      string getCitation() { return "Friedman J, Alm EJ (2012) Inferring Correlation Networks from Genomic Survey Data. PLoS Comput Biol 8(9): e1002687. doi:10.1371/journal.pcbi.1002687 http://www.mothur.org/wiki/Sparcc"; }
-    string getDescription()            { return "brief description"; }
+    string getDescription()            { return "Calculates correlations between OTUs using a method that is insensitive to the use of relative abundance data"; }
      
      int execute();
      void help() { m->mothurOut(getHelpString()); }
diff --git a/trimoligos.cpp b/trimoligos.cpp

index 5d375a75150c1cc86c5b6c39e3ac21cdeeb7f110..ddc1053f50d3eba28064907e4c6f0a60a958b3ee 100644 (file)
--- a/trimoligos.cpp
+++ b/trimoligos.cpp
@@ -431,10 +431,10 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, int& gr
                  break;
              }
              
-            if((compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length())))) {
+            if((compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawRSequence.substr(0,roligo.length())))) {
                  group = it->first;
                  forwardSeq.setUnaligned(rawFSequence.substr(foligo.length()));
-                reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-roligo.length())));
+                reverseSeq.setUnaligned(rawRSequence.substr(roligo.length()));
                  success = 0;
                  break;
              }
@@ -577,19 +577,6 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, int& gr
                      
                  }
                  
-                /*cout << minDiff << '\t' << minCount << '\t' << endl;
-                 for (int i = 0; i < minFGroup.size(); i++) {
-                 cout << i << '\t';
-                 for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; }
-                 cout << endl;
-                 }
-                 cout << endl;
-                 for (int i = 0; i < minRGroup.size(); i++) {
-                 cout << i << '\t';
-                 for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; }
-                 cout << endl;
-                 }
-                 cout << endl;*/
                  if(minDiff > bdiffs)   {       success = minDiff;      }       //no good matches
                  else {
                      bool foundMatch = false;
@@ -654,12 +641,12 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, Quality
                  break;
              }
              
-            if((compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length())))) {
+            if((compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawRSequence.substr(0,roligo.length())))) {
                  group = it->first;
                  forwardSeq.setUnaligned(rawFSequence.substr(foligo.length()));
-                reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-roligo.length())));
+                reverseSeq.setUnaligned(rawRSequence.substr(roligo.length()));
                  forwardQual.trimQScores(foligo.length(), -1);
-                reverseQual.trimQScores(-1, rawRSequence.length()-roligo.length());
+                reverseQual.trimQScores(roligo.length(), -1);
                  success = 0;
                  break;
              }
@@ -802,19 +789,6 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, Quality
                      
                  }
                  
-                /*cout << minDiff << '\t' << minCount << '\t' << endl;
-                 for (int i = 0; i < minFGroup.size(); i++) {
-                 cout << i << '\t';
-                 for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; }
-                 cout << endl;
-                 }
-                 cout << endl;
-                 for (int i = 0; i < minRGroup.size(); i++) {
-                 cout << i << '\t';
-                 for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; }
-                 cout << endl;
-                 }
-                 cout << endl;*/
                  if(minDiff > bdiffs)   {       success = minDiff;      }       //no good matches
                  else {
                      bool foundMatch = false;
@@ -881,6 +855,8 @@ int TrimOligos::stripPairedBarcode(Sequence& seq, QualityScores& qual, int& grou
                  break;
              }
              
+            if (rawSeq.length() < (foligo.length() + roligo.length())) {  success = pdiffs + 10; break; }
+            
              if((compareDNASeq(foligo, rawSeq.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawSeq.substr(rawSeq.length()-roligo.length(),roligo.length())))) {
                  group = it->first;
                  string trimmedSeq = rawSeq.substr(foligo.length()); //trim forward barcode
@@ -1033,19 +1009,6 @@ int TrimOligos::stripPairedBarcode(Sequence& seq, QualityScores& qual, int& grou
                      
                  }
                  
-                /*cout << minDiff << '\t' << minCount << '\t' << endl;
-                 for (int i = 0; i < minFGroup.size(); i++) {
-                 cout << i << '\t';
-                 for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; }
-                 cout << endl;
-                 }
-                 cout << endl;
-                 for (int i = 0; i < minRGroup.size(); i++) {
-                 cout << i << '\t';
-                 for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; }
-                 cout << endl;
-                 }
-                 cout << endl;*/
                  if(minDiff > bdiffs)   {       success = minDiff;      }       //no good matches
                  else {
                      bool foundMatch = false;
@@ -1065,6 +1028,7 @@ int TrimOligos::stripPairedBarcode(Sequence& seq, QualityScores& qual, int& grou
                          group = matches[0];
                          fStart = minFPos[0];
                          rStart = rawSeq.length() - minRPos[0];
+                        if (fStart > rStart) { foundMatch = false; } //only barcodes not a good sequence
                      }
                      
                      //we have an acceptable match for the forward and reverse, but do they match?
@@ -1116,6 +1080,8 @@ int TrimOligos::stripPairedPrimers(Sequence& seq, QualityScores& qual, int& grou
                  break;
              }
              
+            if (rawSeq.length() < (foligo.length() + roligo.length())) {  success = pdiffs + 10; break; }
+            
              if((compareDNASeq(foligo, rawSeq.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawSeq.substr(rawSeq.length()-roligo.length(),roligo.length())))) {
                  group = it->first;
                  if (!keepForward) {
@@ -1165,7 +1131,7 @@ int TrimOligos::stripPairedPrimers(Sequence& seq, QualityScores& qual, int& grou
                      success = pdiffs + 10;
                      break;
                  }
-                //cout << "before = " << oligo << '\t' << rawSeq.substr(0,oligo.length()+bdiffs) << endl;
+                //cout << "before = " << oligo << '\t' << rawSeq.substr(0,oligo.length()+pdiffs) << endl;
                  //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
                  alignment->alignPrimer(oligo, rawSeq.substr(0,oligo.length()+pdiffs));
                  oligo = alignment->getSeqAAln();
@@ -1270,19 +1236,6 @@ int TrimOligos::stripPairedPrimers(Sequence& seq, QualityScores& qual, int& grou
                      
                  }
                  
-                /*cout << minDiff << '\t' << minCount << '\t' << endl;
-                 for (int i = 0; i < minFGroup.size(); i++) {
-                 cout << i << '\t';
-                 for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; }
-                 cout << endl;
-                 }
-                 cout << endl;
-                 for (int i = 0; i < minRGroup.size(); i++) {
-                 cout << i << '\t';
-                 for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; }
-                 cout << endl;
-                 }
-                 cout << endl;*/
                  if(minDiff > pdiffs)   {       success = minDiff;      }       //no good matches
                  else {
                      bool foundMatch = false;
@@ -1302,6 +1255,7 @@ int TrimOligos::stripPairedPrimers(Sequence& seq, QualityScores& qual, int& grou
                          group = matches[0];
                          fStart = minFPos[0];
                          rStart = rawSeq.length() - minRPos[0];
+                        if (fStart > rStart) { foundMatch = false; } //only primers not a good sequence
                      }
                      
                      //we have an acceptable match for the forward and reverse, but do they match?
@@ -1355,12 +1309,12 @@ int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, Quality
                  break;
              }
              
-            if((compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length())))) {
+            if((compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawRSequence.substr(0,roligo.length())))) {
                  group = it->first;
                  forwardSeq.setUnaligned(rawFSequence.substr(foligo.length()));
-                reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-roligo.length())));
+                reverseSeq.setUnaligned(rawRSequence.substr(roligo.length()));
                  forwardQual.trimQScores(foligo.length(), -1);
-                reverseQual.trimQScores(-1, rawRSequence.length()-roligo.length());
+                reverseQual.trimQScores(roligo.length(), -1);
                  success = 0;
                  break;
              }
@@ -1503,19 +1457,6 @@ int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, Quality
                      
                  }
                  
-                /*cout << minDiff << '\t' << minCount << '\t' << endl;
-                 for (int i = 0; i < minFGroup.size(); i++) {
-                 cout << i << '\t';
-                 for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; }
-                 cout << endl;
-                 }
-                 cout << endl;
-                 for (int i = 0; i < minRGroup.size(); i++) {
-                 cout << i << '\t';
-                 for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; }
-                 cout << endl;
-                 }
-                 cout << endl;*/
                  if(minDiff > pdiffs)   {       success = minDiff;      }       //no good matches
                  else {
                      bool foundMatch = false;
@@ -1585,7 +1526,7 @@ int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, int& gr
              if((compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length())))) {
                  group = it->first;
                  forwardSeq.setUnaligned(rawFSequence.substr(foligo.length()));
-                reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-roligo.length())));
+                reverseSeq.setUnaligned(rawRSequence.substr(roligo.length()));
                  success = 0;
                  break;
              }
@@ -1728,19 +1669,6 @@ int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, int& gr
                      
                  }
                  
-                /*cout << minDiff << '\t' << minCount << '\t' << endl;
-                 for (int i = 0; i < minFGroup.size(); i++) {
-                 cout << i << '\t';
-                 for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; }
-                 cout << endl;
-                 }
-                 cout << endl;
-                 for (int i = 0; i < minRGroup.size(); i++) {
-                 cout << i << '\t';
-                 for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; }
-                 cout << endl;
-                 }
-                 cout << endl;*/
                  if(minDiff > pdiffs)   {       success = minDiff;      }       //no good matches
                  else {
                      bool foundMatch = false;
diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp

index 16d83de2596b9cf84ab4069789917878fa89559e..4128142448b8412314dc2b1d8ce057c6e8527519 100644 (file)
--- a/trimseqscommand.cpp
+++ b/trimseqscommand.cpp
@@ -24,7 +24,7 @@ vector<string> TrimSeqsCommand::setParameters(){
          CommandParameter preorient("checkorient", "Boolean", "", "F", "", "", "","",false,false,true); parameters.push_back(preorient);
                 CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxambig);
                 CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pmaxhomop);
-               CommandParameter pminlength("minlength", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pminlength);
+               CommandParameter pminlength("minlength", "Number", "", "1", "", "", "","",false,false); parameters.push_back(pminlength);
                 CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pmaxlength);
                 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(ppdiffs);
                 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(pbdiffs);
@@ -256,7 +256,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option)  {
                         temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "0"; }
                         m->mothurConvert(temp, maxHomoP);  
  
-                       temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "0"; }
+                       temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "1"; }
                         m->mothurConvert(temp, minLength); 
                         
                         temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "0"; }
@@ -715,7 +715,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                         int currentSeqsDiffs = 0;
  
                         Sequence currSeq(inFASTA); m->gobble(inFASTA);
-                       //cout << currSeq.getName() << '\t' << currSeq.getUnaligned().length() << endl;
+                       //cout << currSeq.getName() << '\t' << currSeq.getUnaligned() << endl;
              Sequence savedSeq(currSeq.getName(), currSeq.getAligned());
              
                         QualityScores currQual; QualityScores savedQual;
@@ -745,7 +745,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                      }
                                         else{ currentSeqsDiffs += success;  }
                                 }
-                               
+                               //cout << currSeq.getName() << '\t' << currSeq.getUnaligned() << endl;
                  if(numSpacers != 0){
                                         success = trimOligos->stripSpacer(currSeq, currQual);
                                         if(success > sdiffs)            {       trashCode += 's';       }
@@ -775,13 +775,13 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                      
                      int thisBarcodeIndex = 0;
                      int thisPrimerIndex = 0;
-                    
+                    //cout << currSeq.getName() << '\t' << savedSeq.getUnaligned() << endl;
                      if(numBarcodes != 0){
                          thisSuccess = rtrimOligos->stripBarcode(savedSeq, savedQual, thisBarcodeIndex);
                          if(thisSuccess > bdiffs)               { thisTrashCode += "b"; }
                          else{ thisCurrentSeqsDiffs += thisSuccess;  }
                      }
-                    
+                    //cout << currSeq.getName() << '\t' << savedSeq.getUnaligned() << endl;
                      if(numFPrimers != 0){
                          thisSuccess = rtrimOligos->stripForward(savedSeq, savedQual, thisPrimerIndex, keepforward);
                          if(thisSuccess > pdiffs)               { thisTrashCode += "f"; }
author	Pat Schloss <pschloss@umich.edu>
	Wed, 17 Jul 2013 18:52:16 +0000 (14:52 -0400)
committer	Pat Schloss <pschloss@umich.edu>
	Wed, 17 Jul 2013 18:52:16 +0000 (14:52 -0400)
.gitignore		patch \| blob \| history
Mothur.xcodeproj/project.pbxproj		patch \| blob \| history
abstractdecisiontree.cpp		patch \| blob \| history
abstractdecisiontree.hpp		patch \| blob \| history
bayesian.cpp		patch \| blob \| history
calcsparcc.cpp		patch \| blob \| history
classify.cpp		patch \| blob \| history
classifysharedcommand.cpp		patch \| blob \| history
classifysharedcommand.h		patch \| blob \| history
cluster.hpp		patch \| blob \| history
clusterfragmentscommand.cpp		patch \| blob \| history
commandfactory.cpp		patch \| blob \| history
createdatabasecommand.cpp		patch \| blob \| history
decisiontree.cpp		patch \| blob \| history
decisiontree.hpp		patch \| blob \| history
deconvolutecommand.cpp		patch \| blob \| history
designmap.cpp	[new file with mode: 0644]	patch \| blob
designmap.h	[new file with mode: 0644]	patch \| blob
forest.cpp		patch \| blob \| history
forest.h		patch \| blob \| history
getlineagecommand.cpp		patch \| blob \| history
getmetacommunitycommand.h		patch \| blob \| history
getseqscommand.cpp		patch \| blob \| history
groupmap.cpp		patch \| blob \| history
groupmap.h		patch \| blob \| history
kruskalwalliscommand.cpp		patch \| blob \| history
kruskalwalliscommand.h		patch \| blob \| history
lefsecommand.cpp	[new file with mode: 0644]	patch \| blob
lefsecommand.h	[new file with mode: 0644]	patch \| blob
linearalgebra.cpp		patch \| blob \| history
linearalgebra.h		patch \| blob \| history
macros.h		patch \| blob \| history
makecontigscommand.cpp		patch \| blob \| history
makefile		patch \| blob \| history
makelefsecommand.cpp	[new file with mode: 0644]	patch \| blob
makelefsecommand.h	[new file with mode: 0644]	patch \| blob
makelookupcommand.cpp		patch \| blob \| history
makelookupcommand.h		patch \| blob \| history
mgclustercommand.cpp		patch \| blob \| history
mothur.h		patch \| blob \| history
mothurout.cpp		patch \| blob \| history
mothurout.h		patch \| blob \| history
otuassociationcommand.cpp		patch \| blob \| history
phylosummary.cpp		patch \| blob \| history
phylosummary.h		patch \| blob \| history
randomforest.cpp		patch \| blob \| history
randomforest.hpp		patch \| blob \| history
regularizedrandomforest.cpp		patch \| blob \| history
renameseqscommand.cpp	[new file with mode: 0644]	patch \| blob
renameseqscommand.h	[new file with mode: 0644]	patch \| blob
rftreenode.cpp		patch \| blob \| history
rftreenode.hpp		patch \| blob \| history
screenseqscommand.cpp		patch \| blob \| history
setdircommand.cpp		patch \| blob \| history
setlogfilecommand.cpp		patch \| blob \| history
sffmultiplecommand.cpp		patch \| blob \| history
sparcccommand.h		patch \| blob \| history
trimoligos.cpp		patch \| blob \| history
trimseqscommand.cpp		patch \| blob \| history