added lefse command. added check for hard filter length. fixed bug in make.biom...

author Sarah Westcott <mothur.westcott@gmail.com>

Wed, 18 Sep 2013 16:53:47 +0000 (12:53 -0400)

committer Sarah Westcott <mothur.westcott@gmail.com>

Wed, 18 Sep 2013 16:53:47 +0000 (12:53 -0400)
author Sarah Westcott <mothur.westcott@gmail.com>
Wed, 18 Sep 2013 16:53:47 +0000 (12:53 -0400)
committer Sarah Westcott <mothur.westcott@gmail.com>
Wed, 18 Sep 2013 16:53:47 +0000 (12:53 -0400)
diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj

index a17c97b9a7b3de851f05a07c91fe0fa28287f257..27607ff59b39c81ca60a3c5b7e081f6de81c73f7 100644 (file)
--- a/Mothur.xcodeproj/project.pbxproj
+++ b/Mothur.xcodeproj/project.pbxproj
@@ -2413,7 +2413,7 @@
                                 GCC_OPTIMIZATION_LEVEL = 3;
                                 "INSTALL_PATH[sdk=*]" = TARGET_BUILD_DIR;
                                 PRODUCT_NAME = mothur;
-                               SDKROOT = macosx10.8;
+                               SDKROOT = macosx;
                                 SKIP_INSTALL = NO;
                         };
                         name = Debug;
@@ -2449,6 +2449,7 @@
                                         "VERSION=\"\\\"1.31.0\\\"\"",
                                         "RELEASE_DATE=\"\\\"5/24/2013\\\"\"",
                                 );
+                               GCC_VERSION = "";
                                 "GCC_VERSION[arch=*]" = "";
                                 GCC_WARN_ABOUT_MISSING_NEWLINE = YES;
                                 GCC_WARN_ABOUT_RETURN_TYPE = YES;
@@ -2466,7 +2467,7 @@
                                         "-lncurses",
                                         "-lreadline",
                                 );
-                               SDKROOT = macosx;
+                               SDKROOT = macosx10.8;
                                 SKIP_INSTALL = NO;
                                 USER_HEADER_SEARCH_PATHS = "";
                         };
@@ -2485,6 +2486,7 @@
                                         "VERSION=\"\\\"1.31.0\\\"\"",
                                         "RELEASE_DATE=\"\\\"5/24/2013\\\"\"",
                                 );
+                               GCC_VERSION = "";
                                 GCC_WARN_ABOUT_MISSING_NEWLINE = YES;
                                 GCC_WARN_ABOUT_RETURN_TYPE = YES;
                                 GCC_WARN_MISSING_PARENTHESES = YES;
@@ -2504,7 +2506,7 @@
                                         "-lncurses",
                                         "-lreadline",
                                 );
-                               SDKROOT = macosx;
+                               SDKROOT = macosx10.8;
                                 SKIP_INSTALL = NO;
                         };
                         name = Release;
diff --git a/Mothur.xcodeproj/xcuserdata/SarahsWork.xcuserdatad/xcschemes/Mothur.xcscheme b/Mothur.xcodeproj/xcuserdata/SarahsWork.xcuserdatad/xcschemes/Mothur.xcscheme

index 3ced6e4a7c610f456b22a59d421b7ca434d8e6bb..2ce2b0ee98faa5a5cbb0e5ed87a64a11fd05a4d3 100644 (file)
--- a/Mothur.xcodeproj/xcuserdata/SarahsWork.xcuserdatad/xcschemes/Mothur.xcscheme
+++ b/Mothur.xcodeproj/xcuserdata/SarahsWork.xcuserdatad/xcschemes/Mothur.xcscheme
@@ -1,5 +1,6 @@
  <?xml version="1.0" encoding="UTF-8"?>
  <Scheme
+   LastUpgradeVersion = "0460"
     version = "1.3">
     <BuildAction
        parallelizeBuildables = "YES"
@@ -22,7 +23,7 @@
        </BuildActionEntries>
     </BuildAction>
     <TestAction
-      selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.GDB"
+      selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
        selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.GDB"
        shouldUseLaunchSchemeArgsEnv = "YES"
        buildConfiguration = "Debug">
@@ -39,11 +40,12 @@
        </MacroExpansion>
     </TestAction>
     <LaunchAction
-      selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.GDB"
+      selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
        selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.GDB"
        launchStyle = "0"
        useCustomWorkingDirectory = "NO"
        buildConfiguration = "Debug"
+      ignoresPersistentStateOnLaunch = "NO"
        debugDocumentVersioning = "YES"
        allowLocationSimulation = "YES">
        <BuildableProductRunnable>
diff --git a/aligncommand.cpp b/aligncommand.cpp

index f757a7920bb337883a2a7773f35ed533d2186c80..f9c0436c62874864279e533a3258d72bf2f6cf2d 100644 (file)
--- a/aligncommand.cpp
+++ b/aligncommand.cpp
@@ -558,7 +558,6 @@ int AlignCommand::driver(linePair* filePos, string alignFName, string reportFNam
                         if (m->control_pressed) {  break; }
                         
                         Sequence* candidateSeq = new Sequence(inFASTA);  m->gobble(inFASTA);
-            cout  << candidateSeq->getAligned() << endl;
                         report.setCandidate(candidateSeq);
  
                         int origNumBases = candidateSeq->getNumBases();
diff --git a/counttable.cpp b/counttable.cpp

index 6eca8ee2a4b5354da54412b9eb6986b6ba244657..0e9eec38234d52215af681b74a00b5713c7113b0 100644 (file)
--- a/counttable.cpp
+++ b/counttable.cpp
@@ -246,6 +246,9 @@ int CountTable::readTable(string file, bool readGroups) {
              in >> name; m->gobble(in); in >> thisTotal; m->gobble(in);
              if (m->debug) { m->mothurOut("[DEBUG]: " + name + '\t' + toString(thisTotal) + "\n"); }
              
+            if (thisTotal == 0) { error=true; m->mothurOut("[ERROR]: Your count table contains a sequence named " + name + " with a total=0. Please correct."); m->mothurOutEndLine();
+            }
+            
              //if group info, then read it
              vector<int> groupCounts; groupCounts.resize(numGroups, 0);
              if (columnHeaders.size() > 2) { //file contains groups
diff --git a/filters.h b/filters.h

index 21bc8bb5eec729d7b3f59d06d2a6cc6aff09080a..b00cb35afb66aff2fbe2a132ccf4491b17b8cd35 100644 (file)
--- a/filters.h
+++ b/filters.h
@@ -82,6 +82,8 @@ public:
                 fileHandle >> filter;
         
                 fileHandle.close();
+        
+        if (filter.length() != alignmentLength) {  m->mothurOut("[ERROR]: Sequences are not all the same length as the filter, please correct.\n");  m->control_pressed = true; }
         }
  
         void getFreqs(Sequence seq) {
diff --git a/lefsecommand.cpp b/lefsecommand.cpp

index c478a4d328726b2ee0010b6380cb991ad0e3d507..3575e021fd754b5d4baf897893a0633372aef69f 100644 (file)
--- a/lefsecommand.cpp
+++ b/lefsecommand.cpp
@@ -18,8 +18,28 @@ vector<string> LefseCommand::setParameters(){
          CommandParameter psubclass("subclass", "String", "", "", "", "", "","",false,false); parameters.push_back(psubclass);
                 CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel);
                 CommandParameter pclasses("classes", "String", "", "", "", "", "","",false,false); parameters.push_back(pclasses);
-        CommandParameter palpha("anova_alpha", "Number", "", "0.05", "", "", "","",false,false); parameters.push_back(palpha);
-        CommandParameter pwalpha("wilcoxon_alpha", "Number", "", "0.05", "", "", "","",false,false); parameters.push_back(pwalpha);
+        CommandParameter palpha("aalpha", "Number", "", "0.05", "", "", "","",false,false); parameters.push_back(palpha);
+        CommandParameter pwalpha("walpha", "Number", "", "0.05", "", "", "","",false,false); parameters.push_back(pwalpha);
+        
+        CommandParameter plda("lda", "Number", "", "2.0", "", "", "","",false,false); parameters.push_back(plda);
+        CommandParameter pwilc("wilc", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pwilc);
+        CommandParameter pnormmillion("norm", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pnormmillion);
+        CommandParameter piters("iters", "Number", "", "30", "", "", "","",false,false); parameters.push_back(piters);
+        //CommandParameter pwilcsamename("wilcsamename", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pwilcsamename);
+        CommandParameter pcurv("curv", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pcurv);
+        CommandParameter pfiters("fboots", "Number", "", "0.67", "", "", "","",false,false); parameters.push_back(pfiters);
+        CommandParameter pstrict("strict", "Multiple", "0-1-2", "0", "", "", "","",false,false); parameters.push_back(pstrict);
+        CommandParameter pminc("minc", "Number", "", "10", "", "", "","",false,false); parameters.push_back(pminc);
+        CommandParameter pmulticlass_strat("multiclass", "Multiple", "onevone-onevall", "onevall", "", "", "","",false,false); parameters.push_back(pmulticlass_strat);
+        //CommandParameter psubject("subject", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(psubject);
+
+
+        //not used in their current code, but in parameters
+        //CommandParameter pnlogs("nlogs", "Number", "", "3", "", "", "","",false,false); parameters.push_back(pnlogs);
+        //CommandParameter pranktec("ranktec", "Multiple", "lda-svm", "lda", "", "", "","",false,false); parameters.push_back(pranktec); // svm not implemented in their source yet.
+        //CommandParameter psvmnorm("svmnorm", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(psvmnorm); //not used because svm not implemented yet.
+
+        
          //every command must have inputdir and outputdir.  This allows mothur users to redirect input and output files.
                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
@@ -38,10 +58,22 @@ string LefseCommand::getHelpString(){
         try {
                 string helpString = "";
                 helpString += "The lefse command allows you to ....\n";
-               helpString += "The lefse command parameters are: shared, design, class, subclass, label, classes, wilcoxon_alpha, anova_alpha.\n";
+               helpString += "The lefse command parameters are: shared, design, class, subclass, label, classes, walpha, aalpha, lda, wilc, iters, curv, fboots, strict, minc, multiclass and norm.\n";
                 helpString += "The class parameter is used to indicate the which category you would like used for the Kruskal Wallis analysis. If none is provided first category is used.\n";
-        helpString += "The subclass parameter is used to indicate the .....If none is provided second category is used, or if only one category subclass is ignored. \n";
-        helpString += "The classes parameter is used to indicate the classes you would like to use. Clases should be inputted in the following format: classes=label<value1|value2|value3>-label<value1|value2>. For example to include groups from treatment with value early or late and age= young or old.  class=treatment<Early|Late>-age<young|old>.\n";
+        helpString += "The subclass parameter is used to indicate the .....If none is provided, second category is used, or if only one category subclass is ignored. \n";
+        helpString += "The aalpha parameter is used to set the alpha value for the Krukal Wallis Anova test Default=0.05. \n";
+        helpString += "The walpha parameter is used to set the alpha value for the Wilcoxon test. Default=0.05. \n";
+        helpString += "The lda parameter is used to set the threshold on the absolute value of the logarithmic LDA score. Default=2.0. \n";
+        helpString += "The wilc parameter is used to indicate whether to perform the Wilcoxon test. Default=T. \n";
+        helpString += "The iters parameter is used to set the number of bootstrap iteration for LDA. Default=30. \n";
+        //helpString += "The wilcsamename parameter is used to indicate whether perform the wilcoxon test only among the subclasses with the same name. Default=F. \n";
+        helpString += "The curv parameter is used to set whether perform the wilcoxon test ing the Curtis's approach [BETA VERSION] Default=F. \n";
+        helpString += "The norm parameter is used to multiply relative abundances by 1000000. Recommended when very low values are present. Default=T. \n";
+        helpString += "The fboots parameter is used to set the subsampling fraction value for each bootstrap iteration. Default=0.67. \n";
+        helpString += "The strict parameter is used to set the multiple testing correction options. 0 no correction (more strict, default), 1 correction for independent comparisons, 2 correction for independent comparison. Options = 0,1,2. Default=0. \n";
+        helpString += "The minc parameter is used to minimum number of samples per subclass for performing wilcoxon test. Default=10. \n";
+        helpString += "The multiclass parameter is used to (for multiclass tasks) set whether the test is performed in a one-against-one ( onevone - more strict!) or in a one-against-all setting ( onevall - less strict). Default=onevall. \n";
+        helpString += "The classes parameter is used to indicate the classes you would like to use. Classes should be inputted in the following format: classes=label<value1|value2|value3>-label<value1|value2>. For example to include groups from treatment with value early or late and age= young or old.  class=treatment<Early|Late>-age<young|old>.\n";
          helpString += "The label parameter is used to indicate which distances in the shared file you would like to use. labels are separated by dashes.\n";
                 helpString += "The lefse command should be in the following format: lefse(shared=final.an.shared, design=final.design, class=treatment, subclass=age).\n";
          return helpString;
@@ -57,8 +89,6 @@ string LefseCommand::getOutputPattern(string type) {
          string pattern = "";
          
          if (type == "summary") {  pattern = "[filename],[distance],lefse_summary"; }
-        else if (type == "kruskall-wallis") {  pattern = "[filename],[distance],kruskall_wallis"; }
-        else if (type == "wilcoxon") {  pattern = "[filename],[distance],wilcoxon"; }
          else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
          
          return pattern;
@@ -75,8 +105,6 @@ LefseCommand::LefseCommand(){
                 setParameters();
          vector<string> tempOutNames;
                 outputTypes["summary"] = tempOutNames;
-        outputTypes["kruskall-wallis"] = tempOutNames;
-        outputTypes["wilcoxon"] = tempOutNames;
         }
         catch(exception& e) {
                 m->errorOut(e, "LefseCommand", "LefseCommand");
@@ -109,8 +137,6 @@ LefseCommand::LefseCommand(string option)  {
                         
                         vector<string> tempOutNames;
              outputTypes["summary"] = tempOutNames;
-            outputTypes["kruskall-wallis"] = tempOutNames;
-            outputTypes["wilcoxon"] = tempOutNames;
              
                         //if the user changes the input directory command factory will send this info to us in the output parameter
                         string inputDir = validParameter.validFile(parameters, "inputdir", false);
@@ -171,19 +197,63 @@ LefseCommand::LefseCommand(string option)  {
                         if (mclass == "not found") { mclass = ""; }
                         
              subclass = validParameter.validFile(parameters, "subclass", false);
-                       if (subclass == "not found") { subclass = ""; }
+                       if (subclass == "not found") { subclass = mclass; }
              
              classes = validParameter.validFile(parameters, "classes", false);
                         if (classes == "not found") { classes = ""; }
              
-            string temp = validParameter.validFile(parameters, "anova_alpha", false);
+            string temp = validParameter.validFile(parameters, "aalpha", false);
                         if (temp == "not found") { temp = "0.05"; }
                         m->mothurConvert(temp, anovaAlpha);
              
-            temp = validParameter.validFile(parameters, "wilcoxon_alpha", false);
+            temp = validParameter.validFile(parameters, "walpha", false);
                         if (temp == "not found") { temp = "0.05"; }
                         m->mothurConvert(temp, wilcoxonAlpha);
+            
+            temp = validParameter.validFile(parameters, "wilc", false);
+                       if (temp == "not found") { temp = "T"; }
+                       wilc = m->isTrue(temp);
+            
+            temp = validParameter.validFile(parameters, "norm", false);
+                       if (temp == "not found") { temp = "T"; }
+                       normMillion = m->isTrue(temp);
+            
+            //temp = validParameter.validFile(parameters, "subject", false);
+                       //if (temp == "not found") { temp = "F"; }
+                       //subject = m->isTrue(temp);
  
+            temp = validParameter.validFile(parameters, "lda", false);
+                       if (temp == "not found") { temp = "2.0"; }
+                       m->mothurConvert(temp, ldaThreshold);
+            
+            temp = validParameter.validFile(parameters, "iters", false);
+                       if (temp == "not found") { temp = "30"; }
+                       m->mothurConvert(temp, iters);
+            
+            temp = validParameter.validFile(parameters, "fboots", false);
+                       if (temp == "not found") { temp = "0.67"; }
+                       m->mothurConvert(temp, fBoots);
+            
+            //temp = validParameter.validFile(parameters, "wilcsamename", false);
+                       //if (temp == "not found") { temp = "F"; }
+                       //wilcsamename = m->isTrue(temp);
+            
+            temp = validParameter.validFile(parameters, "curv", false);
+                       if (temp == "not found") { temp = "F"; }
+                       curv = m->isTrue(temp);
+            
+            temp = validParameter.validFile(parameters, "strict", false);
+            if (temp == "not found"){  temp = "0";             }
+                       if ((temp != "0") && (temp != "1") && (temp != "2")) { m->mothurOut("Invalid strict option: choices are 0, 1 or 2."); m->mothurOutEndLine(); abort=true; }
+            else {  m->mothurConvert(temp, strict); }
+            
+            temp = validParameter.validFile(parameters, "minc", false);
+                       if (temp == "not found") { temp = "10"; }
+                       m->mothurConvert(temp, minC);
+            
+            multiClassStrat = validParameter.validFile(parameters, "multiclass", false);
+            if (multiClassStrat == "not found"){       multiClassStrat = "onevall";            }
+                       if ((multiClassStrat != "onevall") && (multiClassStrat != "onevone")) { m->mothurOut("Invalid multiclass option: choices are onevone or onevall."); m->mothurOutEndLine(); abort=true; }
                 }
                 
         }
@@ -196,6 +266,8 @@ LefseCommand::LefseCommand(string option)  {
  
  int LefseCommand::execute(){
         try {
+        //for reading lefse formatted file and running in mothur for testing - pass number of rows used for design file
+        if (false) {  makeShared(1); exit(1); }
                 
                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
          
@@ -212,7 +284,7 @@ int LefseCommand::execute(){
          if (mclass == "") {  mclass = designMap.getDefaultClass(); m->mothurOut("\nYou did not provide a class, using " + mclass +".\n\n"); }
          
          InputData input(sharedfile, "sharedfile");
-        vector<SharedRAbundVector*> lookup = input.getSharedRAbundVectors();
+        vector<SharedRAbundFloatVector*> lookup = input.getSharedRAbundFloatVectors();
          string lastLabel = lookup[0]->getLabel();
          
          //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
@@ -239,7 +311,7 @@ int LefseCommand::execute(){
                  string saveLabel = lookup[0]->getLabel();
                  
                  for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
-                lookup = input.getSharedRAbundVectors(lastLabel);
+                lookup = input.getSharedRAbundFloatVectors(lastLabel);
                  m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
                  
                  process(lookup, designMap);
@@ -258,7 +330,7 @@ int LefseCommand::execute(){
              if (m->control_pressed) { return 0; }
              
              //get next line to process
-            lookup = input.getSharedRAbundVectors();
+            lookup = input.getSharedRAbundFloatVectors();
          }
          
          if (m->control_pressed) {  return 0; }
@@ -279,7 +351,7 @@ int LefseCommand::execute(){
          //run last label if you need to
          if (needToRun == true)  {
              for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }
-            lookup = input.getSharedRAbundVectors(lastLabel);
+            lookup = input.getSharedRAbundFloatVectors(lastLabel);
              
              m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
              process(lookup, designMap);
@@ -303,13 +375,73 @@ int LefseCommand::execute(){
  }
  //**********************************************************************************************************************
  
-int LefseCommand::process(vector<SharedRAbundVector*>& lookup, DesignMap& designMap) {
+int LefseCommand::process(vector<SharedRAbundFloatVector*>& lookup, DesignMap& designMap) {
         try {
+        vector<string> classes;
+        vector<string> subclasses;
+        map<string, string> subclass2Class;
+        map<string, set<string> > class2SubClasses; //maps class name to vector of its subclasses
+        map<string, vector<int> > subClass2GroupIndex; //maps subclass name to vector of indexes in lookup from that subclass. old -> 1,2,3 means groups in location 1,2,3 of lookup are from old.  Saves time below.
+        map<string, vector<int> > class2GroupIndex; //maps subclass name to vector of indexes in lookup from that class. old -> 1,2,3 means groups in location 1,2,3 of lookup are from old.  Saves time below.
+        for (int j = 0; j < lookup.size(); j++) {
+            if (normMillion) {  normalize(lookup);  }
+            string group = lookup[j]->getGroup();
+            string treatment = designMap.get(group, mclass); //get value for this group in this category
+            string thisSub = designMap.get(group, subclass);
+            map<string, string>::iterator it = subclass2Class.find(thisSub);
+            if (it == subclass2Class.end()) {
+                subclass2Class[thisSub] = treatment;
+                vector<int> temp; temp.push_back(j);
+                subClass2GroupIndex[thisSub] = temp;
+            }
+            else {
+                if (it->second != treatment) {
+                    //m->mothurOut("[WARNING]: subclass " + thisSub + " has members in " + it->second + " and " + treatment + ". Subclass members must be from the same class for Wilcoxon. Changing " + thisSub + " to " + treatment + "_" + thisSub + ".\n");
+                    thisSub = treatment + "_" + thisSub;
+                    subclass2Class[thisSub] = treatment;
+                    vector<int> temp; temp.push_back(j);
+                    subClass2GroupIndex[thisSub] = temp;
+                }else { subClass2GroupIndex[thisSub].push_back(j); }
+            }
+            
+            map<string, set<string> >::iterator itClass = class2SubClasses.find(treatment);
+            if (itClass == class2SubClasses.end()) {
+                set<string> temp; temp.insert(thisSub);
+                class2SubClasses[treatment] = temp;
+                vector<int> temp2; temp2.push_back(j);
+                class2GroupIndex[treatment] = temp2;
+                classes.push_back(treatment);
+            }else{
+                class2SubClasses[treatment].insert(thisSub);
+                class2GroupIndex[treatment].push_back(j);
+            }
+        }
+        //sort classes so order is right
+        sort(classes.begin(), classes.end());
+        
+        vector< vector<double> > means = getMeans(lookup, class2GroupIndex); //[numOTUs][classes] - classes in same order as class2GroupIndex
+        
          //run kruskal wallis on each otu
-        vector<int> significantOtuLabels = runKruskalWallis(lookup, designMap);
+        map<int, double> significantOtuLabels = runKruskalWallis(lookup, designMap);
+        
+        int numSigBeforeWilcox = significantOtuLabels.size();
          
          //check for subclass
-        if (subclass != "") {  significantOtuLabels = runWilcoxon(lookup, designMap, significantOtuLabels);  }
+        string wilcoxString = "";
+        if ((subclass != "") && wilc) {  significantOtuLabels = runWilcoxon(lookup, designMap, significantOtuLabels, class2SubClasses, subClass2GroupIndex, subclass2Class);  wilcoxString += " ( " + toString(numSigBeforeWilcox) + " ) before internal wilcoxon"; }
+        
+        int numSigAfterWilcox = significantOtuLabels.size();
+        
+        m->mothurOut("\nNumber of significantly discriminative features: " + toString(numSigAfterWilcox) + wilcoxString + ".\n"); 
+        
+        map<int, double> sigOTUSLDA;
+        if (numSigAfterWilcox > 0) {
+            sigOTUSLDA = testLDA(lookup, significantOtuLabels, class2GroupIndex, subClass2GroupIndex);
+            m->mothurOut("Number of discriminative features with abs LDA score > " + toString(ldaThreshold) + " : " + toString(significantOtuLabels.size()) + ".\n");
+        }
+        else { m->mothurOut("No features with significant differences between the classes.\n"); }
+        
+        printResults(means, significantOtuLabels, sigOTUSLDA, lookup[0]->getLabel(), classes);
          
          return 0;
      }
@@ -319,20 +451,30 @@ int LefseCommand::process(vector<SharedRAbundVector*>& lookup, DesignMap& design
         }
  }
  //**********************************************************************************************************************
-
-vector<int> LefseCommand::runKruskalWallis(vector<SharedRAbundVector*>& lookup, DesignMap& designMap) {
+int LefseCommand::normalize(vector<SharedRAbundFloatVector*>& lookup) {
         try {
-        map<string, string> variables;
-        variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile));
-        variables["[distance]"] = lookup[0]->getLabel();
-               string outputFileName = getOutputFileName("kruskall-wallis",variables);
+        vector<double> mul;
+        for (int i = 0; i < lookup.size(); i++) {
+            double sum = 0.0;
+            for (int j = 0; j < lookup[i]->getNumBins(); j++) { sum += lookup[i]->getAbundance(j); }
+            mul.push_back(1000000.0/sum);
+        }
          
-               ofstream out;
-               m->openOutputFile(outputFileName, out);
-               outputNames.push_back(outputFileName); outputTypes["kruskall-wallis"].push_back(outputFileName);
-        out << "OTULabel\tKW\tPvalue\n";
+        for (int i = 0; i < lookup.size(); i++) {
+            for (int j = 0; j < lookup[i]->getNumBins(); j++) { lookup[i]->set(j, lookup[i]->getAbundance(j)*mul[i], lookup[i]->getGroup()); }
+        }
          
-        vector<int> significantOtuLabels;
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "LefseCommand", "normalize");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+map<int, double> LefseCommand::runKruskalWallis(vector<SharedRAbundFloatVector*>& lookup, DesignMap& designMap) {
+       try {        
+        map<int, double> significantOtuLabels;
          int numBins = lookup[0]->getNumBins();
          //sanity check to make sure each treatment has a group in the shared file
          set<string> treatments;
@@ -357,13 +499,9 @@ vector<int> LefseCommand::runKruskalWallis(vector<SharedRAbundVector*>& lookup,
              
              double pValue = 0.0;
              double H = linear.calcKruskalWallis(values, pValue);
-            
-            //output H and signifigance
-            out << m->currentBinLabels[i] << '\t' << H << '\t' << pValue << endl;
-            
-            if (pValue < anovaAlpha) {  significantOtuLabels.push_back(i);  }
+             
+            if (pValue < anovaAlpha) {  significantOtuLabels[i] = pValue;  }
          }
-        out.close();
          
          return significantOtuLabels;
      }
@@ -374,110 +512,691 @@ vector<int> LefseCommand::runKruskalWallis(vector<SharedRAbundVector*>& lookup,
  }
  //**********************************************************************************************************************
  //assumes not neccessarily paired
-vector<int> LefseCommand::runWilcoxon(vector<SharedRAbundVector*>& lookup, DesignMap& designMap, vector<int> bins) {
+map<int, double> LefseCommand::runWilcoxon(vector<SharedRAbundFloatVector*>& lookup, DesignMap& designMap, map<int, double> bins, map<string, set<string> >& class2SubClasses, map<string, vector<int> >& subClass2GroupIndex, map<string, string> subclass2Class) {
      try {
-        LinearAlgebra linear;
-        vector<int> significantOtuLabels;
+        map<int, double> significantOtuLabels;
+        map<int, double>::iterator it;
          //if it exists and meets the following requirements run Wilcoxon
          /*
           1. Subclass members all belong to same main class
           anything else
          */
-        vector<string> subclasses;
-        map<string, string> subclass2Class;
-        map<string, int> subclassCounts;
-        map<string, vector<int> > subClass2GroupIndex; //maps subclass name to vector of indexes in lookup from that subclass. old -> 1,2,3 means groups in location 1,2,3 of lookup are from old.  Saves time below.
-        bool error = false;
-        for (int j = 0; j < lookup.size(); j++) {
-            string group = lookup[j]->getGroup();
-            string treatment = designMap.get(group, mclass); //get value for this group in this category
-            string thisSub = designMap.get(group, subclass);
-            map<string, string>::iterator it = subclass2Class.find(thisSub);
-            if (it == subclass2Class.end()) {
-                subclass2Class[thisSub] = treatment;
-                subclassCounts[thisSub] = 1;
-                vector<int> temp; temp.push_back(j);
-                subClass2GroupIndex[thisSub] = temp;
+        
+        int numBins = lookup[0]->getNumBins();
+        for (int i = 0; i < numBins; i++) {
+            if (m->control_pressed) { break; }
+            
+            it = bins.find(i);
+            if (it != bins.end()) { //flagged in Kruskal Wallis
+                
+                vector<float> abunds;  for (int j = 0; j < lookup.size(); j++) { abunds.push_back(lookup[j]->getAbundance(i)); }
+                
+                bool sig = testOTUWilcoxon(class2SubClasses, abunds, subClass2GroupIndex, subclass2Class);
+                if (sig) { significantOtuLabels[i] = it->second;  }
+                
+            }//bins flagged from kw
+        }//for bins
+        
+        return significantOtuLabels;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "LefseCommand", "runWilcoxon");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//lefse.py - test_rep_wilcoxon_r function
+bool LefseCommand::testOTUWilcoxon(map<string, set<string> >& class2SubClasses, vector<float> abunds, map<string, vector<int> >& subClass2GroupIndex, map<string, string> subclass2Class) {
+    try {
+        int totalOk = 0;
+        double alphaMtc = wilcoxonAlpha;
+        vector< set<string> > allDiffs;
+        LinearAlgebra linear;
+        
+        //for each subclass comparision
+        map<string, set<string> >::iterator itB;
+        for(map<string, set<string> >::iterator it=class2SubClasses.begin();it!=class2SubClasses.end();it++){
+            itB = it;itB++;
+            for(itB;itB!=class2SubClasses.end();itB++){
+                if (m->control_pressed) { return false; }
+                bool first = true;
+                int dirCmp = 0; // not set?? dir_cmp = "not_set" # 0=notset or none, 1=true, 2=false.
+                int curv_sign = 0;
+                int ok = 0;
+                int count = 0;
+                for (set<string>::iterator itClass1 = (it->second).begin(); itClass1 != (it->second).end(); itClass1++) {
+                    bool br = false;
+                    for (set<string>::iterator itClass2 = (itB->second).begin(); itClass2 != (itB->second).end(); itClass2++) {
+                        string subclass1 = *itClass1;
+                        string subclass2 = *itClass2;
+                        count++;
+                        
+                        if (m->debug) { m->mothurOut( "[DEBUG comparing " + it->first + "-" + *itClass1 + " to " + itB->first + "-" + *itClass2 + "\n"); }
+                        
+                        string treatment1 = subclass2Class[subclass1];
+                        string treatment2 = subclass2Class[subclass2];
+                        int numSubs1 = class2SubClasses[treatment1].size();
+                        int numSubs2 = class2SubClasses[treatment2].size();
+                        
+                        //if mul_cor != 0: alpha_mtc = th*l_subcl1*l_subcl2 if mul_cor == 2 else 1.0-math.pow(1.0-th,l_subcl1*l_subcl2)
+                        if (strict != 0) { alphaMtc = wilcoxonAlpha * numSubs1 * numSubs2 ; }
+                        if (strict == 2) {}else{ alphaMtc = 1.0-pow((1.0-wilcoxonAlpha),(double)(numSubs1 * numSubs2)); }
+                        
+                        //fill x and y with this comparisons data
+                        vector<double> x; vector<double> y;
+                        
+                        //fill x and y
+                        vector<int> xIndexes = subClass2GroupIndex[subclass1]; //indexes in lookup for this subclass
+                        vector<int> yIndexes = subClass2GroupIndex[subclass2]; //indexes in lookup for this subclass
+                        for (int k = 0; k < yIndexes.size(); k++) { y.push_back(abunds[yIndexes[k]]);  }
+                        for (int k = 0; k < xIndexes.size(); k++) { x.push_back(abunds[xIndexes[k]]);  }
+                        
+                        // med_comp = False
+                        //if len(cl1) < min_c or len(cl2) < min_c:
+                        //med_comp = True
+                        bool medComp = false; // are there enough samples per subclass
+                        if ((xIndexes.size() < minC) || (yIndexes.size() < minC)) { medComp = true; }
+                        
+                        double sx = m->median(x);
+                        double sy = m->median(y);
+                       
+                        //if cl1[0] == cl2[0] and len(set(cl1)) == 1 and  len(set(cl2)) == 1:
+                        //tres, first = False, False
+                        double pValue = 0.0;
+                        double H = 0.0;
+                        bool tres = true; //don't think this is set in the python source.  Not sure how that is handled, but setting it here.
+                        if ((x[0] == y[0]) && (x.size() == 1) && (y.size() == 1)) { tres = false; first = false; }
+                        else if (!medComp) {
+                            H = linear.calcWilcoxon(x, y, pValue);
+                            if (pValue < (alphaMtc*2.0)) { tres = true; }
+                            else { tres = false; }
+                        }
+                        /*if first:
+                         first = False
+                         if not curv and ( med_comp or tres ):
+                         dir_cmp = sx < sy
+                         if sx == sy: br = True
+                         elif curv:
+                         dir_cmp = None
+                         if med_comp or tres:
+                         curv_sign += 1
+                         dir_cmp = sx < sy
+                         else: br = True
+                         elif not curv and med_comp:
+                         if ((sx < sy) != dir_cmp or sx == sy): br = True
+                         elif curv:
+                         if tres and dir_cmp == None:
+                         curv_sign += 1
+                         dir_cmp = sx < sy
+                         if tres and dir_cmp != (sx < sy):
+                         br = True
+                         curv_sign = -1
+                         elif not tres or (sx < sy) != dir_cmp or sx == sy: br = True
+                         */
+                        int sxSy = 2; //false
+                        if (sx<sy) {  sxSy = 1; } //true
+                        
+                        if (first) {
+                            first = false;
+                            if ((!curv) && (medComp || tres)) {
+                                dirCmp = 2; if (sx<sy) { dirCmp = 1; } //dir_cmp = sx < sy
+                                if (sx == sy) { br = true; }
+                            }else if (curv) {
+                                dirCmp = 0;
+                                if (medComp || tres) {
+                                    curv_sign++;
+                                    dirCmp = 2; if (sx<sy) { dirCmp = 1; } //dir_cmp = sx < sy
+                                }
+                            }else { br = true; }
+                        }else if (!curv && medComp) {
+                            if (sxSy != dirCmp || sx == sy) { br = true; }
+                        }else if (curv) {
+                            if (tres && dirCmp == 0) { curv_sign++; }
+                            dirCmp = 2; if (sx<sy) { dirCmp = 1; } //dir_cmp = sx < sy
+                            if (tres && dirCmp != sxSy) { //if tres and dir_cmp != (sx < sy):
+                                br = true;
+                                curv_sign = -1;
+                            }
+                        }else if (!tres || sxSy != dirCmp || sx == sy) { br = true; } //elif not tres or (sx < sy) != dir_cmp or sx == sy: br = True
+                        if (br) { break; }
+                        ok++;
+                    }//for class2 subclasses
+                    if (br) { break; }
+                }//for class1 subclasses
+                bool diff = false;
+                if (curv) { diff = false; if (curv_sign > 0) { diff = true; } } //if curv: diff = curv_sign > 0
+                else { //else: diff = (ok == len(cl_hie[pair[1]])*len(cl_hie[pair[0]]))
+                    diff = false;
+                    if (ok == count) { diff = true; }
+                }
+                if (diff) { totalOk++; }
+                if (!diff && (multiClassStrat == "onevone")) { return false; }
+                if (diff && (multiClassStrat == "onevall")) { //all_diff.append(pair)
+                    set<string> pair; pair.insert(it->first); pair.insert(itB->first);
+                    allDiffs.push_back(pair);
+                }
+            }//classes
+        }//classes
+        
+        if (multiClassStrat == "onevall") {
+            int tot_k = class2SubClasses.size();
+            for(map<string, set<string> >::iterator it=class2SubClasses.begin();it!=class2SubClasses.end();it++){
+                if (m->control_pressed) { return false; }
+                int nk = 0;
+                //is this class okay in all comparisons
+                for (int h = 0; h < allDiffs.size(); h++) {
+                    if (allDiffs[h].count(it->first) != 0) {  nk++; }
+                }
+                if (nk == (tot_k-1)) { return true;  }//if nk == tot_k-1: return True
              }
-            else {
-                subclassCounts[thisSub]++;
-                subClass2GroupIndex[thisSub].push_back(j);
-                if (it->second != treatment) {
-                    error = true;
-                    m->mothurOut("[ERROR]: subclass " + thisSub + " has members in " + it->second + " and " + treatment + ". Subclass members must be from the same class. Ignoring wilcoxon.\n");
+            return false;
+        }
+        
+        return true;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "LefseCommand", "testOTUWilcoxon");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//modelled after lefse.py test_lda_r function
+map<int, double> LefseCommand::testLDA(vector<SharedRAbundFloatVector*>& lookup, map<int, double> bins, map<string, vector<int> >& class2GroupIndex, map<string, vector<int> >& subClass2GroupIndex) {
+    try {
+        map<int, double> sigOTUS;
+        map<int, double>::iterator it;
+        LinearAlgebra linear;
+    
+        int numBins = lookup[0]->getNumBins();
+        vector< vector<double> > adjustedLookup;
+        
+        for (int i = 0; i < numBins; i++) {
+            if (m->control_pressed) { break; }
+            
+            it = bins.find(i);
+            if (it != bins.end()) { //flagged in Kruskal Wallis and Wilcoxon(if we ran it)
+                
+                //fill x with this OTUs abundances
+                vector<double> x;
+                for (int j = 0; j < lookup.size(); j++) {  x.push_back(lookup[j]->getAbundance(i));  } 
+                
+                //go through classes
+                for (map<string, vector<int> >::iterator it = class2GroupIndex.begin(); it != class2GroupIndex.end(); it++) {
+                    //max(float(feats['class'].count(c))*0.5,4)
+                    //max(numGroups in this class*0.5, 4.0)
+                    double necessaryNum = ((double)((it->second).size())*0.5);
+                    if (4.0 > necessaryNum) { necessaryNum = 4.0; }
+                    
+                    set<double> uniques;
+                    for (int j = 0; j < (it->second).size(); j++) { uniques.insert(x[(it->second)[j]]); }
+                    
+                    //if len(set([float(v[1]) for v in ff if v[0] == c])) > max(float(feats['class'].count(c))*0.5,4): continue
+                    if ((double)(uniques.size()) > necessaryNum) {  }
+                    else {
+                        //feats[k][i] = math.fabs(feats[k][i] + lrand.normalvariate(0.0,max(feats[k][i]*0.05,0.01)))
+                        for (int j = 0; j < (it->second).size(); j++) { //(it->second) contains indexes of abundance for this class
+                            double sigma = max((x[(it->second)[j]]*0.05), 0.01);
+                            x[(it->second)[j]] = abs(x[(it->second)[j]] + linear.normalvariate(0.0, sigma));
+                        }
+                    }
                  }
+                adjustedLookup.push_back(x); 
+            }
+        }
+                
+        //go through classes
+        int minCl = 1e6;
+        map<int, string> indexToClass;
+        vector<string> classes;
+        for (map<string, vector<int> >::iterator it = class2GroupIndex.begin(); it != class2GroupIndex.end(); it++) {
+            //class with minimum number of groups
+            if ((it->second).size() < minCl) { minCl = (it->second).size(); }
+            for (int i = 0; i < (it->second).size(); i++) { indexToClass[(it->second)[i]] = it->first; }
+            classes.push_back(it->first);
+        }
+        
+        int numGroups = lookup.size(); //lfk
+        int fractionNumGroups = numGroups * fBoots; //rfk
+        minCl = (int)((float)(minCl*fBoots*fBoots*0.05));
+        minCl = max(minCl, 1);
+        
+        vector< vector< vector<double> > > results;//[iters][numComparison][numOTUs]
+        for (int j = 0; j < iters; j++) {
+            if (m->control_pressed) { return sigOTUS; }
+            
+            //find "good" random vector
+            vector<int> rand_s;
+            for (int h = 0; h < 1000; h++) { //generate a vector of length fractionNumGroups with range 0 to numGroups-1
+                rand_s.clear();
+                for (int k = 0; k < fractionNumGroups; k++) {  rand_s.push_back(m->getRandomIndex(numGroups-1)); }
+                if (!contastWithinClassesOrFewPerClass(adjustedLookup, rand_s, minCl, class2GroupIndex, indexToClass)) { h+=1000; } //break out of loop
              }
+            if (m->control_pressed) { return sigOTUS; }
+            
+            //print data in R input format for testing
+            if (false) {
+                vector<string> groups; for (int h = 0; h < rand_s.size(); h++) {  groups.push_back(lookup[rand_s[h]]->getGroup()); }
+                printToCoutForRTesting(adjustedLookup, rand_s, class2GroupIndex, bins, subClass2GroupIndex, groups);
+            }
+            
+            //for each pair of classes
+            vector< vector<double> > temp = lda(adjustedLookup, rand_s, indexToClass, classes); //[numComparison][numOTUs]
+            if (temp.size() != 0) { results.push_back(temp); }
          }
          
-        if (error) { return significantOtuLabels; }
+        if (m->control_pressed) { return sigOTUS; }
          
+        //m = max([numpy.mean([means[k][kk][p] for kk in range(boots)]) for p in range(len(pairs))])
+        int k = 0;
+        for (it = bins.begin(); it != bins.end(); it++) { //[numOTUs] - need to go through bins so we can tie adjustedLookup back to the binNumber. adjustedLookup[0] ->bins entry[0]. 
+            vector<double> averageForEachComparison; averageForEachComparison.resize(results[0].size(), 0.0);
+            double maxM = 0.0; //max of averages for each comparison
+            for (int j = 0; j < results[0].size(); j++) { //numComparisons
+                for (int i = 0; i < results.size(); i++) { //iters
+                    averageForEachComparison[j]+= results[i][j][k];
+                }
+                averageForEachComparison[j] /= (double) results.size();
+                if (averageForEachComparison[j] > maxM) { maxM = averageForEachComparison[j]; }
+            }
+            //res[k] = math.copysign(1.0,m)*math.log(1.0+math.fabs(m),10) 
+            double multiple = 1.0; if (maxM < 0.0) { multiple = -1.0; }
+            double resK = multiple * log10(1.0+abs(maxM));
+            if (resK > ldaThreshold) { sigOTUS[it->first] = resK; }
+            k++;
+        }
+        
+        return sigOTUS;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "LefseCommand", "testLDA");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+vector< vector<double> > LefseCommand::getMeans(vector<SharedRAbundFloatVector*>& lookup, map<string, vector<int> >& class2GroupIndex) {
+    try {
          int numBins = lookup[0]->getNumBins();
-        vector<compGroup> comp;
-        //find comparisons and fill comp
-        map<string, int>::iterator itB;
-        for(map<string, int>::iterator it=subclassCounts.begin();it!=subclassCounts.end();it++){
-            itB = it;itB++;
-            for(itB;itB!=subclassCounts.end();itB++){
-                compGroup temp(it->first,itB->first);
-                comp.push_back(temp);
-            }                  
+        int numClasses = class2GroupIndex.size();
+        vector< vector<double> > means; //[numOTUS][classes]
+        means.resize(numBins);
+        for (int i = 0; i < means.size(); i++) {  means[i].resize(numClasses, 0.0); }
+        
+        map<int, string> indexToClass;
+        int count = 0;
+        //shortcut for vectors below
+        map<string, int> quickIndex;
+        vector<int> classCounts;
+        for (map<string, vector<int> >::iterator it = class2GroupIndex.begin(); it != class2GroupIndex.end(); it++) {
+            for (int i = 0; i < (it->second).size(); i++) { indexToClass[(it->second)[i]] = it->first; }
+            quickIndex[it->first] = count; count++;
+            classCounts.push_back((it->second).size());
+        }
+        
+        for (int i = 0; i < numBins; i++) {
+            for (int j = 0; j < lookup.size(); j++) {
+                if (m->control_pressed) { return means; }
+                means[i][quickIndex[indexToClass[j]]] += lookup[j]->getAbundance(i);
+            }
+        }
+        
+        for (int i = 0; i < numBins; i++) {
+            for (int j = 0; j < numClasses; j++) { means[i][j] /= (double) classCounts[j];  }
+        }
+        
+        return means;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "LefseCommand", "getMeans");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+vector< vector<double> > LefseCommand::lda(vector< vector<double> >& adjustedLookup, vector<int> rand_s, map<int, string>& indexToClass, vector<string> classes) {
+    try {
+        //shortcut for vectors below
+        map<string, int> quickIndex;
+        for (int i = 0; i < classes.size(); i++) { quickIndex[classes[i]] = i; }
+        
+        vector<string> randClass; //classes for rand sample
+        vector<int> counts; counts.resize(classes.size(), 0);
+        for (int i = 0; i < rand_s.size(); i++) {
+            string thisClass = indexToClass[rand_s[i]];
+            randClass.push_back(thisClass);
+            counts[quickIndex[thisClass]]++;
          }
  
-        int numComp = comp.size();
-        if (numComp < 2) {  m->mothurOut("[ERROR]: Need at least 2 subclasses, Ignoring Wilcoxon.\n");
-            return significantOtuLabels;  }
+        vector< vector<double> > a; //[numOTUs][numSampled]
+        for (int i = 0; i < adjustedLookup.size(); i++) {
+            vector<double> temp;
+            for (int j = 0; j < rand_s.size(); j++) {
+                temp.push_back(adjustedLookup[i][rand_s[j]]);
+            }
+            a.push_back(temp);
+        }
+        
+        LinearAlgebra linear;
+        vector< vector<double> > means; bool ignore;
+        vector< vector<double> > scaling = linear.lda(a, randClass, means, ignore); //means are returned sorted, quickIndex sorts as well since it uses a map. means[class][otu] =
+        if (ignore) { scaling.clear(); return scaling; }
+        if (m->control_pressed) { return scaling; }
+        
+        vector< vector<double> > w; w.resize(a.size()); //w.unit <- w/sqrt(sum(w^2))
+        double denom = 0.0;
+        for (int i = 0; i < scaling.size(); i++) { w[i].push_back(scaling[i][0]); denom += (w[i][0]*w[i][0]); }
+        denom = sqrt(denom);
+        for (int i = 0; i < w.size(); i++) {  w[i][0] /= denom;  } //[numOTUs][1] - w.unit
+        
+        //robjects.r('LD <- xy.matrix%*%w.unit') [numSampled][numOtus] * [numOTUs][1]
+        vector< vector<double> > LD = linear.matrix_mult(linear.transpose(a), w);
          
+        //find means for each groups LDs
+        vector<double> LDMeans; LDMeans.resize(classes.size(), 0.0); //means[0] -> average for [group0].
+        for (int i = 0; i < LD.size(); i++) {  LDMeans[quickIndex[randClass[i]]] += LD[i][0]; } 
+        for (int i = 0; i < LDMeans.size(); i++) { LDMeans[i] /= (double) counts[i]; }
+    
+               //calculate for each comparisons i.e. with groups A,B,C = AB, AC, BC = 3;
+        vector< vector<double> > results;// [numComparison][numOTUs]
+               for (int i = 0; i < LDMeans.size(); i++) {
+                       for (int l = 0; l < i; l++) {
+                if (m->control_pressed) { return scaling; }
+                //robjects.r('effect.size <- abs(mean(LD[sub_d[,"class"]=="'+p[0]+'"]) - mean(LD[sub_d[,"class"]=="'+p[1]+'"]))')
+                double effectSize = abs(LDMeans[i] - LDMeans[l]);
+                //scal = robjects.r('wfinal <- w.unit * effect.size')
+                vector<double> compResults;
+                for (int j = 0; j < w.size(); j++) { //[numOTUs][1]
+                    //coeff = [abs(float(v)) if not math.isnan(float(v)) else 0.0 for v in scal]
+                    double coeff = abs(w[j][0]*effectSize); if (isnan(coeff) || isinf(coeff)) { coeff = 0.0; }
+                    //gm = abs(res[p[0]][j] - res[p[1]][j]) - res is the means for each group for each otu
+                    double gm = abs(means[i][j] - means[l][j]);
+                    //means[k][i].append((gm+coeff[j])*0.5)
+                    compResults.push_back((gm+coeff)*0.5);
+                }
+                results.push_back(compResults);
+            }
+               }
+        
+        return results;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "LefseCommand", "lda");
+        exit(1);
+    }
+}
+
+//**********************************************************************************************************************
+//modelled after lefse.py contast_within_classes_or_few_per_class function
+bool LefseCommand::contastWithinClassesOrFewPerClass(vector< vector<double> >& lookup, vector<int> rands, int minCl, map<string, vector<int> > class2GroupIndex, map<int, string> indexToClass) {
+    try {
+        set<string> cls;
+        int countFound = 0;
+        
+        for (int i = 0; i < rands.size(); i++) { //fill cls with the classes represented in the random selection
+            for (map<string, vector<int> >::iterator it = class2GroupIndex.begin(); it != class2GroupIndex.end(); it++) {
+                if (m->inUsersGroups(rands[i], (it->second))) {
+                    cls.insert(it->first);
+                    countFound++;
+                }
+            }
+        }
+        
+        //sanity check
+        if (rands.size() != countFound) { m->mothurOut("oops, should never get here, missing something.\n"); }
+        
+        if (cls.size() < class2GroupIndex.size()) { return true; } //some classes are not present in sampling
+        
+        for (set<string>::iterator it = cls.begin(); it != cls.end(); it++) {
+            if (cls.count(*it) < minCl) { return true; } //this sampling has class count below minimum
+        }
+        
+        //for this otu
+        int numBins = lookup.size();
+        for (int i = 0; i < numBins; i++) {
+            if (m->control_pressed) { break; }
+                
+            //break up random sampling by class
+            map<string, set<double> > class2Values; //maps class name -> set of abunds present in random sampling. F003Early -> 0.001, 0.003... 
+            for (int j = 0; j < rands.size(); j++) {
+                class2Values[indexToClass[rands[j]]].insert(lookup[i][rands[j]]);
+                //rands[j] = index of randomly selected group in lookup, randIndex2Class[rands[j]] = class this group belongs to. lookup[rands[j]]->getAbundance(i) = abundance of this group for this OTU.
+            }
+            //are the unique values less than we want
+            //if (len(set(col)) <= min_cl and min_cl > 1) or (min_cl == 1 and len(set(col)) <= 1):
+            for (map<string, set<double> >::iterator it = class2Values.begin(); it != class2Values.end(); it++) {
+                if (((it->second).size() <= minCl && minCl > 1) || (minCl == 1 && (it->second).size() <= 1)) {  return true; }
+            }
+        }
+        
+        return false;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "LefseCommand", "contastWithinClassesOrFewPerClass");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+int LefseCommand::printResults(vector< vector<double> > means, map<int, double> sigKW, map<int, double> sigLDA, string label, vector<string> classes) {
+    try {
          map<string, string> variables;
          variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile));
-        variables["[distance]"] = lookup[0]->getLabel();
-        string outputFileName = getOutputFileName("wilcoxon",variables);
+        variables["[distance]"] = label;
+        string outputFileName = getOutputFileName("summary",variables);
+               ofstream out;
+               m->openOutputFile(outputFileName, out);
+               outputNames.push_back(outputFileName); outputTypes["summary"].push_back(outputFileName);
          
-        ofstream out;
-        m->openOutputFile(outputFileName, out);
-        outputNames.push_back(outputFileName); outputTypes["wilcoxon"].push_back(outputFileName);
-        out << "OTULabel\tComparision\tWilcoxon\tPvalue\n";
+        //output headers
+        out << "OTU\tLogMaxMean\tClass\tLDA\tpValue\n";
          
-        for (int i = 0; i < numBins; i++) {
+        string temp = "";
+        for (int i = 0; i < means.size(); i++) { //[numOTUs][classes]
+            //find max mean of classes
+            double maxMean = -1.0; string maxClass = "none";
+            for (int j = 0; j < means[i].size(); j++) {   if (means[i][j] > maxMean) { maxMean = means[i][j]; maxClass = classes[j]; } }
+            
+            //str(math.log(max(max(v),1.0),10.0))
+            double logMaxMean = 1.0;
+            if (maxMean > logMaxMean) { logMaxMean = maxMean; }
+            logMaxMean = log10(logMaxMean);
+            
+            out << m->currentBinLabels[i] << '\t' << logMaxMean << '\t';
+            if (m->debug) { temp = m->currentBinLabels[i] + '\t' + toString(logMaxMean) + '\t'; }
+            
+            map<int, double>::iterator it = sigLDA.find(i);
+            if (it != sigLDA.end()) {
+                out << maxClass << '\t' << it->second << '\t' << sigKW[i] << endl; //sigLDA is a subset of sigKW so no need to look
+                if (m->debug) { temp += maxClass + '\t' + toString(it->second) + '\t' + toString(sigKW[i]) + '\n'; m->mothurOut(temp); temp = ""; }
+            }else { out << '-' << endl; }
+        }
+        
+        out.close();
+        
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "LefseCommand", "printResults");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//printToCoutForRTesting(adjustedLookup, rand_s, class2GroupIndex, numBins);
+bool LefseCommand::printToCoutForRTesting(vector< vector<double> >& adjustedLookup, vector<int> rand_s, map<string, vector<int> >& class2GroupIndex, map<int, double> bins, map<string, vector<int> >& subClass2GroupIndex, vector<string> groups) {
+    try {
+        cout << "rand_s = ";
+        for (int h = 0; h < rand_s.size(); h++) { cout << rand_s[h] << '\t'; } cout << endl;
+        
+        //print otu data
+        int count = 0;
+        for (map<int, double>::iterator it = bins.begin(); it != bins.end(); it++) {
+            if (m->control_pressed) { break; }
+            
+            cout << m->currentBinLabels[it->first] << " <- c(";
+            for (int h = 0; h < rand_s.size()-1; h++) {  cout << (adjustedLookup[count][rand_s[h]]) << ", "; }
+            cout << (adjustedLookup[count][rand_s[rand_s.size()-1]]) << ")\n";
+            count++;
+        }
+        /*
+        string tempOutput = "";
+        for (int h = 0; h < rand_s.size(); h++) {
+            //find class this index is in
+            for (map<string, vector<int> >::iterator it = class2GroupIndex.begin(); it!= class2GroupIndex.end(); it++) {
+                if (m->inUsersGroups(rand_s[h], (it->second)) ) {   cout << (h+1) << " <- c(\"" +it->first + "\")\n" ; }
+            }
+        }*/
+        
+       
+        string tempOutput = "treatments <- c(";
+        for (int h = 0; h < rand_s.size(); h++) {
+            //find class this index is in
+            for (map<string, vector<int> >::iterator it = class2GroupIndex.begin(); it!= class2GroupIndex.end(); it++) {
+                if (m->inUsersGroups(rand_s[h], (it->second)) ) {   tempOutput += "\"" +it->first + "\"" + ","; } //"\"" +it->first + "\""
+            }
+        }
+        tempOutput = tempOutput.substr(0, tempOutput.length()-1);
+        tempOutput += ")\n";
+        cout << tempOutput;
+        
+         /*
+        if (subclass != "") {
+            string tempOutput = "sub <- c(";
+            for (int h = 0; h < rand_s.size(); h++) {
+                //find class this index is in
+                for (map<string, vector<int> >::iterator it = subClass2GroupIndex.begin(); it!= subClass2GroupIndex.end(); it++) {
+                    if (m->inUsersGroups(rand_s[h], (it->second)) ) {   tempOutput += "\"" +it->first + "\"" + ','; }
+                }
+            }
+            tempOutput = tempOutput.substr(0, tempOutput.length()-1);
+            tempOutput += ")\n";
+            cout << tempOutput;
+        }
+        
+        if (subject) {
+            string tempOutput = "group <- c(";
+            for (int h = 0; h < groups.size(); h++) {
+                tempOutput += "\"" +groups[h] + "\"" + ','; 
+            }
+            tempOutput = tempOutput.substr(0, tempOutput.length()-1);
+            tempOutput += ")\n";
+            cout << tempOutput;
+        }*/
+
+        
+        //print data frame
+        tempOutput = "dat <- data.frame(";
+        for (map<int, double>::iterator it = bins.begin(); it != bins.end(); it++) {
              if (m->control_pressed) { break; }
              
-            if (m->inUsersGroups(i, bins)) { //flagged in Kruskal Wallis
+            tempOutput += "\"" + m->currentBinLabels[it->first] + "\"=" + m->currentBinLabels[it->first] + ",";
+        }
+        //tempOutput = tempOutput.substr(0, tempOutput.length()-1);
+        tempOutput += " class=treatments";
+        //if (subclass != "") { tempOutput += ", subclass=sub"; }
+        //if (subject) { tempOutput += ", subject=group"; }
+        tempOutput += ")\n";
+        cout << tempOutput;
                  
-                bool sig = false;
-                //for each subclass comparision
-                for (int j = 0; j < numComp; j++) {
-                    //fill x and y with this comparisons data
-                    vector<double> x; vector<double> y; 
-                    
-                    cout << m->currentBinLabels[i] << '\t' << comp[j].getCombo() << " x <- (";
-                    //fill x and y
-                    vector<int> xIndexes = subClass2GroupIndex[comp[j].group1]; //indexes in lookup for this subclass
-                    for (int k = 0; k < xIndexes.size(); k++) { x.push_back(lookup[xIndexes[k]]->getAbundance(i)); cout << lookup[xIndexes[k]]->getAbundance(i) << ", "; }
-                    cout << ")\n";
-                    
-                    cout << m->currentBinLabels[i] << '\t' << comp[j].getCombo() << " y <- (";
-                    
-                    vector<int> yIndexes = subClass2GroupIndex[comp[j].group2]; //indexes in lookup for this subclass
-                    for (int k = 0; k < yIndexes.size(); k++) { y.push_back(lookup[yIndexes[k]]->getAbundance(i)); cout << lookup[yIndexes[k]]->getAbundance(i) << ", ";}
-                    cout << ")\n";
-                    
-                    double pValue = 0.0;
-                    double H = linear.calcWilcoxon(x, y, pValue);
+        tempOutput = "z <- suppressWarnings(mylda(as.formula(class ~ ";
+        for (map<int, double>::iterator it = bins.begin(); it != bins.end(); it++) {
+            if (m->control_pressed) { break; }
              
-                    //output H and signifigance
-                    if (!isnan(pValue)) { out << m->currentBinLabels[i] << '\t' << comp[j].getCombo() << '\t' << H << '\t' << pValue << endl; }
-                    else { out << m->currentBinLabels[i] << '\t' << comp[j].getCombo() << '\t' << H << '\t' << "NA" << endl; }
-                    
-                    //set sig - not sure how yet
-                }
-                if (sig) {  significantOtuLabels.push_back(i);  }
+            tempOutput +=  m->currentBinLabels[it->first] + "+";
+        }
+        tempOutput = tempOutput.substr(0, tempOutput.length()-1); //rip off extra plus sign
+        tempOutput += "), data = dat, tol = 1e-10))";
+        cout << tempOutput + "\nz\n";
+        cout << "w <- z$scaling[,1]\n"; //robjects.r('w <- z$scaling[,1]')
+        cout << "w.unit <- w/sqrt(sum(w^2))\n"; //robjects.r('w.unit <- w/sqrt(sum(w^2))')
+        cout << "ss <- dat[,-match(\"class\",colnames(dat))]\n"; //robjects.r('ss <- sub_d[,-match("class",colnames(sub_d))]')
+        //if (subclass != "") { cout << "ss <- ss[,-match(\"subclass\",colnames(ss))]\n";  }//robjects.r('ss <- ss[,-match("subclass",colnames(ss))]')
+        //if (subject) { cout << "ss <- ss[,-match(\"subject\",colnames(ss))]\n";  }//robjects.r('ss <- ss[,-match("subject",colnames(ss))]')
+        cout << "xy.matrix <- as.matrix(ss)\n"; //robjects.r('xy.matrix <- as.matrix(ss)')
+        cout << "LD <- xy.matrix%*%w.unit\n"; //robjects.r('LD <- xy.matrix%*%w.unit')
+        cout << "effect.size <- abs(mean(LD[dat[,\"class\"]==\"'+p[0]+'\"]) - mean(LD[dat[,\"class\"]==\"'+p[1]+'\"]))\n"; //robjects.r('effect.size <- abs(mean(LD[sub_d[,"class"]=="'+p[0]+'"]) - mean(LD[sub_d[,"class"]=="'+p[1]+'"]))')
+        cout << "wfinal <- w.unit * effect.size\n"; //scal = robjects.r('wfinal <- w.unit * effect.size')
+        cout << "mm <- z$means\n"; //rres = robjects.r('mm <- z$means')
+
+        return true;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "LefseCommand", "printToCoutForRTesting");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+int LefseCommand::makeShared(int numDesignLines) {
+    try {
+        ifstream in;
+        m->openInputFile(sharedfile, in);
+        vector< vector<string> > lines;
+        for(int i = 0; i < numDesignLines; i++) {
+            if (m->control_pressed) { return 0; }
+            
+            string line = m->getline(in);
+            cout << line << endl;
+            vector<string> pieces = m->splitWhiteSpace(line);
+            lines.push_back(pieces);
+        }
+        
+        ofstream out;
+        m->openOutputFile(sharedfile+".design", out); out << "group" << '\t';
+        for (int j = 0; j < lines.size(); j++) { out << lines[j][0] << '\t'; } out << endl;
+        for (int j = 1; j < lines[0].size(); j++) {
+            out <<(j-1) << '\t';
+            for (int i = 0; i < lines.size(); i++) {
+                 out << lines[i][j] << '\t';
              }
+            out << endl;
          }
          out.close();
+        DesignMap design(sharedfile+".design");
          
-        return significantOtuLabels;
+        vector<SharedRAbundFloatVector*> lookup;
+        for (int k = 0; k < lines[0].size()-1; k++) {
+            SharedRAbundFloatVector* temp = new SharedRAbundFloatVector();
+            temp->setLabel("0.03");
+            temp->setGroup(toString(k));
+            lookup.push_back(temp);
+        }
+        
+        m->currentBinLabels.clear();
+        int count = 0;
+        while (!in.eof()) {
+            if (m->control_pressed) { return 0; }
+            
+            string line = m->getline(in);
+            vector<string> pieces = m->splitWhiteSpace(line);
+            
+            float sum = 0.0;
+            for (int i = 1; i < pieces.size(); i++) {
+                float value; m->mothurConvert(pieces[i], value);
+                sum += value;
+            }
+            
+            if (sum != 0.0) {
+                //cout << count << '\t';
+                for (int i = 1; i < pieces.size(); i++) {
+                    float value; m->mothurConvert(pieces[i], value);
+                    lookup[i-1]->push_back(value, toString(i-1));
+                    //cout << pieces[i] << '\t';
+                }
+                m->currentBinLabels.push_back(toString(count));
+                //m->currentBinLabels.push_back(pieces[0]);
+                //cout << line<< endl;
+                //cout << endl;
+            }
+            count++;
+        }
+        in.close();
+        
+        for (int k = 0; k < lookup.size(); k++) {
+            //cout << "0.03" << '\t' << toString(k) << endl; lookup[k]->print(cout);
+        }
+        
+        process(lookup, design);
+        
+        return 0;
      }
      catch(exception& e) {
-        m->errorOut(e, "LefseCommand", "runWilcoxon");
+        m->errorOut(e, "LefseCommand", "printToCoutForRTesting");
          exit(1);
      }
  }
diff --git a/lefsecommand.h b/lefsecommand.h

index c9c4092e99f6bb402145f2e3432753ce642c7c6e..1c5ac6f0c251e588986e7f0efa1e027026e88f20 100644 (file)
--- a/lefsecommand.h
+++ b/lefsecommand.h
@@ -48,16 +48,27 @@ public:
      void help() { m->mothurOut(getHelpString()); }
      
  private:
-    bool abort, allLines;
-    string outputDir, sharedfile, designfile, mclass, subclass, classes;
+    bool abort, allLines, wilc, wilcsamename, curv, subject, normMillion;
+    string outputDir, sharedfile, designfile, mclass, subclass, classes, rankTec, multiClassStrat;
      vector<string> outputNames;
      set<string> labels;
-    float anovaAlpha, wilcoxonAlpha;
+    double anovaAlpha, wilcoxonAlpha, fBoots, ldaThreshold;
+    int nlogs, iters, strict, minC;
      
-    int process(vector<SharedRAbundVector*>&, DesignMap&);
-    vector<int> runKruskalWallis(vector<SharedRAbundVector*>&, DesignMap&);
-    vector<int> runWilcoxon(vector<SharedRAbundVector*>&, DesignMap&, vector<int>);
-
+    int process(vector<SharedRAbundFloatVector*>&, DesignMap&);
+    int normalize(vector<SharedRAbundFloatVector*>&);
+    map<int, double> runKruskalWallis(vector<SharedRAbundFloatVector*>&, DesignMap&);
+    map<int, double> runWilcoxon(vector<SharedRAbundFloatVector*>&, DesignMap&, map<int, double>, map<string, set<string> >& class2SubClasses, map<string, vector<int> >& subClass2GroupIndex, map<string, string>);
+    bool testOTUWilcoxon(map<string, set<string> >& class2SubClasses, vector<float> abunds, map<string, vector<int> >& subClass2GroupIndex, map<string, string>);
+    map<int, double> testLDA(vector<SharedRAbundFloatVector*>&, map<int, double>, map<string, vector<int> >& class2GroupIndex, map<string, vector<int> >&);
+    bool contastWithinClassesOrFewPerClass(vector< vector<double> >&, vector<int> rands, int minCl, map<string, vector<int> > class2GroupIndex,  map<int, string> indexToClass);
+    vector< vector<double> > lda(vector< vector<double> >& adjustedLookup, vector<int> rand_s, map<int, string>& indexToClass, vector<string>);
+    vector< vector<double> > getMeans(vector<SharedRAbundFloatVector*>& lookup, map<string, vector<int> >& class2GroupIndex);
+    int printResults(vector< vector<double> >, map<int, double>, map<int, double>, string, vector<string>);
+    
+    //for testing
+    bool printToCoutForRTesting(vector< vector<double> >& adjustedLookup, vector<int> rand_s, map<string, vector<int> >& class2GroupIndex, map<int, double> bins, map<string, vector<int> >&, vector<string>);
+    int makeShared(int);
  };
  
  /**************************************************************************************************/
diff --git a/linearalgebra.cpp b/linearalgebra.cpp

index cd2ca0397bb21493761c69f729893103fc314613..fdd95ec9602cb3b44cf3e1aa54934fc3045982c8 100644 (file)
--- a/linearalgebra.cpp
+++ b/linearalgebra.cpp
@@ -253,7 +253,7 @@ double LinearAlgebra::betacf(const double a, const double b, const double x) {
         }
  }
  /*********************************************************************************************************************************/
-
+//[3][4] * [4][5] - columns in first must match rows in second, returns matrix[3][5]
  vector<vector<double> > LinearAlgebra::matrix_mult(vector<vector<double> > first, vector<vector<double> > second){
         try {
                 vector<vector<double> > product;
@@ -287,7 +287,23 @@ vector<vector<double> > LinearAlgebra::matrix_mult(vector<vector<double> > first
         }
         
  }
+/*********************************************************************************************************************************/
  
+vector<vector<double> > LinearAlgebra::transpose(vector<vector<double> >matrix){
+       try {
+               vector<vector<double> > trans; trans.resize(matrix[0].size());
+        for (int i = 0; i < trans.size(); i++) {
+            for (int j = 0; j < matrix.size(); j++) { trans[i].push_back(matrix[j][i]); }
+        }
+                               
+               return trans;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "LinearAlgebra", "transpose");
+               exit(1);
+       }
+       
+}
  /*********************************************************************************************************************************/
  
  void LinearAlgebra::recenter(double offset, vector<vector<double> > D, vector<vector<double> >& G){
@@ -1304,6 +1320,31 @@ double LinearAlgebra::calcKruskalWallis(vector<spearmanRank>& values, double& pV
         }
  }
  /*********************************************************************************************************************************/
+//python random.normalvariate - thanks http://madssj.com/blog/2008/05/07/porting-normalvariate-from-python-to-c/
+double LinearAlgebra::normalvariate(double mu, double sigma) {
+    try {
+        double NV_MAGICCONST = 1.7155277699214135; /* (4 * exp(-0.5) / sqrt(2.0)); */
+        unsigned long int MAX_RANDOM = 2147483647; /* (2 ** 31) - 1; */
+        
+        double u1, u2, z, zz;
+        for (;;) {
+            if (m->control_pressed) { break; }
+            u1 = ((float)random()) / MAX_RANDOM;
+            u2 = 1.0 - (((float)random()) / MAX_RANDOM);
+            z = NV_MAGICCONST * (u1 - 0.5) / u2;
+            zz = z * z / 4.0;
+            if (zz <= -(log(u2))) {
+                break;
+            }
+        }
+        return mu + z * sigma;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "LinearAlgebra", "normalvariate");
+               exit(1);
+       }
+}
+/*********************************************************************************************************************************/
  //thanks http://www.johndcook.com/cpp_phi.html
  double LinearAlgebra::pnorm(double x){
      try {
@@ -2014,3 +2055,565 @@ vector<vector<double> > LinearAlgebra::getInverse(vector<vector<double> > matrix
                 exit(1);
         }
  }
+/*********************************************************************************************************************************/
+//modelled R lda function - MASS:::lda.default
+vector< vector<double> > LinearAlgebra::lda(vector< vector<double> >& a, vector<string> groups, vector< vector<double> >& means, bool& ignore) {
+    try {
+        
+        set<string> uniqueGroups;
+        for (int i = 0; i < groups.size(); i++) { uniqueGroups.insert(groups[i]); }
+        int numGroups = uniqueGroups.size();
+        
+        map<string, int> quickIndex; //className to index. hoping to save counts, proportions and means in vectors to save time. This map will allow us to know index 0 in counts refers to group1.
+        int count = 0;
+        for (set<string>::iterator it = uniqueGroups.begin(); it != uniqueGroups.end(); it++) { quickIndex[*it] = count; count++; }
+        
+        int numSampled = groups.size(); //number of sampled groups
+        int numOtus = a.size(); //number of flagged bins
+        
+        //counts <- as.vector(table(g)) //number of samples from each class in random sampling
+        vector<int> counts; counts.resize(numGroups, 0);
+        for (int i = 0; i < groups.size(); i++) {
+            counts[quickIndex[groups[i]]]++;
+        }
+        
+        vector<double> proportions; proportions.resize(numGroups, 0.0);
+        for (int i = 0; i < numGroups; i++) {  proportions[i] = counts[i] / (double) numSampled; }
+        
+        means.clear(); //means[0] -> means[0][0] average for [group0][OTU0].
+        means.resize(numGroups); for (int i = 0; i < means.size(); i++) { means[i].resize(numOtus, 0.0); }
+        for (int j = 0; j < numSampled; j++) { //total for each class for each OTU
+            for (int i = 0; i < numOtus; i++) { means[quickIndex[groups[j]]][i] += a[i][j]; }
+        }
+        //average for each class for each OTU
+        for (int j = 0; j < numGroups; j++) { for (int i = 0; i < numOtus; i++) { means[j][i] /= counts[j]; }  }
+        
+        //randCov <- x - group.means[g, ]
+        vector< vector<double> > randCov; //randCov[0][0] -> (random sample value0 for OTU0 - average for samples group in OTU0). example OTU0, random sample 0.01 from class early. average of class early for OTU0 is 0.005. randCov[0][0] = (0.01-0.005)
+        for (int i = 0; i < numOtus; i++) { //for each flagged OTU
+            vector<double> tempRand;
+            for (int j = 0; j < numSampled; j++) { tempRand.push_back(a[i][j] - means[quickIndex[groups[j]]][i]);  }
+            randCov.push_back(tempRand);
+        }
+        
+        //find variance and std for each OTU
+        //f1 <- sqrt(diag(var(x - group.means[g, ])))
+        vector<double> stdF1;
+        vector<double> ave;
+        for (int i = 0; i < numOtus; i++) {
+            stdF1.push_back(0.0);
+            ave.push_back(m->getAverage(randCov[i]));
+        }
+        
+        for (int i = 0; i < numOtus; i++) {
+            for (int j = 0; j < numSampled; j++) { stdF1[i] += ((randCov[i][j] - ave[i]) * (randCov[i][j] - ave[i]));  }
+        }
+        
+        //fac <- 1/(n - ng)
+        double fac = 1 / (double) (numSampled-numGroups);
+        
+        for (int i = 0; i < stdF1.size(); i++) {
+            stdF1[i] /= (double) (numSampled-1);
+            stdF1[i] = sqrt(stdF1[i]);
+        }
+        
+        vector< vector<double> > scaling; //[numOTUS][numOTUS]
+        for (int i = 0; i < numOtus; i++) {
+            vector<double> temp;
+            for (int j = 0; j < numOtus; j++) {
+                if (i == j) { temp.push_back(1.0/stdF1[i]); }
+                else { temp.push_back(0.0); }
+                
+            }
+            scaling.push_back(temp);
+        }
+        /*
+         cout << "scaling = " << endl;
+         for (int i = 0; i < scaling.size(); i++) {
+         for (int j = 0; j < scaling[i].size(); j++) { cout << scaling[i][j] << '\t'; }
+         cout << endl;
+         }*/
+        
+        //X <- sqrt(fac) * ((x - group.means[g, ]) %*% scaling)
+        vector< vector<double> > X = randCov; //[numOTUS][numSampled]
+        //((x - group.means[g, ]) %*% scaling)
+        //matrix multiplication of randCov and scaling
+        LinearAlgebra linear;
+        X = linear.matrix_mult(scaling, randCov); //[numOTUS][numOTUS] * [numOTUS][numSampled] = [numOTUS][numSampled]
+        fac = sqrt(fac);
+        
+        for (int i = 0; i < X.size(); i++) {
+            for (int j = 0; j < X[i].size(); j++) { X[i][j] *= fac;  }
+        }
+        
+        vector<double> d;
+        vector< vector<double> > v;
+        vector< vector<double> > Xcopy; //X = [numOTUS][numSampled]
+        bool transpose = false; //svd requires rows < columns, so if they are not then I need to transpose and look for the results in v.
+        if (X.size() < X[0].size()) { Xcopy = linear.transpose(X); transpose=true; }
+        else                        { Xcopy = X;                    }
+        linear.svd(Xcopy, d, v); //Xcopy gets the results we want for v below, because R's version is [numSampled][numOTUS]
+        
+        /*cout << "Xcopy = " << endl;
+        for (int i = 0; i < Xcopy.size(); i++) {
+            for (int j = 0; j < Xcopy[i].size(); j++) { cout << Xcopy[i][j] << '\t'; }
+            cout << endl;
+        }
+        cout << "v = " << endl;
+        for (int i = 0; i < v.size(); i++) {
+            for (int j = 0; j < v[i].size(); j++) { cout << v[i][j] << '\t'; }
+            cout << endl;
+        }
+         */
+        
+        int rank = 0;
+        set<int> goodColumns;
+        //cout << "d = " << endl;
+        for (int i = 0; i < d.size(); i++) {  if (d[i] > 0.0000000001) { rank++; goodColumns.insert(i); } } //cout << d[i] << endl;
+        
+        if (rank == 0) {
+            ignore=true; //m->mothurOut("[ERROR]: rank = 0: variables are numerically const\n"); m->control_pressed = true;
+            return scaling; }
+        
+        //scaling <- scaling %*% X.s$v[, 1L:rank] %*% diag(1/X.s$d[1L:rank], , rank)
+        //X.s$v[, 1L:rank] = columns in Xcopy that correspond to "good" d values
+        //diag(1/X.s$d[1L:rank], , rank) = matrix size rank * rank where the diagonal is 1/"good" dvalues
+        /*example:
+         d
+         [1] 3.721545e+00 3.034607e+00 2.296649e+00 7.986927e-16 6.922408e-16
+         [6] 5.471102e-16
+         
+         $v
+         [,1]        [,2]        [,3]        [,4]        [,5]        [,6]
+         [1,]  0.31122175  0.10944725  0.20183340 -0.30136820  0.60786235 -0.13537095
+         [2,] -0.29563726 -0.20568893  0.11233366 -0.05073289  0.48234270  0.21965978
+         ...
+         
+         [1] "X.s$v[, 1L:rank]"
+         [,1]        [,2]        [,3]
+         [1,]  0.31122175  0.10944725  0.20183340
+         [2,] -0.29563726 -0.20568893  0.11233366
+         ...
+         [1] "1/X.s$d[1L:rank]"
+         [1] 0.2687056 0.3295320 0.4354170
+         
+         [1] "diag(1/X.s$d[1L:rank], , rank)"
+         [,1]     [,2]     [,3]
+         [1,] 0.2687056 0.000000 0.000000
+         [2,] 0.0000000 0.329532 0.000000
+         [3,] 0.0000000 0.000000 0.435417
+         */
+        if (transpose) {
+            Xcopy = linear.transpose(v);
+            /*
+            cout << "Xcopy = " << endl;
+            for (int i = 0; i < Xcopy.size(); i++) {
+                for (int j = 0; j < Xcopy[i].size(); j++) { cout << Xcopy[i][j] << '\t'; }
+                cout << endl;
+            }*/
+        }
+        v.clear(); //store "good" columns - X.s$v[, 1L:rank]
+        v.resize(Xcopy.size()); //[numOTUS]["good" columns]
+        for (set<int>::iterator it = goodColumns.begin(); it != goodColumns.end(); it++) {
+            for (int i = 0; i < Xcopy.size(); i++) {
+                v[i].push_back(Xcopy[i][*it]);
+            }
+        }
+        
+        vector< vector<double> > diagRanks; diagRanks.resize(rank);
+        for (int i = 0; i < rank; i++) { diagRanks[i].resize(rank, 0.0); }
+        count = 0;
+        for (set<int>::iterator it = goodColumns.begin(); it != goodColumns.end(); it++) {  diagRanks[count][count] = 1.0 / d[*it]; count++; }
+        
+        scaling = linear.matrix_mult(linear.matrix_mult(scaling, v), diagRanks); //([numOTUS][numOTUS]*[numOTUS]["good" columns]) = [numOTUS]["good" columns] then ([numOTUS]["good" columns] * ["good" columns]["good" columns] = scaling = [numOTUS]["good" columns]
+        
+        /*cout << "scaling = " << endl;
+        for (int i = 0; i < scaling.size(); i++) {
+            for (int j = 0; j < scaling[i].size(); j++) { cout << scaling[i][j] << '\t'; }
+            cout << endl;
+        }*/
+        
+        //Note: linear.matrix_mult [1][numGroups] * [numGroups][numOTUs] - columns in first must match rows in second, returns matrix[1][numOTUs]
+        vector< vector<double> > prior; prior.push_back(proportions);
+        vector< vector<double> >  xbar = linear.matrix_mult(prior, means);
+        vector<double> xBar = xbar[0]; //length numOTUs
+        
+        /*cout << "xbar" << endl;
+        for (int j = 0; j < numOtus; j++) {  cout << xBar[j] <<'\t'; }cout <<  endl;*/
+        //fac <- 1/(ng - 1)
+        fac = 1 / (double) (numGroups-1);
+        //scale(group.means, center = xbar, scale = FALSE) %*% scaling
+        vector< vector<double> > scaledMeans = means; //[numGroups][numOTUs]
+        for (int i = 0; i < numGroups; i++) {
+            for (int j = 0; j < numOtus; j++) {  scaledMeans[i][j] -= xBar[j]; }
+        }
+        scaledMeans = linear.matrix_mult(scaledMeans, scaling); //[numGroups][numOTUS]*[numOTUS]["good"columns] = [numGroups]["good"columns]
+        
+        
+        //sqrt((n * prior) * fac)
+        vector<double> temp = proportions; //[numGroups]
+        for (int i = 0; i < temp.size(); i++) { temp[i] *= numSampled * fac; temp[i] = sqrt(temp[i]);  }
+        
+        //X <- sqrt((n * prior) * fac) * (scale(group.means, center = xbar, scale = FALSE) %*% scaling)
+        //X <- temp * scaledMeans
+        X.clear(); X = scaledMeans; //[numGroups]["good"columns]
+        for (int i = 0; i < X.size(); i++) {
+            for (int j = 0; j < X[i].size(); j++) {  X[i][j] *= temp[j];  }
+        }
+        /*
+        cout << "X = " << endl;
+        for (int i = 0; i < X.size(); i++) {
+            for (int j = 0; j < X[i].size(); j++) { cout << X[i][j] << '\t'; }
+            cout << endl;
+        }
+        */
+        
+        d.clear(); v.clear();
+        //we want to transpose so results are in Xcopy, but if that makes rows > columns then we don't since svd requires rows < cols.
+        transpose=false;
+        if (X.size() > X[0].size()) {   Xcopy = X;  transpose=true;     }
+        else                        {   Xcopy = linear.transpose(X);    }
+        linear.svd(Xcopy, d, v); //Xcopy gets the results we want for v below
+        /*cout << "Xcopy = " << endl;
+        for (int i = 0; i < Xcopy.size(); i++) {
+            for (int j = 0; j < Xcopy[i].size(); j++) { cout << Xcopy[i][j] << '\t'; }
+            cout << endl;
+        }
+        
+        cout << "v = " << endl;
+        for (int i = 0; i < v.size(); i++) {
+            for (int j = 0; j < v[i].size(); j++) { cout << v[i][j] << '\t'; }
+            cout << endl;
+        }
+        
+        cout << "d = " << endl;
+        for (int i = 0; i < d.size(); i++) { cout << d[i] << endl; }*/
+        
+        //rank <- sum(X.s$d > tol * X.s$d[1L])
+        //X.s$d[1L] = larger value in d vector
+        double largeD = m->max(d);
+        rank = 0; goodColumns.clear();
+        for (int i = 0; i < d.size(); i++) { if (d[i] > (0.0000000001*largeD)) { rank++; goodColumns.insert(i); } }
+        
+        if (rank == 0) {
+            ignore=true;//m->mothurOut("[ERROR]: rank = 0: class means are numerically identical.\n"); m->control_pressed = true;
+            return scaling; }
+        
+        if (transpose) { Xcopy = linear.transpose(v);  }
+        //scaling <- scaling %*% X.s$v[, 1L:rank] - scaling * "good" columns
+        v.clear(); //store "good" columns - X.s$v[, 1L:rank]
+        v.resize(Xcopy.size()); //Xcopy = ["good"columns][numGroups]
+        for (set<int>::iterator it = goodColumns.begin(); it != goodColumns.end(); it++) {
+            for (int i = 0; i < Xcopy.size(); i++) {
+                v[i].push_back(Xcopy[i][*it]);
+            }
+        }
+        
+        scaling = linear.matrix_mult(scaling, v); //[numOTUS]["good" columns] * ["good"columns][new "good" columns]
+        
+        /*cout << "scaling = " << endl;
+        for (int i = 0; i < scaling.size(); i++) {
+            for (int j = 0; j < scaling[i].size(); j++) { cout << scaling[i][j] << '\t'; }
+            cout << endl;
+        }*/
+        ignore=false;
+        return scaling;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "LinearAlgebra", "lda");
+               exit(1);
+       }
+}
+/*********************************************************************************************************************************/
+//Singular value decomposition (SVD) - adapted from http://svn.lirec.eu/libs/magicsquares/src/SVD.cpp
+/*
+ * svdcomp - SVD decomposition routine.
+ * Takes an mxn matrix a and decomposes it into udv, where u,v are
+ * left and right orthogonal transformation matrices, and d is a
+ * diagonal matrix of singular values.
+ *
+ * This routine is adapted from svdecomp.c in XLISP-STAT 2.1 which is
+ * code from Numerical Recipes adapted by Luke Tierney and David Betz.
+ *
+ * Input to dsvd is as follows:
+ *   a = mxn matrix to be decomposed, gets overwritten with u
+ *   m = row dimension of a
+ *   n = column dimension of a
+ *   w = returns the vector of singular values of a
+ *   v = returns the right orthogonal transformation matrix
+ */
+
+int LinearAlgebra::svd(vector< vector<double> >& a, vector<double>& w, vector< vector<double> >& v) {
+    try {
+        int flag, i, its, j, jj, k, l, nm;
+        double c, f, h, s, x, y, z;
+        double anorm = 0.0, g = 0.0, scale = 0.0;
+
+        int numRows = a.size(); if (numRows == 0) { return 0; }
+        int numCols = a[0].size();
+        w.resize(numCols, 0.0);
+        v.resize(numCols); for (int i = 0; i < numCols; i++) { v[i].resize(numRows, 0.0); }
+    
+        vector<double> rv1; rv1.resize(numCols, 0.0);
+        if (numRows < numCols){  m->mothurOut("[ERROR]: numRows < numCols\n"); m->control_pressed = true; return 0; }
+
+        /* Householder reduction to bidiagonal form */
+        for (i = 0; i < numCols; i++)
+        {
+            /* left-hand reduction */
+            l = i + 1;
+            rv1[i] = scale * g;
+            g = s = scale = 0.0;
+            if (i < numRows)
+            {
+                for (k = i; k < numRows; k++)
+                    scale += fabs((double)a[k][i]);
+                if (scale)
+                {
+                    for (k = i; k < numRows; k++)
+                    {
+                        a[k][i] = (double)((double)a[k][i]/scale);
+                        s += ((double)a[k][i] * (double)a[k][i]);
+                    }
+                    f = (double)a[i][i];
+                    g = -SIGN(sqrt(s), f);
+                    h = f * g - s;
+                    a[i][i] = (double)(f - g);
+                    if (i != numCols - 1)
+                    {
+                        for (j = l; j < numCols; j++)
+                        {
+                            for (s = 0.0, k = i; k < numRows; k++)
+                                s += ((double)a[k][i] * (double)a[k][j]);
+                            f = s / h;
+                            for (k = i; k < numRows; k++)
+                                a[k][j] += (double)(f * (double)a[k][i]);
+                        }
+                    }
+                    for (k = i; k < numRows; k++)
+                        a[k][i] = (double)((double)a[k][i]*scale);
+                }
+            }
+            w[i] = (double)(scale * g);
+            
+            /* right-hand reduction */
+            g = s = scale = 0.0;
+            if (i < numRows && i != numCols - 1)
+            {
+                for (k = l; k < numCols; k++)
+                    scale += fabs((double)a[i][k]);
+                if (scale)
+                {
+                    for (k = l; k < numCols; k++)
+                    {
+                        a[i][k] = (double)((double)a[i][k]/scale);
+                        s += ((double)a[i][k] * (double)a[i][k]);
+                    }
+                    f = (double)a[i][l];
+                    g = -SIGN(sqrt(s), f);
+                    h = f * g - s;
+                    a[i][l] = (double)(f - g);
+                    for (k = l; k < numCols; k++)
+                        rv1[k] = (double)a[i][k] / h;
+                    if (i != numRows - 1)
+                    {
+                        for (j = l; j < numRows; j++)
+                        {
+                            for (s = 0.0, k = l; k < numCols; k++)
+                                s += ((double)a[j][k] * (double)a[i][k]);
+                            for (k = l; k < numCols; k++)
+                                a[j][k] += (double)(s * rv1[k]);
+                        }
+                    }
+                    for (k = l; k < numCols; k++)
+                        a[i][k] = (double)((double)a[i][k]*scale);
+                }
+            }
+            anorm = max(anorm, (fabs((double)w[i]) + fabs(rv1[i])));
+        }
+        
+        /* accumulate the right-hand transformation */
+        for (i = numCols - 1; i >= 0; i--)
+        {
+            if (i < numCols - 1)
+            {
+                if (g)
+                {
+                    for (j = l; j < numCols; j++)
+                        v[j][i] = (double)(((double)a[i][j] / (double)a[i][l]) / g);
+                    /* double division to avoid underflow */
+                    for (j = l; j < numCols; j++)
+                    {
+                        for (s = 0.0, k = l; k < numCols; k++)
+                            s += ((double)a[i][k] * (double)v[k][j]);
+                        for (k = l; k < numCols; k++)
+                            v[k][j] += (double)(s * (double)v[k][i]);
+                    }
+                }
+                for (j = l; j < numCols; j++)
+                    v[i][j] = v[j][i] = 0.0;
+            }
+            v[i][i] = 1.0;
+            g = rv1[i];
+            l = i;
+        }
+        
+        /* accumulate the left-hand transformation */
+        for (i = numCols - 1; i >= 0; i--)
+        {
+            l = i + 1;
+            g = (double)w[i];
+            if (i < numCols - 1)
+                for (j = l; j < numCols; j++)
+                    a[i][j] = 0.0;
+            if (g)
+            {
+                g = 1.0 / g;
+                if (i != numCols - 1)
+                {
+                    for (j = l; j < numCols; j++)
+                    {
+                        for (s = 0.0, k = l; k < numRows; k++)
+                            s += ((double)a[k][i] * (double)a[k][j]);
+                        f = (s / (double)a[i][i]) * g;
+                        for (k = i; k < numRows; k++)
+                            a[k][j] += (double)(f * (double)a[k][i]);
+                    }
+                }
+                for (j = i; j < numRows; j++)
+                    a[j][i] = (double)((double)a[j][i]*g);
+            }
+            else
+            {
+                for (j = i; j < numRows; j++)
+                    a[j][i] = 0.0;
+            }
+            ++a[i][i];
+        }
+        
+        /* diagonalize the bidiagonal form */
+        for (k = numCols - 1; k >= 0; k--)
+        {                             /* loop over singular values */
+            for (its = 0; its < 30; its++)
+            {                         /* loop over allowed iterations */
+                flag = 1;
+                for (l = k; l >= 0; l--)
+                {                     /* test for splitting */
+                    nm = l - 1;
+                    if (fabs(rv1[l]) + anorm == anorm)
+                    {
+                        flag = 0;
+                        break;
+                    }
+                    if (fabs((double)w[nm]) + anorm == anorm)
+                        break;
+                }
+                if (flag)
+                {
+                    c = 0.0;
+                    s = 1.0;
+                    for (i = l; i <= k; i++)
+                    {
+                        f = s * rv1[i];
+                        if (fabs(f) + anorm != anorm)
+                        {
+                            g = (double)w[i];
+                            h = pythag(f, g);
+                            w[i] = (double)h;
+                            h = 1.0 / h;
+                            c = g * h;
+                            s = (- f * h);
+                            for (j = 0; j < numRows; j++)
+                            {
+                                y = (double)a[j][nm];
+                                z = (double)a[j][i];
+                                a[j][nm] = (double)(y * c + z * s);
+                                a[j][i] = (double)(z * c - y * s);
+                            }
+                        }
+                    }
+                }
+                z = (double)w[k];
+                if (l == k)
+                {                  /* convergence */
+                    if (z < 0.0)
+                    {              /* make singular value nonnegative */
+                        w[k] = (double)(-z);
+                        for (j = 0; j < numCols; j++)
+                            v[j][k] = (-v[j][k]);
+                    }
+                    break;
+                }
+                if (its >= 30) {
+                    m->mothurOut("No convergence after 30,000! iterations \n"); m->control_pressed = true;
+                    return(0);
+                }
+                
+                /* shift from bottom 2 x 2 minor */
+                x = (double)w[l];
+                nm = k - 1;
+                y = (double)w[nm];
+                g = rv1[nm];
+                h = rv1[k];
+                f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
+                g = pythag(f, 1.0);
+                f = ((x - z) * (x + z) + h * ((y / (f + SIGN(g, f))) - h)) / x;
+                
+                /* next QR transformation */
+                c = s = 1.0;
+                for (j = l; j <= nm; j++)
+                {
+                    i = j + 1;
+                    g = rv1[i];
+                    y = (double)w[i];
+                    h = s * g;
+                    g = c * g;
+                    z = pythag(f, h);
+                    rv1[j] = z;
+                    c = f / z;
+                    s = h / z;
+                    f = x * c + g * s;
+                    g = g * c - x * s;
+                    h = y * s;
+                    y = y * c;
+                    for (jj = 0; jj < numCols; jj++)
+                    {
+                        x = (double)v[jj][j];
+                        z = (double)v[jj][i];
+                        v[jj][j] = (float)(x * c + z * s);
+                        v[jj][i] = (float)(z * c - x * s);
+                    }
+                    z = pythag(f, h);
+                    w[j] = (float)z;
+                    if (z) 
+                    {
+                        z = 1.0 / z;
+                        c = f * z;
+                        s = h * z;
+                    }
+                    f = (c * g) + (s * y);
+                    x = (c * y) - (s * g);
+                    for (jj = 0; jj < numRows; jj++)
+                    {
+                        y = (double)a[jj][j];
+                        z = (double)a[jj][i];
+                        a[jj][j] = (double)(y * c + z * s);
+                        a[jj][i] = (double)(z * c - y * s);
+                    }
+                }
+                rv1[l] = 0.0;
+                rv1[k] = f;
+                w[k] = (double)x;
+            }
+        }
+        
+        return(0);
+        
+    }
+       catch(exception& e) {
+               m->errorOut(e, "LinearAlgebra", "svd");
+               exit(1);
+       }
+}
+
+
diff --git a/linearalgebra.h b/linearalgebra.h

index 3c86cb1de9fb697bac4aae04adc298c8994b4e83..7453f4ebd8b86ed20f530c2e1ba7989808361fb2 100644 (file)
--- a/linearalgebra.h
+++ b/linearalgebra.h
@@ -20,9 +20,12 @@ public:
         ~LinearAlgebra() {}
         
         vector<vector<double> > matrix_mult(vector<vector<double> >, vector<vector<double> >);
+    vector<vector<double> >transpose(vector<vector<double> >);
         void recenter(double, vector<vector<double> >, vector<vector<double> >&);
-       int tred2(vector<vector<double> >&, vector<double>&, vector<double>&);
+       //eigenvectors
+    int tred2(vector<vector<double> >&, vector<double>&, vector<double>&);
         int qtli(vector<double>&, vector<double>&, vector<vector<double> >&);
+    
         vector< vector<double> > calculateEuclidianDistance(vector<vector<double> >&, int); //pass in axes and number of dimensions
         vector< vector<double> > calculateEuclidianDistance(vector<vector<double> >&); //pass in axes
         vector<vector<double> > getObservedEuclideanDistance(vector<vector<double> >&);
@@ -44,7 +47,9 @@ public:
      vector<float> solveEquations(vector<vector<float> >, vector<float>);
      vector<vector<double> > getInverse(vector<vector<double> >);
      double choose(double, double);
-    
+    double normalvariate(double mu, double sigma);
+    vector< vector<double> > lda(vector< vector<double> >& a, vector<string> groups, vector< vector<double> >& means, bool&); //Linear discriminant analysis - a is [features][valuesFromGroups] groups indicates which group each sampling comes from. For example if groups = early, late, mid, early, early. a[0][0] = value for feature0 from groupEarly.
+    int svd(vector< vector<double> >& a, vector<double>& w, vector< vector<double> >& v); //Singular value decomposition
  private:
         MothurOut* m;
         
@@ -72,8 +77,6 @@ private:
      void ludcmp(vector<vector<float> >&, vector<int>&, float&);
      void lubksb(vector<vector<float> >&, vector<int>&, vector<float>&);
      
-    
-    
  };
  
  #endif
diff --git a/makebiomcommand.cpp b/makebiomcommand.cpp

index a19e2e39dd805a40daee5a1ca61c3f045b6899cc..a1018833e6268b6d0fe04aa25d4598f37df9cb4d 100644 (file)
--- a/makebiomcommand.cpp
+++ b/makebiomcommand.cpp
@@ -405,6 +405,7 @@ int MakeBiomCommand::getBiom(vector<SharedRAbundVector*>& lookup){
          out << "{\n" + spaces + "\"id\":\"" + sharedfile + "-" + lookup[0]->getLabel() + "\",\n" + spaces + "\"format\": \"Biological Observation Matrix 0.9.1\",\n" + spaces + "\"format_url\": \"http://biom-format.org\",\n";
          out << spaces + "\"type\": \"OTU table\",\n" + spaces + "\"generated_by\": \"" << mothurString << "\",\n" + spaces + "\"date\": \"" << dateString << "\",\n";
          
+        int numBins = lookup[0]->getNumBins();
          vector<string> metadata = getMetaData(lookup);  
          
          if (m->control_pressed) {  out.close(); return 0; }
@@ -420,11 +421,11 @@ int MakeBiomCommand::getBiom(vector<SharedRAbundVector*>& lookup){
          out << spaces + "\"rows\":[\n";
          string rowFront = spaces + spaces + "{\"id\":\"";
          string rowBack = "\", \"metadata\":";
-        for (int i = 0; i < m->currentBinLabels.size()-1; i++) {
+        for (int i = 0; i < numBins-1; i++) {
              if (m->control_pressed) {  out.close(); return 0; }
              out << rowFront << m->currentBinLabels[i] << rowBack << metadata[i] << "},\n";
          }
-        out << rowFront << m->currentBinLabels[(m->currentBinLabels.size()-1)] << rowBack << metadata[(m->currentBinLabels.size()-1)] << "}\n" + spaces + "],\n";
+        out << rowFront << m->currentBinLabels[(numBins-1)] << rowBack << metadata[(numBins-1)] << "}\n" + spaces + "],\n";
          
          //get column info
          /*"columns": [
@@ -624,8 +625,8 @@ vector<string> MakeBiomCommand::getMetaData(vector<SharedRAbundVector*>& lookup)
  //**********************************************************************************************************************
  int MakeBiomCommand::getSampleMetaData(vector<SharedRAbundVector*>& lookup){
         try {
-        
-        if (metadatafile == "") { for (int i = 0; i < lookup.size(); i++) {  sampleMetadata.push_back("null");  } }
+        sampleMetadata.clear();
+        if (metadatafile == "") {  for (int i = 0; i < lookup.size(); i++) {  sampleMetadata.push_back("null");  } }
          else {
              ifstream in;
              m->openInputFile(metadatafile, in);
diff --git a/makecontigscommand.cpp b/makecontigscommand.cpp

index f888fbe4928a9526d34e9b8f25200fb32a329f6f..0bf56e7c2166daf33816de12216d318a5fba466b 100644 (file)
--- a/makecontigscommand.cpp
+++ b/makecontigscommand.cpp
@@ -19,6 +19,8 @@ vector<string> MakeContigsCommand::setParameters(){
          CommandParameter prqual("rqfile", "InputTypes", "", "", "none", "none", "qfileGroup","",false,false,true); parameters.push_back(prqual);
          CommandParameter pfile("file", "InputTypes", "", "", "FastaFastqFile", "FastaFastqFile", "none","fasta-qfile",false,false,true); parameters.push_back(pfile);
          CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none","group",false,false,true); parameters.push_back(poligos);
+        CommandParameter pfindex("findex", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(pfindex);
+        CommandParameter prindex("rindex", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(prindex);
                 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(ppdiffs);
                 CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(pbdiffs);
          CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(ptdiffs);
@@ -50,15 +52,17 @@ vector<string> MakeContigsCommand::setParameters(){
  string MakeContigsCommand::getHelpString(){    
         try {
                 string helpString = "";
-               helpString += "The make.contigs command reads a file, forward fastq file and a reverse fastq file or forward fasta and reverse fasta files and outputs new fasta.  It will also provide new quality files if the fastq or file parameter is used.\n";
+               helpString += "The make.contigs command reads a file, forward fastq file and a reverse fastq file or forward fasta and reverse fasta files and outputs new fasta. \n";
          helpString += "If an oligos file is provided barcodes and primers will be trimmed, and a group file will be created.\n";
-               helpString += "The make.contigs command parameters are file, ffastq, rfastq, ffasta, rfasta, fqfile, rqfile, oligos, format, tdiffs, bdiffs, pdiffs, align, match, mismatch, gapopen, gapextend, insert, deltaq, allfiles and processors.\n";
+        helpString += "If a forward index or reverse index file is provided barcodes be trimmed, and a group file will be created. The oligos parameter is required if an index file is given.\n";
+               helpString += "The make.contigs command parameters are file, ffastq, rfastq, ffasta, rfasta, fqfile, rqfile, oligos, findex, rindex, format, tdiffs, bdiffs, pdiffs, align, match, mismatch, gapopen, gapextend, insert, deltaq, allfiles and processors.\n";
                 helpString += "The ffastq and rfastq, file, or ffasta and rfasta parameters are required.\n";
-        helpString += "The file parameter is 2 or 3 column file containing the forward fastq files in the first column and their matching reverse fastq files in the second column, or a groupName then forward fastq file and reverse fastq file.  Mothur will process each pair and create a combined fasta and report file with all the sequences.\n";
+        helpString += "The file parameter is 2, 3 or 4 column file containing the forward fastq files in the first column and their matching reverse fastq files in the second column, or a groupName then forward fastq file and reverse fastq file, or forward fastq file then reverse fastq then forward index and reverse index file.  If you only have one index file add 'none' for the other one.  Mothur will process each pair and create a combined fasta and report file with all the sequences.\n";
          helpString += "The ffastq and rfastq parameters are used to provide a forward fastq and reverse fastq file to process.  If you provide one, you must provide the other.\n";
          helpString += "The ffasta and rfasta parameters are used to provide a forward fasta and reverse fasta file to process.  If you provide one, you must provide the other.\n";
          helpString += "The fqfile and rqfile parameters are used to provide a forward quality and reverse quality files to process with the ffasta and rfasta parameters.  If you provide one, you must provide the other.\n";
                 helpString += "The format parameter is used to indicate whether your sequences are sanger, solexa, illumina1.8+ or illumina, default=illumina1.8+.\n";
+        helpString += "The findex and rindex parameters are used to provide a forward index and reverse index files to process.  \n";
          helpString += "The align parameter allows you to specify the alignment method to use.  Your options are: gotoh and needleman. The default is needleman.\n";
          helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n";
                 helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
@@ -216,6 +220,22 @@ MakeContigsCommand::MakeContigsCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["oligos"] = inputDir + it->second;           }
                                 }
+                
+                it = parameters.find("findex");
+                               //user has given a template file
+                               if(it != parameters.end()){
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["findex"] = inputDir + it->second;           }
+                               }
+                
+                it = parameters.find("rindex");
+                               //user has given a template file
+                               if(it != parameters.end()){
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["rindex"] = inputDir + it->second;           }
+                               }
              }
              
              ffastqfile = validParameter.validFile(parameters, "ffastq", true);
@@ -264,6 +284,29 @@ MakeContigsCommand::MakeContigsCommand(string option)  {
                         else if(oligosfile == "not open")   {   abort = true;       } 
                         else {   m->setOligosFile(oligosfile);          }
              
+            findexfile = validParameter.validFile(parameters, "findex", true);
+                       if (findexfile == "not found")      {   findexfile = "";        }
+                       else if(findexfile == "not open")   {   abort = true;       }
+            
+            rindexfile = validParameter.validFile(parameters, "rindex", true);
+                       if (rindexfile == "not found")      {   rindexfile = "";        }
+                       else if(rindexfile == "not open")   {   abort = true;       }
+        
+            if ((rindexfile != "") || (findexfile != "")) {
+                               if (oligosfile == ""){
+                                       oligosfile = m->getOligosFile();
+                                       if (oligosfile != "") {  m->mothurOut("Using " + oligosfile + " as input file for the oligos parameter.\n");  }
+                                       else {
+                        m->mothurOut("You need to provide an oligos file if you are going to use an index file.\n"); abort = true;
+                                       }
+                               }
+                
+                //can only use an index file with the fastq parameters not fasta and qual
+                if ((ffastafile != "") || (rfastafile != "")) {
+                    m->mothurOut("[ERROR]: You can only use an index file with the fastq parameters or the file option.\n"); abort = true;
+                }
+                       }
+            
              //if the user changes the output directory command factory will send this info to us in the output parameter 
                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
                   outputDir = ""; 
@@ -421,7 +464,6 @@ int MakeContigsCommand::execute(){
                          
              m->mothurOut("Making contigs...\n"); 
              createProcesses(filesToProcess[l], outFastaFile, outScrapFastaFile, outMisMatchFile, fastaFileNames, l);
-             m->mothurOut("Here...\n"); 
              
              //remove temp fasta and qual files
              for (int i = 0; i < processors; i++) { for(int j = 0; j < filesToProcess[l][i].size(); j++) { m->mothurRemove(filesToProcess[l][i][j]); }  }
@@ -572,7 +614,7 @@ vector< vector< vector<string> > > MakeContigsCommand::preProcessData(unsigned l
          vector< vector< vector<string> > > filesToProcess;
          
          if (ffastqfile != "") { //reading one file
-            vector< vector<string> > files = readFastqFiles(numReads, ffastqfile, rfastqfile); 
+            vector< vector<string> > files = readFastqFiles(numReads, ffastqfile, rfastqfile, findexfile, rindexfile);
              //adjust for really large processors or really small files
              if (numReads == 0) {  m->mothurOut("[ERROR]: no good reads.\n"); m->control_pressed = true; }
              if (numReads < processors) { 
@@ -593,12 +635,20 @@ vector< vector< vector<string> > > MakeContigsCommand::preProcessData(unsigned l
                      if (m->control_pressed) { for (int l = 0; l < filesToProcess.size(); l++) { for (int k = 0; k < filesToProcess[l].size(); k++) { for(int j = 0; j < filesToProcess[l][k].size(); j++) { m->mothurRemove(filesToProcess[l][k][j]); } filesToProcess[l][k].clear(); } return filesToProcess; } }
                      
                      unsigned long int thisFilesReads;
-                    vector< vector<string> > files = readFastqFiles(thisFilesReads, filePairsToProcess[i][0], filePairsToProcess[i][1]); 
+                    vector< vector<string> > files = readFastqFiles(thisFilesReads, filePairsToProcess[i][0], filePairsToProcess[i][1], filePairsToProcess[i][2], filePairsToProcess[i][3]);
                      
                      //adjust for really large processors or really small files
                      if (thisFilesReads < processors) { 
                          m->mothurOut("[ERROR]: " + filePairsToProcess[i][0] + " has less than " + toString(processors) + " good reads, skipping\n"); 
                          for (int k = 0; k < files.size(); k++) { for(int j = 0; j < files[k].size(); j++) { m->mothurRemove(files[k][j]); } files[k].clear(); }
+                        //remove from file2Group if necassary
+                        map<int, string> cFile2Group;
+                        for (map<int, string>::iterator it = file2Group.begin(); it != file2Group.end(); it++) {
+                            if ((it->first) < i) { cFile2Group[it->first] = it->second; }
+                            else if ((it->first) == i) { } //do nothing, we removed files for i
+                            else { cFile2Group[(it->first-1)] = it->second; } //adjust files because i was removed
+                        }
+                        file2Group = cFile2Group;
                      }else {
                          filesToProcess.push_back(files);
                          numReads += thisFilesReads;
@@ -870,10 +920,12 @@ int MakeContigsCommand::driver(vector<string> files, string outputFasta, string
          string thisfqualfile = files[1];
          string thisrfastafile = files[2];
          string thisrqualfile = files[3];
+        string thisfindexfile = files[4];
+        string thisrindexfile = files[5];
          
-        if (m->debug) {  m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n"); }
+        if (m->debug) {  m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n[DEBUG]: findex = " + thisfindexfile + ".\n[DEBUG]: rindex = " + thisrindexfile + ".\n"); }
          
-        ifstream inFFasta, inRFasta, inFQual, inRQual;
+        ifstream inFFasta, inRFasta, inFQual, inRQual, inFIndex, inRIndex;
          ofstream outFasta, outMisMatch, outScrapFasta;
          m->openInputFile(thisffastafile, inFFasta);
          m->openInputFile(thisrfastafile, inRFasta);
@@ -881,6 +933,10 @@ int MakeContigsCommand::driver(vector<string> files, string outputFasta, string
              m->openInputFile(thisfqualfile, inFQual);
              m->openInputFile(thisrqualfile, inRQual);
          }
+        
+        if (thisfindexfile != "") { m->openInputFile(thisfindexfile, inFIndex);  }
+        if (thisrindexfile != "") { m->openInputFile(thisrindexfile, inRIndex);  }
+        
          m->openOutputFile(outputFasta, outFasta);
          m->openOutputFile(outputScrapFasta, outScrapFasta);
          m->openOutputFile(outputMisMatches, outMisMatch);
@@ -904,13 +960,27 @@ int MakeContigsCommand::driver(vector<string> files, string outputFasta, string
                  fQual = new QualityScores(inFQual); m->gobble(inFQual);
                  rQual = new QualityScores(inRQual); m->gobble(inRQual);
              }
+            Sequence findexBarcode("findex", "NONE");  Sequence rindexBarcode("rindex", "NONE");
+            if (thisfindexfile != "") {
+                Sequence temp(inFIndex); m->gobble(inFIndex);
+                findexBarcode.setAligned(temp.getAligned());
+            }
+            
+            if (thisrindexfile != "") {
+                Sequence temp(inRIndex); m->gobble(inRIndex);
+                rindexBarcode.setAligned(temp.getAligned());
+            }
              
              int barcodeIndex = 0;
              int primerIndex = 0;
-            
+                        
              if(barcodes.size() != 0){
                  if (thisfqualfile != "") {
-                    success = trimOligos.stripBarcode(fSeq, rSeq, *fQual, *rQual, barcodeIndex);
+                    if ((thisfindexfile != "") || (thisrindexfile != "")) {
+                        success = trimOligos.stripBarcode(findexBarcode, rindexBarcode, *fQual, *rQual, barcodeIndex);
+                    }else {
+                        success = trimOligos.stripBarcode(fSeq, rSeq, *fQual, *rQual, barcodeIndex);
+                    }
                  }else {
                      success = trimOligos.stripBarcode(fSeq, rSeq, barcodeIndex);
                  }
@@ -1097,11 +1167,11 @@ int MakeContigsCommand::driver(vector<string> files, string outputFasta, string
         }
  }
  //**********************************************************************************************************************
-vector< vector<string> > MakeContigsCommand::readFastqFiles(unsigned long int& count, string ffastq, string rfastq){
+vector< vector<string> > MakeContigsCommand::readFastqFiles(unsigned long int& count, string ffastq, string rfastq, string findex, string rindex){
      try {
          vector< vector<string> > files;
          //maps processors number to file pointer
-        map<int, vector<ofstream*> > tempfiles;  //tempfiles[0] = forwardFasta, [1] = forwardQual, [2] = reverseFasta, [3] = reverseQual
+        map<int, vector<ofstream*> > tempfiles;  //tempfiles[0] = forwardFasta, [1] = forwardQual, [2] = reverseFasta, [3] = reverseQual, tempfiles[4] = forwardIndex, [4] = forwardReverse
          map<int, vector<ofstream*> >::iterator it;
          
          //create files to write to
@@ -1111,6 +1181,8 @@ vector< vector<string> > MakeContigsCommand::readFastqFiles(unsigned long int& c
              ofstream* outFQ = new ofstream;     temp.push_back(outFQ);
              ofstream* outRF = new ofstream;     temp.push_back(outRF);
              ofstream* outRQ = new ofstream;     temp.push_back(outRQ);
+            ofstream* outFI = new ofstream;     temp.push_back(outFI);  
+            ofstream* outRI = new ofstream;     temp.push_back(outRI);  
              tempfiles[i] = temp;
              
              vector<string> names;
@@ -1120,8 +1192,13 @@ vector< vector<string> > MakeContigsCommand::readFastqFiles(unsigned long int& c
              string rfastafilename = thisOutputDir + m->getRootName(m->getSimpleName(rfastq)) + toString(i) + "rfastatemp";
              string fqualfilename = thisOutputDir + m->getRootName(m->getSimpleName(ffastq)) + toString(i) + "fqualtemp";
              string rqualfilename = thisOutputDir + m->getRootName(m->getSimpleName(rfastq)) + toString(i) + "rqualtemp";
+            string findexfilename = ""; string rindexfilename = "";
+            noneOk = false; //flag to oligos file read that its okay to allow for non paired barcodes
+            if (findex != "") {  findexfilename = thisOutputDir + m->getRootName(m->getSimpleName(findex)) + toString(i) + "findextemp"; m->openOutputFile(findexfilename, *outFI);  noneOk = true; }
+            if (rindex != "") {  rindexfilename = thisOutputDir + m->getRootName(m->getSimpleName(rindex)) + toString(i) + "rindextemp";  m->openOutputFile(rindexfilename, *outRI); noneOk = true; }
              names.push_back(ffastafilename); names.push_back(fqualfilename);
              names.push_back(rfastafilename); names.push_back(rqualfilename);
+            names.push_back(findexfilename); names.push_back(rindexfilename);
              files.push_back(names);
              
              m->openOutputFile(ffastafilename, *outFF);
@@ -1135,7 +1212,7 @@ vector< vector<string> > MakeContigsCommand::readFastqFiles(unsigned long int& c
              for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close();  delete (it->second)[i]; } }
              //remove files
              for (int i = 0; i < files.size(); i++) {  
-                for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); }
+                for(int j = 0; j < files[i].size(); j++) { if (files[i][j] != "") {  m->mothurRemove(files[i][j]); }  }
              }
          }
          
@@ -1145,31 +1222,52 @@ vector< vector<string> > MakeContigsCommand::readFastqFiles(unsigned long int& c
          ifstream inReverse;
          m->openInputFile(rfastq, inReverse);
          
+        ifstream infIndex, inrIndex;
+        bool findexIsGood = false;
+        bool rindexIsGood = false;
+        if (findex != "") { m->openInputFile(findex, infIndex); findexIsGood = true; }
+        if (rindex != "") { m->openInputFile(rindex, inrIndex); rindexIsGood = true; }
+        
          count = 0;
          map<string, fastqRead> uniques;
+        map<string, fastqRead> iUniques;
+        map<string, pairFastqRead> pairUniques;
          map<string, fastqRead>::iterator itUniques;
-        while ((!inForward.eof()) || (!inReverse.eof())) {
+        while ((!inForward.eof()) || (!inReverse.eof()) || (findexIsGood) || (rindexIsGood)) {
              
-            if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close();  delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) {  for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } } inForward.close(); inReverse.close(); return files; }
+            if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close();  delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) {  for(int j = 0; j < files[i].size(); j++) { if (files[i][j] != "") { m->mothurRemove(files[i][j]); } } } inForward.close(); inReverse.close(); if (findex != "") { infIndex.close(); } if (findex != "") { inrIndex.close(); } return files; }
              
              //get a read from forward and reverse fastq files
-            bool ignoref, ignorer;
-            fastqRead thisFread, thisRread;
+            bool ignoref, ignorer, ignorefi, ignoreri;
+            fastqRead thisFread, thisRread, thisFIread, thisRIread;
              if (!inForward.eof()) {  thisFread = readFastq(inForward, ignoref); }
              else { ignoref = true; }
              if (!inReverse.eof()) { thisRread = readFastq(inReverse, ignorer);  }
              else { ignorer = true; }
+            if (findexIsGood) { thisFIread = readFastq(infIndex, ignorefi); if (infIndex.eof()) { findexIsGood = false; } }
+            else { ignorefi = true; }
+            if (rindexIsGood) { thisRIread = readFastq(inrIndex, ignoreri);  if (inrIndex.eof()) { rindexIsGood = false; } }
+            else { ignoreri = true; }
+            
+            bool allowOne = false;
+            if ((findex == "") || (rindex == "")) { allowOne = true; }
+            vector<pairFastqRead> frReads = getReads(ignoref, ignorer, thisFread, thisRread, uniques, false);
+            vector<pairFastqRead> friReads = getReads(ignorefi, ignoreri, thisFIread, thisRIread, iUniques, allowOne);
+            
+            //add in index info if provided
+            vector<pairFastqRead> reads = frReads;
+            if ((findex != "") || (rindex != "")) {  reads = mergeReads(frReads, friReads, pairUniques); }
              
-            vector<pairFastqRead> reads = getReads(ignoref, ignorer, thisFread, thisRread, uniques);
-           
              for (int i = 0; i < reads.size(); i++) {
                  fastqRead fread = reads[i].forward;
                  fastqRead rread = reads[i].reverse;
+                fastqRead firead = reads[i].findex;
+                fastqRead riread = reads[i].rindex;
                  
-                if (m->debug) { m->mothurOut(toString(count) + '\t' + fread.name + '\t' + rread.name + '\n'); }
+                if (m->debug) { m->mothurOut(toString(count) + '\t' + fread.name + '\t' + rread.name + '\n'); if (findex != "") { m->mothurOut(toString(count) + '\t' + firead.name + '\n'); } if (rindex != "") { m->mothurOut(toString(count) + '\t' + riread.name + '\n'); } }
                 
                  //if (checkReads(fread, rread, ffastq, rfastq)) {
-                    if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close();  delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) {  for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } } inForward.close(); inReverse.close(); return files; }
+                    if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close();  delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) {  for(int j = 0; j < files[i].size(); j++) { if (files[i][j] != "") { m->mothurRemove(files[i][j]); } } } inForward.close(); inReverse.close(); if (findex != "") { infIndex.close(); } if (findex != "") { inrIndex.close(); } return files; }
                      
                      //if the reads are okay write to output files
                      int process = count % processors;
@@ -1182,7 +1280,9 @@ vector< vector<string> > MakeContigsCommand::readFastqFiles(unsigned long int& c
                      *(tempfiles[process][3]) << ">" << rread.name << endl;
                      for (int i = 0; i < rread.scores.size(); i++) { *(tempfiles[process][3]) << rread.scores[i] << " "; }
                      *(tempfiles[process][3]) << endl;
-                    
+                    if (findex != "") {  *(tempfiles[process][4]) << ">" << firead.name << endl << firead.sequence << endl; }
+                    if (rindex != "") {  *(tempfiles[process][5]) << ">" << riread.name << endl << riread.sequence << endl; }
+                
                      count++;
                      
                      //report progress
@@ -1197,6 +1297,9 @@ vector< vector<string> > MakeContigsCommand::readFastqFiles(unsigned long int& c
              for (itUniques = uniques.begin(); itUniques != uniques.end(); itUniques++) {
                  m->mothurOut("[WARNING]: did not find paired read for " + itUniques->first + ", ignoring.\n");
              }
+            for (map<string, pairFastqRead>:: iterator it = pairUniques.begin(); it != pairUniques.end(); it++) {
+                m->mothurOut("[WARNING]: did not find paired read for " + (it->first).substr(1) + ", ignoring.\n");
+            }
              m->mothurOutEndLine();
          }
          
@@ -1204,7 +1307,9 @@ vector< vector<string> > MakeContigsCommand::readFastqFiles(unsigned long int& c
          for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close();  delete (it->second)[i]; } }
          inForward.close();
          inReverse.close();
-        
+        if (findex != "") { infIndex.close(); }
+        if (rindex != "") { inrIndex.close(); }
+       
          return files;
      }
      catch(exception& e) {
@@ -1238,8 +1343,10 @@ vector< vector<string> > MakeContigsCommand::readFastaFiles(unsigned long int& c
              if (fqualfile != "") { fqualfilename = thisOutputDir + m->getRootName(m->getSimpleName(fqualfile)) + toString(i) + "fqual.temp";  m->openOutputFile(fqualfilename, *outFQ); }
              string rqualfilename = "";
              if (rqualfile != "") { rqualfilename = thisOutputDir + m->getRootName(m->getSimpleName(rqualfile)) + toString(i) + "rqual.temp"; m->openOutputFile(rqualfilename, *outRQ); }
+            string findexfilename = ""; string rindexfilename = "";
              names.push_back(ffastafilename); names.push_back(fqualfilename);
              names.push_back(rfastafilename); names.push_back(rqualfilename);
+            names.push_back(findexfilename); names.push_back(rindexfilename);
              files.push_back(names);
              
              m->openOutputFile(ffastafilename, *outFF);
@@ -1303,7 +1410,7 @@ vector< vector<string> > MakeContigsCommand::readFastaFiles(unsigned long int& c
                  }else { ignorer = true; }
              }
              
-            vector<pairFastqRead> reads = getReads(ignoref, ignorer, thisFread, thisRread, uniques);
+            vector<pairFastqRead> reads = getReads(ignoref, ignorer, thisFread, thisRread, uniques, false);
              
              for (int i = 0; i < reads.size(); i++) {
                  fastqRead fread = reads[i].forward;
@@ -1358,7 +1465,7 @@ vector< vector<string> > MakeContigsCommand::readFastaFiles(unsigned long int& c
      }
  }
  //**********************************************************************************************************************
-vector<pairFastqRead> MakeContigsCommand::getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map<string, fastqRead>& uniques){
+vector<pairFastqRead> MakeContigsCommand::getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map<string, fastqRead>& uniques, bool allowOne){
      try {
          vector<pairFastqRead> reads;
          map<string, fastqRead>::iterator itUniques;
@@ -1406,25 +1513,36 @@ vector<pairFastqRead> MakeContigsCommand::getReads(bool ignoref, bool ignorer, f
                                  
              }
          }else if (!ignoref && ignorer) { //ignore reverse keep forward
-            //look for forward pair
-            itUniques = uniques.find(forward.name);
-            if (itUniques != uniques.end()) {  //we have the pair for this read
-                pairFastqRead temp(forward, itUniques->second);
+            if (allowOne) {
+                fastqRead dummy;
+                pairFastqRead temp(forward, dummy);
                  reads.push_back(temp);
-                uniques.erase(itUniques);
-            }else { //save this read for later
-                uniques[forward.name] = forward;
+            }else {
+                //look for forward pair
+                itUniques = uniques.find(forward.name);
+                if (itUniques != uniques.end()) {  //we have the pair for this read
+                    pairFastqRead temp(forward, itUniques->second);
+                    reads.push_back(temp);
+                    uniques.erase(itUniques);
+                }else { //save this read for later
+                    uniques[forward.name] = forward;
+                }
              }
-
          }else if (ignoref && !ignorer) { //ignore forward keep reverse
-            //look for reverse pair
-            itUniques = uniques.find(reverse.name);
-            if (itUniques != uniques.end()) {  //we have the pair for this read
-                pairFastqRead temp(itUniques->second, reverse);
+            if (allowOne) {
+                fastqRead dummy;
+                pairFastqRead temp(dummy, reverse);
                  reads.push_back(temp);
-                uniques.erase(itUniques);
-            }else { //save this read for later
-                uniques[reverse.name] = reverse;
+            }else {
+                //look for reverse pair
+                itUniques = uniques.find(reverse.name);
+                if (itUniques != uniques.end()) {  //we have the pair for this read
+                    pairFastqRead temp(itUniques->second, reverse);
+                    reads.push_back(temp);
+                    uniques.erase(itUniques);
+                }else { //save this read for later
+                    uniques[reverse.name] = reverse;
+                }
              }
          }//else ignore both and do nothing
          
@@ -1436,6 +1554,73 @@ vector<pairFastqRead> MakeContigsCommand::getReads(bool ignoref, bool ignorer, f
      }
  }
  //**********************************************************************************************************************
+//look through the reads from the forward and reverse files and try to find matching reads from index files.
+vector<pairFastqRead> MakeContigsCommand::mergeReads(vector<pairFastqRead> thisReads, vector<pairFastqRead> indexes, map<string, pairFastqRead>& uniques){
+    try {
+        vector<pairFastqRead> reads;
+        map<string, pairFastqRead>::iterator itUniques;
+        
+        set<int> foundIndexes;
+        for (int i = 0; i < thisReads.size(); i++) {
+            bool found = false;
+            for (int j = 0; j < indexes.size(); j++) {
+                
+                //incase only one index
+                string indexName = indexes[j].forward.name;
+                if (indexName == "") { indexName = indexes[j].reverse.name; }
+                
+                if (thisReads[i].forward.name == indexName){
+                    thisReads[i].findex = indexes[j].forward;
+                    thisReads[i].rindex = indexes[j].reverse;
+                    reads.push_back(thisReads[i]);
+                    found = true;
+                    foundIndexes.insert(j);
+                }
+            }
+            
+            if (!found) {
+                //look for forward pair
+                itUniques = uniques.find('i'+thisReads[i].forward.name);
+                if (itUniques != uniques.end()) {  //we have the pair for this read
+                    thisReads[i].findex = itUniques->second.forward;
+                    thisReads[i].rindex = itUniques->second.reverse;
+                    reads.push_back(thisReads[i]);
+                    uniques.erase(itUniques);
+                }else { //save this read for later
+                    uniques['r'+thisReads[i].forward.name] = thisReads[i];
+                }
+            }
+        }
+        
+        if (foundIndexes.size() != indexes.size()) { //if we didnt match all the indexes look for them in uniques
+            for (int j = 0; j < indexes.size(); j++) {
+                if (foundIndexes.count(j) == 0) { //we didnt find this one
+                    //incase only one index
+                    string indexName = indexes[j].forward.name;
+                    if (indexName == "") { indexName = indexes[j].reverse.name; }
+                    
+                    //look for forward pair
+                    itUniques = uniques.find('r'+indexName);
+                    if (itUniques != uniques.end()) {  //we have the pair for this read
+                        pairFastqRead temp(itUniques->second.forward, itUniques->second.reverse, indexes[j].forward, indexes[j].reverse);
+                        reads.push_back(temp);
+                        uniques.erase(itUniques);
+                    }else { //save this read for later
+                        uniques['i'+indexName] = indexes[j];
+                    }
+                }
+            }
+        }
+       
+        
+        return reads;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "MakeContigsCommand", "mergeReads");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
  fastqRead MakeContigsCommand::readFastq(ifstream& in, bool& ignore){
      try {
          fastqRead read;
@@ -1475,6 +1660,8 @@ fastqRead MakeContigsCommand::readFastq(ifstream& in, bool& ignore){
          read.name = name;
          read.sequence = sequence;
          read.scores = qualScores;
+        
+        if (m->debug) { m->mothurOut("[DEBUG]: " + read.name + " " + read.sequence + " " + quality + "\n"); }
  
          return read;
      }
@@ -1508,10 +1695,16 @@ bool MakeContigsCommand::checkReads(fastqRead& forward, fastqRead& reverse, stri
      }
  }*/
  //***************************************************************************************************************
+//lines can be 2, 3, or 4 columns
+// forward.fastq reverse.fastq -> 2 column
+// groupName forward.fastq reverse.fastq -> 3 column
+// forward.fastq reverse.fastq forward.index.fastq  reverse.index.fastq  -> 4 column
+// forward.fastq reverse.fastq none  reverse.index.fastq  -> 4 column
+// forward.fastq reverse.fastq forward.index.fastq  none  -> 4 column
  vector< vector<string> > MakeContigsCommand::readFileNames(string filename){
         try {
          vector< vector<string> > files;
-        string forward, reverse;
+        string forward, reverse, findex, rindex;
          
          ifstream in;
          m->openInputFile(filename, in);
@@ -1520,28 +1713,35 @@ vector< vector<string> > MakeContigsCommand::readFileNames(string filename){
              
              if (m->control_pressed) { return files; }
              
-            in >> forward; m->gobble(in);
-            in >> reverse;
+            string line = m->getline(in);  m->gobble(in);
+            vector<string> pieces = m->splitWhiteSpace(line);
              
              string group = "";
-            while (!in.eof())  { //do we have a group assigned to this pair
-                char c = in.get();
-                if (c == 10 || c == 13 || c == -1){    break;  }
-                else if (c == 32 || c == 9){;} //space or tab
-                else {         group += c;  }
-            }
-            
-            if (group != "") {
-                //line in file look like: group forward reverse
-                string temp = forward;
-                forward = reverse;
-                reverse = group;
-                group = temp;
+            if (pieces.size() == 2) {
+                forward = pieces[0];
+                reverse = pieces[1];
+                group = "";
+                findex = "";
+                rindex = "";
+            }else if (pieces.size() == 3) {
+                group = pieces[0];
+                forward = pieces[1];
+                reverse = pieces[2];
+                findex = "";
+                rindex = "";
                  createFileGroup = true;
+            }else if (pieces.size() == 4) {
+                forward = pieces[0];
+                reverse = pieces[1];
+                findex = pieces[2];
+                rindex = pieces[3];
+                if ((findex == "none") || (findex == "NONE")){ findex = ""; }
+                if ((rindex == "none") || (rindex == "NONE")){ rindex = ""; }
+            }else {
+                m->mothurOut("[ERROR]: file lines can be 2, 3, or 4 columns. The forward fastq files in the first column and their matching reverse fastq files in the second column, or a groupName then forward fastq file and reverse fastq file, or forward fastq file then reverse fastq then forward index and reverse index file.  If you only have one index file add 'none' for the other one. \n"); m->control_pressed = true;
              }
-            m->gobble(in);
              
-            if (m->debug) { m->mothurOut("[DEBUG]: group = " + group + ", forward = " + forward + ", reverse = " + reverse + ".\n"); }
+            if (m->debug) { m->mothurOut("[DEBUG]: group = " + group + ", forward = " + forward + ", reverse = " + reverse + ", forwardIndex = " + findex + ", reverseIndex = " + rindex + ".\n"); }
              
              //check to make sure both are able to be opened
              ifstream in2;
@@ -1606,11 +1806,83 @@ vector< vector<string> > MakeContigsCommand::readFileNames(string filename){
                  m->mothurOut("[WARNING]: can't find " + reverse + ", ignoring pair.\n"); 
              }else{  in3.close();  }
              
-            if ((openForward != 1) && (openReverse != 1)) { //good pair
+            int openFindex = 0;
+            if (findex != "") {
+                ifstream in4;
+                openFindex = m->openInputFile(findex, in4, "noerror"); in4.close();
+                
+                //if you can't open it, try default location
+                if (openFindex == 1) {
+                    if (m->getDefaultPath() != "") { //default path is set
+                        string tryPath = m->getDefaultPath() + m->getSimpleName(findex);
+                        m->mothurOut("Unable to open " + findex + ". Trying default " + tryPath); m->mothurOutEndLine();
+                        ifstream in5;
+                        openFindex = m->openInputFile(tryPath, in5, "noerror");
+                        in5.close();
+                        findex = tryPath;
+                    }
+                }
+                
+                //if you can't open it, try output location
+                if (openFindex == 1) {
+                    if (m->getOutputDir() != "") { //default path is set
+                        string tryPath = m->getOutputDir() + m->getSimpleName(findex);
+                        m->mothurOut("Unable to open " + findex + ". Trying output directory " + tryPath); m->mothurOutEndLine();
+                        ifstream in6;
+                        openFindex = m->openInputFile(tryPath, in6, "noerror");
+                        findex = tryPath;
+                        in6.close();
+                    }
+                }
+                
+                if (openFindex == 1) { //can't find it
+                    m->mothurOut("[WARNING]: can't find " + findex + ", ignoring pair.\n");
+                }
+            }
+            
+            int openRindex = 0;
+            if (rindex != "") {
+                ifstream in7;
+                openRindex = m->openInputFile(rindex, in7, "noerror"); in7.close();
+                
+                //if you can't open it, try default location
+                if (openRindex == 1) {
+                    if (m->getDefaultPath() != "") { //default path is set
+                        string tryPath = m->getDefaultPath() + m->getSimpleName(rindex);
+                        m->mothurOut("Unable to open " + rindex + ". Trying default " + tryPath); m->mothurOutEndLine();
+                        ifstream in8;
+                        openRindex = m->openInputFile(tryPath, in8, "noerror");
+                        in8.close();
+                        rindex = tryPath;
+                    }
+                }
+                
+                //if you can't open it, try output location
+                if (openRindex == 1) {
+                    if (m->getOutputDir() != "") { //default path is set
+                        string tryPath = m->getOutputDir() + m->getSimpleName(rindex);
+                        m->mothurOut("Unable to open " + rindex + ". Trying output directory " + tryPath); m->mothurOutEndLine();
+                        ifstream in9;
+                        openRindex = m->openInputFile(tryPath, in9, "noerror");
+                        rindex = tryPath;
+                        in9.close();
+                    }
+                }
+                
+                if (openRindex == 1) { //can't find it
+                    m->mothurOut("[WARNING]: can't find " + rindex + ", ignoring pair.\n");
+                }
+            }
+
+            
+            if ((openForward != 1) && (openReverse != 1) && (openFindex != 1) && (openRindex != 1)) { //good pair
                  file2Group[files.size()] = group;
                  vector<string> pair;
                  pair.push_back(forward);
                  pair.push_back(reverse);
+                pair.push_back(findex);
+                pair.push_back(rindex);
+                if (((findex != "") || (rindex != "")) && (oligosfile == "")) { m->mothurOut("[ERROR]: You need to provide an oligos file if you are going to use an index file.\n"); m->control_pressed = true;  }
                  files.push_back(pair);
              }
          }
@@ -1619,7 +1891,7 @@ vector< vector<string> > MakeContigsCommand::readFileNames(string filename){
          return files;
      }
      catch(exception& e) {
-        m->errorOut(e, "MakeContigsCommand", "checkReads");
+        m->errorOut(e, "MakeContigsCommand", "readFileNames");
          exit(1);
      }
  }
@@ -1692,7 +1964,7 @@ bool MakeContigsCommand::getOligos(vector<vector<string> >& fastaFileNames, stri
                      oligosPair newPrimer(foligo, roligo);
                      
                      if (m->debug) { m->mothurOut("[DEBUG]: primer pair " + newPrimer.forward + " " + newPrimer.reverse + ", and group = " + group + ".\n"); }
-                                       
+                    
                                         //check for repeat barcodes
                      string tempPair = foligo+roligo;
                      if (uniquePrimers.count(tempPair) != 0) { m->mothurOut("primer pair " + newPrimer.forward + " " + newPrimer.reverse + " is in your oligos file already."); m->mothurOutEndLine();  }
@@ -1715,6 +1987,8 @@ bool MakeContigsCommand::getOligos(vector<vector<string> >& fastaFileNames, stri
                      
                      oligosPair newPair(foligo, roligo);
                      
+                    if ((foligo == "NONE") || (roligo == "NONE")) { if (!noneOk) { m->mothurOut("[ERROR]: barcodes must be paired unless you are using an index file.\n"); m->control_pressed = true; } }
+                    
                      group = "";
                      while (!in.eof())  {       
                                                 char c = in.get(); 
diff --git a/makecontigscommand.h b/makecontigscommand.h

index b45501e0da726b182eccd0a01d9fba6086ee9bb8..1ea38358c7ec4c56e2d1adb926f7767301daef9d 100644 (file)
--- a/makecontigscommand.h
+++ b/makecontigscommand.h
@@ -32,9 +32,12 @@ struct fastqRead {
  struct pairFastqRead {
         fastqRead forward;
      fastqRead reverse;
+    fastqRead findex;
+    fastqRead rindex;
         
         pairFastqRead() {};
         pairFastqRead(fastqRead f, fastqRead r) : forward(f), reverse(r){};
+    pairFastqRead(fastqRead f, fastqRead r, fastqRead fi, fastqRead ri) : forward(f), reverse(r), findex(fi), rindex(ri) {};
         ~pairFastqRead() {};
  };
  /**************************************************************************************************/
@@ -59,8 +62,8 @@ public:
      void help() { m->mothurOut(getHelpString()); }     
      
  private:
-    bool abort, allFiles, trimOverlap, createFileGroup, createOligosGroup, makeCount;
-    string outputDir, ffastqfile, rfastqfile, align, oligosfile, rfastafile, ffastafile, rqualfile, fqualfile, file, format;
+    bool abort, allFiles, trimOverlap, createFileGroup, createOligosGroup, makeCount, noneOk;
+    string outputDir, ffastqfile, rfastqfile, align, oligosfile, rfastafile, ffastafile, rqualfile, fqualfile, findexfile, rindexfile, file, format;
         float match, misMatch, gapOpen, gapExtend;
         int processors, longestBase, insert, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, deltaq;
      vector<string> outputNames;
@@ -81,14 +84,15 @@ private:
      fastqRead readFastq(ifstream&, bool&);
      vector< vector< vector<string> > > preProcessData(unsigned long int&);
      vector< vector<string> > readFileNames(string);
-    vector< vector<string> > readFastqFiles(unsigned long int&, string, string);
+    vector< vector<string> > readFastqFiles(unsigned long int&, string, string, string, string);
      vector< vector<string> > readFastaFiles(unsigned long int&, string, string);
      //bool checkReads(fastqRead&, fastqRead&, string, string);
      int createProcesses(vector< vector<string> >, string, string, string, vector<vector<string> >, int);
      int driver(vector<string>, string, string, string, vector<vector<string> >, int, string);
      bool getOligos(vector<vector<string> >&, string);
      string reverseOligo(string);
-    vector<pairFastqRead> getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map<string, fastqRead>& uniques);
+    vector<pairFastqRead> getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map<string, fastqRead>& uniques, bool);
+    vector<pairFastqRead> mergeReads(vector<pairFastqRead> frReads, vector<pairFastqRead> friReads, map<string, pairFastqRead>& pairUniques);
  };
  
  /**************************************************************************************************/
@@ -166,8 +170,10 @@ static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){
          string thisfqualfile = pDataArray->files[1];
          string thisrfastafile = pDataArray->files[2];
          string thisrqualfile = pDataArray->files[3];
+        string thisfindexfile = pDataArray->files[4];
+        string thisrindexfile = pDataArray->files[5];
          
-        if (pDataArray->m->debug) {  pDataArray->m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n"); }
+        if (pDataArray->m->debug) {  pDataArray->m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n[DEBUG]: findex = " + thisfindexfile + ".\n[DEBUG]: rindex = " + thisrindexfile + ".\n"); }
          
                 if(pDataArray->allFiles){
                         for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
@@ -180,7 +186,7 @@ static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){
                         }
                 }
          
-        ifstream inFFasta, inRFasta, inFQual, inRQual;
+        ifstream inFFasta, inRFasta, inFQual, inRQual, inFIndex, inRIndex;
          ofstream outFasta, outMisMatch, outScrapFasta;
          pDataArray->m->openInputFile(thisffastafile, inFFasta);
          pDataArray->m->openInputFile(thisrfastafile, inRFasta);
@@ -188,6 +194,10 @@ static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){
              pDataArray->m->openInputFile(thisfqualfile, inFQual);
              pDataArray->m->openInputFile(thisrqualfile, inRQual);
          }
+        
+        if (thisfindexfile != "") { pDataArray->m->openInputFile(thisfindexfile, inFIndex);  }
+        if (thisrindexfile != "") { pDataArray->m->openInputFile(thisrindexfile, inRIndex);  }
+        
          pDataArray->m->openOutputFile(pDataArray->outputFasta, outFasta);
          pDataArray->m->openOutputFile(pDataArray->outputMisMatches, outMisMatch);
          pDataArray->m->openOutputFile(pDataArray->outputScrapFasta, outScrapFasta);
@@ -213,12 +223,27 @@ static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){
                  rQual = new QualityScores(inRQual); pDataArray->m->gobble(inRQual);
              }
              
+            Sequence findexBarcode("findex", "NONE");  Sequence rindexBarcode("rindex", "NONE");
+            if (thisfindexfile != "") {
+                Sequence temp(inFIndex); pDataArray->m->gobble(inFIndex);
+                findexBarcode.setAligned(temp.getAligned());
+            }
+            
+            if (thisrindexfile != "") {
+                Sequence temp(inRIndex); pDataArray->m->gobble(inRIndex);
+                rindexBarcode.setAligned(temp.getAligned());
+            }
+
              int barcodeIndex = 0;
              int primerIndex = 0;
              
              if(pDataArray->barcodes.size() != 0){
                  if (thisfqualfile != "") {
-                    success = trimOligos.stripBarcode(fSeq, rSeq, *fQual, *rQual, barcodeIndex);
+                    if ((thisfindexfile != "") || (thisrindexfile != "")) {
+                        success = trimOligos.stripBarcode(findexBarcode, rindexBarcode, *fQual, *rQual, barcodeIndex);
+                    }else {
+                        success = trimOligos.stripBarcode(fSeq, rSeq, *fQual, *rQual, barcodeIndex);
+                    }
                  }else {
                      success = trimOligos.stripBarcode(fSeq, rSeq, barcodeIndex);
                  }
diff --git a/makefile b/makefile

index 163db6bd0c547f2b73055ab5b4498dbd82ebfea3..04de53382601600c77590d109e065819f4f80a98 100644 (file)
--- a/makefile
+++ b/makefile
@@ -15,8 +15,8 @@ USEREADLINE ?= yes
  CYGWIN_BUILD ?= no
  USECOMPRESSION ?= no
  MOTHUR_FILES="\"Enter_your_default_path_here\""
-RELEASE_DATE = "\"5/29/2013\""
-VERSION = "\"1.31.1\""
+RELEASE_DATE = "\"8/19/2013\""
+VERSION = "\"1.31.2\""
  FORTAN_COMPILER = gfortran
  FORTRAN_FLAGS = 
  
@@ -37,10 +37,9 @@ ifeq  ($(strip $(64BIT_VERSION)),yes)
         #TARGET_ARCH += -m64 -static
  
         #if you are a linux user use the following line
-       #CXXFLAGS += -mtune=native -march=native -m64
+       #CXXFLAGS += -mtune=native -march=native 
         
         CXXFLAGS += -DBIT_VERSION 
-    FORTRAN_FLAGS = -m64
  endif
  
  
diff --git a/makelefsecommand.cpp b/makelefsecommand.cpp

index dd44249281fa90fa6d5c6dbefaa1380e5462a215..9f895f60d42b9bc6bc1527f6826bfd6fd0d23767 100644 (file)
--- a/makelefsecommand.cpp
+++ b/makelefsecommand.cpp
@@ -264,22 +264,28 @@ int MakeLefseCommand::runRelabund(map<string, consTax2>& consTax, vector<SharedR
              designMap = new DesignMap(designfile);
              vector<string> categories = designMap->getNamesOfCategories();
              
+            if (categories.size() > 3) {  m->mothurOut("\n[NOTE]: LEfSe input files allow for a class, subclass and subject.  More than 3 categories can cause formatting errors.\n\n"); }
+            
              for (int j = 0; j < categories.size(); j++) {
                  out << categories[j] << "\t";
-                for (int i = 0; i < lookup.size(); i++) {
+                for (int i = 0; i < lookup.size()-1; i++) {
                      if (m->control_pressed) { out.close(); delete designMap; return 0; }
                      string value = designMap->get(lookup[i]->getGroup(), categories[j]);
                      if (value == "not found") {
                          m->mothurOut("[ERROR]: " + lookup[i]->getGroup() + " is not in your design file, please correct.\n"); m->control_pressed = true;
                      }else { out << value << '\t'; }
                  }
+                string value = designMap->get(lookup[lookup.size()-1]->getGroup(), categories[j]);
+                if (value == "not found") {
+                    m->mothurOut("[ERROR]: " + lookup[lookup.size()-1]->getGroup() + " is not in your design file, please correct.\n"); m->control_pressed = true;
+                }else { out << value; }
                  out << endl;
              }
          }
          
          out << "group\t";
-        for (int i = 0; i < lookup.size(); i++) {  out << lookup[i]->getGroup() << '\t'; }
-        out << endl;
+        for (int i = 0; i < lookup.size()-1; i++) {  out << lookup[i]->getGroup() << '\t'; }
+        out << lookup[lookup.size()-1]->getGroup() << endl;
          
          for (int i = 0; i < lookup[0]->getNumBins(); i++) { //process each otu
              if (m->control_pressed) { break; }
@@ -306,8 +312,9 @@ int MakeLefseCommand::runRelabund(map<string, consTax2>& consTax, vector<SharedR
              out << nameOfOtu << '\t';
              
              //print out relabunds for each otu
-            for (int j = 0; j < lookup.size(); j++) { out << lookup[j]->getAbundance(i) << '\t'; }
-            out << endl;
+            for (int j = 0; j < lookup.size()-1; j++) { out << lookup[j]->getAbundance(i) << '\t'; }
+            
+            out << lookup[lookup.size()-1]->getAbundance(i)<< endl;
          }
          out.close();
  
diff --git a/mothurout.cpp b/mothurout.cpp

index bfd1cc1da84a5140869c6766fc1f3de8dc02bf0b..713bafd0c2a79ec637abdf9d481a0c96c183ee77 100644 (file)
--- a/mothurout.cpp
+++ b/mothurout.cpp
@@ -2864,7 +2864,50 @@ unsigned int MothurOut::fromBase36(string base36){
         }
  }
  /***********************************************************************/
-
+string  MothurOut::findEdianness() {
+    try {
+        // find real endian type
+        unsigned char EndianTest[2] = {1,0};
+        short x = *(short *)EndianTest;
+        
+        string endianType = "unknown";
+        if(x == 1) { endianType = "BIG_ENDIAN"; }
+        else { endianType = "LITTLE_ENDIAN";    }
+    
+        return endianType;
+    }
+       catch(exception& e) {
+               errorOut(e, "MothurOut", "findEdianness");
+               exit(1);
+       }
+}
+/***********************************************************************/
+double  MothurOut::median(vector<double> x) {
+    try {
+        double value = 0.0;
+        
+        if (x.size() == 0) { } //error
+        else {
+            //For example, if a < b < c, then the median of the list {a, b, c} is b, and, if a < b < c < d, then the median of the list {a, b, c, d} is the mean of b and c; i.e., it is (b + c)/2.
+            sort(x.begin(), x.end());
+            //is x.size even?
+            if ((x.size()%2) == 0) { //size() is even. median = average of 2 midpoints
+                int midIndex1 = (x.size()/2)-1;
+                int midIndex2 = (x.size()/2);
+                value = (x[midIndex1]+ x[midIndex2]) / 2.0;
+            }else { 
+                int midIndex = (x.size()/2);
+                value = x[midIndex];
+            }
+        }
+        return value;
+    }
+       catch(exception& e) {
+               errorOut(e, "MothurOut", "median");
+               exit(1);
+       }
+}
+/***********************************************************************/
  int MothurOut::factorial(int num){
         try {
                 int total = 1;
@@ -3499,7 +3542,7 @@ vector<double> MothurOut::getStandardDeviation(vector< vector<double> >& dists,
          return stdDev;
      }
         catch(exception& e) {
-               errorOut(e, "MothurOut", "getAverages");                
+               errorOut(e, "MothurOut", "getStandardDeviation");               
                 exit(1);
         }
  }
@@ -3782,6 +3825,44 @@ double MothurOut::getStandardDeviation(vector<int>& featureVector){
         }
  }
  /**************************************************************************************************/
+// returns largest value in vector
+double MothurOut::max(vector<double>& featureVector){
+    try {
+        if (featureVector.size() == 0) { mothurOut("[ERROR]: vector size = 0!\n"); control_pressed=true; return 0.0; }
+        
+        //finds largest
+        double largest = featureVector[0];
+        for (int i = 1; i < featureVector.size(); i++) {
+            if (featureVector[i] > largest) { largest = featureVector[i]; }
+        }
+                
+        return largest;
+    }
+       catch(exception& e) {
+               errorOut(e, "MothurOut", "max");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+// returns smallest value in vector
+double MothurOut::min(vector<double>& featureVector){
+    try {
+        if (featureVector.size() == 0) { mothurOut("[ERROR]: vector size = 0!\n"); control_pressed=true; return 0.0; }
+        
+        //finds smallest
+        double smallest = featureVector[0];
+        for (int i = 1; i < featureVector.size(); i++) {
+            if (featureVector[i] < smallest) { smallest = featureVector[i]; }
+        }
+        
+        return smallest;
+    }
+       catch(exception& e) {
+               errorOut(e, "MothurOut", "min");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
  
  
  
diff --git a/mothurout.h b/mothurout.h

index 3cf6fa4963672b5d69acb7de389eb266b957ba24..9987374e6c824cafe4aaf343c319d5bca14f209b 100644 (file)
--- a/mothurout.h
+++ b/mothurout.h
@@ -138,6 +138,7 @@ class MothurOut {
                 bool isTrue(string);
                 bool isContainingOnlyDigits(string);
                 bool isNumeric1(string);
+        string findEdianness();
         
                 
                 //string manipulation
@@ -158,11 +159,14 @@ class MothurOut {
          map<string, vector<string> > parseClasses(string);
                 
                 //math operation
+        double max(vector<double>&); //returns largest value in vector
+        double min(vector<double>&); //returns smallest value in vector
                 int factorial(int num);
                 vector<vector<double> > binomial(int);
                 float ceilDist(float, int);
                 float roundDist(float, int);
                 unsigned int fromBase36(string);
+        double median(vector<double>);
                 int getRandomIndex(int); //highest
          double getStandardDeviation(vector<int>&);
          vector<double> getStandardDeviation(vector< vector<double> >&);
diff --git a/pairwiseseqscommand.cpp b/pairwiseseqscommand.cpp

index 7d96e31d1eefb534729e421b266092db1536aa46..f874e9a869d269546c04ce8fd9648786dcc4bab1 100644 (file)
--- a/pairwiseseqscommand.cpp
+++ b/pairwiseseqscommand.cpp
@@ -671,14 +671,11 @@ int PairwiseSeqsCommand::driver(int startLine, int endLine, string dFileName, fl
                                 alignment->align(seqI.getUnaligned(), seqJ.getUnaligned());
                                 seqI.setAligned(alignment->getSeqAAln());
                                 seqJ.setAligned(alignment->getSeqBAln());
-
-                               //cout << seqI.getName() << '\t' << seqJ.getName() << endl;
-                //cout << alignment->getSeqAAln() << endl << alignment->getSeqBAln() << endl;
                  
                                 distCalculator->calcDist(seqI, seqJ);
                                 double dist = distCalculator->getDist();
                  
-                //cout << "dist = " << dist << endl;
+                if (m->debug) { m->mothurOut("[DEBUG]: " + seqI.getName() + '\t' +  alignment->getSeqAAln() + '\n' + seqJ.getName() + alignment->getSeqBAln() + '\n' + "distance = " + toString(dist) + "\n"); }
                                                 
                                 if(dist <= cutoff){
                                         if (output == "column") { outFile << alignDB.get(i).getName() << ' ' << alignDB.get(j).getName() << ' ' << dist << endl; }
@@ -777,7 +774,9 @@ int PairwiseSeqsCommand::driver(int startLine, int endLine, string dFileName, st
                                 distCalculator->calcDist(seqI, seqJ);
                                 double dist = distCalculator->getDist();
                                                                 
-                               outFile << dist << '\t'; 
+                               outFile << dist << '\t';
+                
+                if (m->debug) { m->mothurOut("[DEBUG]: " + seqI.getName() + '\t' +  alignment->getSeqAAln() + '\n' + seqJ.getName() + alignment->getSeqBAln() + '\n' + "distance = " + toString(dist) + "\n"); }
                         }
                         
                         outFile << endl; 
@@ -860,6 +859,8 @@ int PairwiseSeqsCommand::driverMPI(int startLine, int endLine, MPI_File& outMPI,
                                 
                                 distCalculator->calcDist(seqI, seqJ);
                                 double dist = distCalculator->getDist();
+                
+                if (m->debug) { cout << ("[DEBUG]: " + seqI.getName() + '\t' +  alignment->getSeqAAln() + '\n' + seqJ.getName() + alignment->getSeqBAln() + '\n' + "distance = " + toString(dist) + "\n"); }
                                 
                                 if(dist <= cutoff){
                                          outputString += (alignDB.get(i).getName() + ' ' + alignDB.get(j).getName() + ' ' + toString(dist) + '\n'); 
@@ -968,6 +969,8 @@ int PairwiseSeqsCommand::driverMPI(int startLine, int endLine, string file, unsi
                                 
                                 distCalculator->calcDist(seqI, seqJ);
                                 double dist = distCalculator->getDist();
+                
+                if (m->debug) { cout << ("[DEBUG]: " + seqI.getName() + '\t' +  alignment->getSeqAAln() + '\n' + seqJ.getName() + alignment->getSeqBAln() + '\n' + "distance = " + toString(dist) + "\n"); }
                                 
                                 outputString += toString(dist) + "\t"; 
                         }
@@ -1072,7 +1075,9 @@ int PairwiseSeqsCommand::driverMPI(int startLine, int endLine, string file, unsi
                                 distCalculator->calcDist(seqI, seqJ);
                                 double dist = distCalculator->getDist();
                                 
-                               outputString += toString(dist) + "\t"; 
+                               outputString += toString(dist) + "\t";
+                
+                if (m->debug) { cout << ("[DEBUG]: " + seqI.getName() + '\t' +  alignment->getSeqAAln() + '\n' + seqJ.getName() + alignment->getSeqBAln() + '\n' + "distance = " + toString(dist) + "\n"); }
                         }
                         
                         outputString += "\n"; 
diff --git a/pairwiseseqscommand.h b/pairwiseseqscommand.h

index cb68e7fa8ef44ce231284e11f3f9a669396d6a44..90c21b25a7c35c47834ea7311f3e7f969163fc98 100644 (file)
--- a/pairwiseseqscommand.h
+++ b/pairwiseseqscommand.h
@@ -193,6 +193,8 @@ static DWORD WINAPI MyPairwiseSquareThreadFunction(LPVOID lpParam){
                                 distCalculator->calcDist(seqI, seqJ);
                                 double dist = distCalculator->getDist();
                  
+                if (pDataArray->m->debug) { pDataArray->m->mothurOut("[DEBUG]: " + seqI.getName() + '\t' +  alignment->getSeqAAln() + '\n' + seqJ.getName() + alignment->getSeqBAln() + '\n' + "distance = " + toString(dist) + "\n"); }
+                
                                 outFile << dist << '\t'; 
                         }
                         
@@ -293,6 +295,8 @@ static DWORD WINAPI MyPairwiseThreadFunction(LPVOID lpParam){
                                 distCalculator->calcDist(seqI, seqJ);
                                 double dist = distCalculator->getDist();
                  
+                if (pDataArray->m->debug) { pDataArray->m->mothurOut("[DEBUG]: " + seqI.getName() + '\t' +  alignment->getSeqAAln() + '\n' + seqJ.getName() + alignment->getSeqBAln() + '\n' + "distance = " + toString(dist) + "\n"); }
+                
                                 if(dist <= pDataArray->cutoff){
                                         if (pDataArray->output == "column") { outFile << pDataArray->alignDB.get(i).getName() << ' ' << pDataArray->alignDB.get(j).getName() << ' ' << dist << endl; }
                                 }
diff --git a/qualityscores.cpp b/qualityscores.cpp

index 0408b83ebaf0d79f31e5290278fa29e567c1cb87..1f86efc51304cc81fa2f359c8b2c996da52e8c30 100644 (file)
--- a/qualityscores.cpp
+++ b/qualityscores.cpp
@@ -44,7 +44,7 @@ QualityScores::QualityScores(ifstream& qFile){
              while(qFile.peek() != '>' && qFile.peek() != EOF){
                  if (m->control_pressed) { break; }
                  string temp = m->getline(qFile); m->gobble(qFile);
-                if (m->debug) { m->mothurOut("[DEBUG]: scores = '" + temp + "'\n.");  }
+                //if (m->debug) { m->mothurOut("[DEBUG]: scores = '" + temp + "'\n.");  }
                  qScoreString +=  ' ' + temp;
              }
              //cout << "done reading " << endl;
@@ -55,7 +55,7 @@ QualityScores::QualityScores(ifstream& qFile){
                  string temp;
                  qScoreStringStream >> temp;  m->gobble(qScoreStringStream);
                  
-                if (m->debug) { m->mothurOut("[DEBUG]: score " + toString(qScores.size()) + " = '" + temp + "'\n.");  }
+                //if (m->debug) { m->mothurOut("[DEBUG]: score " + toString(qScores.size()) + " = '" + temp + "'\n.");  }
                  
                  //check temp to make sure its a number
                  if (!m->isContainingOnlyDigits(temp)) { m->mothurOut("[ERROR]: In sequence " + seqName + "'s quality scores, expected a number and got " + temp + ", setting score to 0."); m->mothurOutEndLine(); temp = "0"; }
diff --git a/seqsummarycommand.cpp b/seqsummarycommand.cpp

index e6b037b6bcafafa6c7ccf5aafa9731de3d7ba864..0e3b70c1db3824084a51fb0db1c88630285878c2 100644 (file)
--- a/seqsummarycommand.cpp
+++ b/seqsummarycommand.cpp
@@ -376,7 +376,7 @@ int SeqSummaryCommand::execute(){
                  
          double meanstartPosition, meanendPosition, meanseqLength, meanambigBases, meanlongHomoPolymer;
                  
-               meanstartPosition /= (double) size; meanendPosition /= (double) size; meanlongHomoPolymer /= (double) size; meanseqLength /= (double) size; meanambigBases /= (double) size;
+               meanstartPosition = meanStartPosition / (double) size; meanendPosition = meanEndPosition /(double) size; meanlongHomoPolymer = meanLongHomoPolymer / (double) size; meanseqLength = meanSeqLength / (double) size; meanambigBases = meanAmbigBases /(double) size;
                                 
                 int ptile0_25   = int(size * 0.025);
                 int ptile25             = int(size * 0.250);
diff --git a/sffinfocommand.cpp b/sffinfocommand.cpp

index c80ac2e51859331f17f02bcdf5abdd3f89bec6f3..399f253570feaefcd39f979a0af2b7f3c5bb8632 100644 (file)
--- a/sffinfocommand.cpp
+++ b/sffinfocommand.cpp
@@ -559,6 +559,7 @@ int SffInfoCommand::extractSffInfo(string input, string accnos, string oligos){
                         if (m->control_pressed) { count = 0; break;   }
                         
                         if (count >= header.numReads) { break; }
+            //if (count >= 100) { break; }
                 }
                 
                 //report progress
@@ -651,6 +652,8 @@ int SffInfoCommand::readCommonHeader(ifstream& in, CommonHeader& header){
                         char buffer4 [4];
                         in.read(buffer4, 4);
                         header.numReads =  be_int4(*(unsigned int *)(&buffer4));
+            
+            if (m->debug) { m->mothurOut("[DEBUG]: numReads = " + toString(header.numReads) + "\n"); }
                                 
                         //read header length
                         char buffer5 [2];
@@ -716,7 +719,7 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){
          for (int i = 0; i < filehandlesHeaders.size(); i++) {  
              for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
                  ofstream out;
-                m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+                m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out);
                  out.write(mybuffer, in.gcount()); 
                  out.close();
              }
@@ -729,7 +732,7 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){
          for (int i = 0; i < filehandlesHeaders.size(); i++) {  
              for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
                  ofstream out;
-                m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+                m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out);
                  out.write(mybuffer, in.gcount()); 
                  out.close();
              }
@@ -752,7 +755,7 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){
                  thisbuffer[6] = (offset >> 8) & 0xFF;
                  thisbuffer[7] = offset & 0xFF;
                  ofstream out;
-                m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+                m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out);
                  out.write(thisbuffer, 8);
                  out.close();
              }
@@ -766,8 +769,8 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){
          for (int i = 0; i < filehandlesHeaders.size(); i++) {  
              for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
                  ofstream out;
-                m->openOutputFileAppend(filehandlesHeaders[i][j], out);
-                int offset = 0;
+                m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out);
+                unsigned int offset = 0;
                  char* thisbuffer = new char[4];
                  thisbuffer[0] = (offset >> 24) & 0xFF;
                  thisbuffer[1] = (offset >> 16) & 0xFF;
@@ -786,13 +789,20 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){
          for (int i = 0; i < filehandlesHeaders.size(); i++) {  
              for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
                  ofstream out;
-                m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+                m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out);
                  //convert number of reads to 4 byte char*
                  char* thisbuffer = new char[4];
-                thisbuffer[0] = (numSplitReads[i][j] >> 24) & 0xFF;
-                thisbuffer[1] = (numSplitReads[i][j] >> 16) & 0xFF;
-                thisbuffer[2] = (numSplitReads[i][j] >> 8) & 0xFF;
-                thisbuffer[3] = numSplitReads[i][j] & 0xFF;
+                if ((m->findEdianness()) == "BIG_ENDIAN") {
+                    thisbuffer[0] = (numSplitReads[i][j] >> 24) & 0xFF;
+                    thisbuffer[1] = (numSplitReads[i][j] >> 16) & 0xFF;
+                    thisbuffer[2] = (numSplitReads[i][j] >> 8) & 0xFF;
+                    thisbuffer[3] = numSplitReads[i][j] & 0xFF;
+                }else {
+                    thisbuffer[0] = numSplitReads[i][j] & 0xFF;
+                    thisbuffer[1] = (numSplitReads[i][j] >> 8) & 0xFF;
+                    thisbuffer[2] = (numSplitReads[i][j] >> 16) & 0xFF;
+                    thisbuffer[3] = (numSplitReads[i][j] >> 24) & 0xFF;
+                 }
                  out.write(thisbuffer, 4);
                  out.close();
                  delete[] thisbuffer;
@@ -805,7 +815,7 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){
          for (int i = 0; i < filehandlesHeaders.size(); i++) {  
              for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
                  ofstream out;
-                m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+                m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out);
                  out.write(mybuffer, in.gcount()); 
                  out.close();
              }
@@ -818,7 +828,7 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){
          for (int i = 0; i < filehandlesHeaders.size(); i++) {  
              for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
                  ofstream out;
-                m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+                m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out);
                  out.write(mybuffer, in.gcount()); 
                  out.close();
              }
@@ -831,7 +841,7 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){
          for (int i = 0; i < filehandlesHeaders.size(); i++) {  
              for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
                  ofstream out;
-                m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+                m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out);
                  out.write(mybuffer, in.gcount()); 
                  out.close();
              }
@@ -844,7 +854,7 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){
          for (int i = 0; i < filehandlesHeaders.size(); i++) {  
              for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
                  ofstream out;
-                m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+                m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out);
                  out.write(mybuffer, in.gcount()); 
                  out.close();
              }
@@ -857,7 +867,7 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){
          for (int i = 0; i < filehandlesHeaders.size(); i++) {  
              for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
                  ofstream out;
-                m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+                m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out);
                  out.write(mybuffer, in.gcount()); 
                  out.close();
              }
@@ -870,7 +880,7 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){
          for (int i = 0; i < filehandlesHeaders.size(); i++) {  
              for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
                  ofstream out;
-                m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+                m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out);
                  out.write(mybuffer, in.gcount()); 
                  out.close();
              }
@@ -913,7 +923,7 @@ int SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, H
              
              //read header length
                         char buffer [2];
-                       in.read(buffer, 2);
+                       in.read(buffer, 2); 
                         header.headerLength = be_int2(*(unsigned short *)(&buffer));
              
                         //read name length
@@ -925,33 +935,39 @@ int SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, H
                         char buffer3 [4];
                         in.read(buffer3, 4);
                         header.numBases =  be_int4(*(unsigned int *)(&buffer3));
+            
                         
                         //read clip qual left
                         char buffer4 [2];
                         in.read(buffer4, 2);
                         header.clipQualLeft =  be_int2(*(unsigned short *)(&buffer4));
-                       header.clipQualLeft = 5; 
+                       header.clipQualLeft = 5;
+            
                         
                         //read clip qual right
                         char buffer5 [2];
                         in.read(buffer5, 2);
                         header.clipQualRight =  be_int2(*(unsigned short *)(&buffer5));
+           
              
                         //read clipAdapterLeft
                         char buffer6 [2];
                         in.read(buffer6, 2);
                         header.clipAdapterLeft = be_int2(*(unsigned short *)(&buffer6));
              
+            
                         //read clipAdapterRight
                         char buffer7 [2];
                         in.read(buffer7, 2);
                         header.clipAdapterRight = be_int2(*(unsigned short *)(&buffer7));
              
+            
                         //read name
                         char* tempBuffer = new char[header.nameLength];
                         in.read(&(*tempBuffer), header.nameLength);
                         header.name = tempBuffer;
                         if (header.name.length() > header.nameLength) { header.name = header.name.substr(0, header.nameLength);  }
+            
                         delete[] tempBuffer;
                         
                         //extract info from name
@@ -1014,11 +1030,12 @@ int SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, H
                  int trashCodeLength = findGroup(header, read, barcodeIndex, primerIndex);
                                  
                  if(trashCodeLength == 0){
+                    //cout << header.name << " length = " << spot << '\t' << startSpotInFile << '\t' << in2.gcount() << endl;
+                    
                      ofstream out;
                      m->openOutputFileBinaryAppend(filehandles[barcodeIndex][primerIndex], out);
                      out.write(mybuffer, in2.gcount()); 
                      out.close();
-                    delete[] mybuffer;
                      numSplitReads[barcodeIndex][primerIndex]++;
                                 }
                                 else{
@@ -1026,9 +1043,8 @@ int SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, H
                      m->openOutputFileBinaryAppend(noMatchFile, out);
                      out.write(mybuffer, in2.gcount()); 
                      out.close();
-                    delete[] mybuffer;
                                 }
-                               
+                               delete[] mybuffer;
                         }
                 }else{
                         m->mothurOut("Error reading."); m->mothurOutEndLine();
@@ -1771,7 +1787,7 @@ bool SffInfoCommand::readOligos(string oligoFile){
                                         }
                                         
                                         filehandles[itBar->second][itPrimer->second] = thisFilename;
-                                       m->openOutputFile(thisFilename, temp);          temp.close();
+                                       temp.open(thisFilename.c_str(), ios::binary);           temp.close();
                                 }
                         }
                 }
diff --git a/shhhercommand.cpp b/shhhercommand.cpp

index 4a6e5ff2375ce40560db1aa706edbdacefac0e0d..c405e38060cec0db4b610c2e6c9e24a8810d7aed 100644 (file)
--- a/shhhercommand.cpp
+++ b/shhhercommand.cpp
@@ -1690,10 +1690,8 @@ void ShhherCommand::writeQualities(vector<int> otuCounts){
              if (m->control_pressed) { break; }
              
              int index = 0;
-            int base = 0;
              
              if(nSeqsPerOTU[i] > 0){
-                qualities[i].assign(1024, -1);
                  
                  while(index < numFlowCells){
                      double maxPrValue = 1e8;
@@ -1742,8 +1740,7 @@ void ShhherCommand::writeQualities(vector<int> otuCounts){
                              value = (int)floor(temp);
                              if(value > 100){   value = 100;    }
                              
-                            qualities[i][base] = (int)value;
-                            base++;
+                            qualities[i].push_back((int)value);
                          }
                      }
                      
@@ -1754,12 +1751,7 @@ void ShhherCommand::writeQualities(vector<int> otuCounts){
              
              if(otuCounts[i] > 0){
                  qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
-                
-                int j=4;       //need to get past the first four bases
-                while(qualities[i][j] != -1){
-                    qualityFile << qualities[i][j] << ' ';
-                    j++;
-                }
+                for (int j = 4; j < qualities[i].size(); j++) { qualityFile << qualities[i][j] << ' '; }
                  qualityFile << endl;
              }
          }
@@ -2394,16 +2386,22 @@ int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFil
                  m->mothurOut("\nFinalizing...\n");
                  fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
                  
+                if (m->debug) { m->mothurOut("[DEBUG]: done fill().\n"); }
+
                  if (m->control_pressed) { break; }
                  
                  setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
                  
+                if (m->debug) { m->mothurOut("[DEBUG]: done setOTUs().\n"); }
+                
                  if (m->control_pressed) { break; }
                  
                  vector<int> otuCounts(numOTUs, 0);
                  for(int j=0;j<numSeqs;j++)     {       otuCounts[otuData[j]]++;        }
                  
-                calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);        
+                calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
+                
+                if (m->debug) { m->mothurOut("[DEBUG]: done calcCentroidsDriver().\n"); }
                  
                  if (m->control_pressed) { break; }
                  
@@ -3280,12 +3278,11 @@ void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string quality
                         if (m->control_pressed) { break; }
                         
                         int index = 0;
-                       int base = 0;
-                       
+            
                         if(nSeqsPerOTU[i] > 0){
-                               qualities[i].assign(1024, -1);
                                 
                                 while(index < numFlowCells){
+                    
                                         double maxPrValue = 1e8;
                                         short maxPrIndex = -1;
                                         double count = 0.0000;
@@ -3304,7 +3301,7 @@ void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string quality
                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
                                                 }
                                         }
-                                       
+                    
                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
                                         maxPrValue = pr[maxPrIndex];
                                         
@@ -3315,7 +3312,7 @@ void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string quality
                                                 for(int s=0;s<HOMOPS;s++){
                                                         norm += exp(-(pr[s] - maxPrValue));
                                                 }
-                                               
+                        
                                                 for(int s=1;s<=maxPrIndex;s++){
                                                         int value = 0;
                                                         double temp = 0.0000;
@@ -3332,28 +3329,24 @@ void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string quality
                                                         value = (int)floor(temp);
                                                         if(value > 100){        value = 100;    }
                                                         
-                                                       qualities[i][base] = (int)value;
-                                                       base++;
+                            qualities[i].push_back((int)value);
                                                 }
-                                       }
+                                       }//end if
                                         
                                         index++;
-                               }
-                       }
-                       
+                    
+                               }//end while
+                
+                       }//end if
                         
+            
                         if(otuCounts[i] > 0){
                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
-                       
-                               int j=4;        //need to get past the first four bases
-                               while(qualities[i][j] != -1){
-                    qualityFile << qualities[i][j] << ' ';
-                    if (j > qualities[i].size()) { break; }
-                    j++;
-                               }
+                //need to get past the first four bases
+                for (int j = 4; j < qualities[i].size(); j++) { qualityFile << qualities[i][j] << ' '; }
                                 qualityFile << endl;
                         }
-               }
+               }//end for
                 qualityFile.close();
                 outputNames.push_back(qualityFileName); outputTypes["qfile"].push_back(qualityFileName);
          
diff --git a/splitgroupscommand.cpp b/splitgroupscommand.cpp

index c15ba3a4d588b3eedcaca7713c6d5acb8610aee2..5c0103708bc99daaf6d0046435882c72a6c631a7 100644 (file)
--- a/splitgroupscommand.cpp
+++ b/splitgroupscommand.cpp
@@ -362,7 +362,7 @@ int SplitGroupCommand::runCount(){
                  for (int i = 0; i < thisSeqsGroups.size(); i++) {
                      if (m->inUsersGroups(thisSeqsGroups[i], Groups)) { //if this sequence belongs to a group we want them print
                          seq.printSequence(*(ffiles[thisSeqsGroups[i]]));
-                        int numSeqs = ct.getGroupCount(seq.getName(), Groups[i]);
+                        int numSeqs = ct.getGroupCount(seq.getName(), thisSeqsGroups[i]);
                          (*(cfiles[thisSeqsGroups[i]])) << seq.getName() << '\t' << numSeqs << '\t' << numSeqs << endl;
                      }
                  }
diff --git a/subsamplecommand.cpp b/subsamplecommand.cpp

index fe79fbbae883a409187730e1899fde8b05493193..9e27fae51eca7ef564892e112d8a17d24eba6089 100644 (file)
--- a/subsamplecommand.cpp
+++ b/subsamplecommand.cpp
@@ -552,7 +552,7 @@ int SubSampleCommand::getSubSampleFasta() {
                      else{
                          itGroupCounts = groupCounts.find(group);
                          if (itGroupCounts != groupCounts.end()) {
-                            if (groupCounts[group] < size) {   subset.insert(names[j]);        groupCounts[group]++; }
+                            if (itGroupCounts->second < size) {        subset.insert(names[j]);        (itGroupCounts->second)++; }
                          }
                      }                          
                  }
diff --git a/trimflowscommand.cpp b/trimflowscommand.cpp

index 2a163516952c27f8a3dc7411536023f69e1f5b08..fd55edf25ede0b4de807bef17dadb861b44c03a8 100644 (file)
--- a/trimflowscommand.cpp
+++ b/trimflowscommand.cpp
@@ -637,8 +637,7 @@ void TrimFlowsCommand::getOligos(vector<vector<string> >& outFlowFileNames){
                         primers[""] = 0;
                         primerNameVector.push_back("");                 
                 }
-               
-               
+
                 outFlowFileNames.resize(barcodeNameVector.size());
                 for(int i=0;i<outFlowFileNames.size();i++){
                         outFlowFileNames[i].assign(primerNameVector.size(), "");
diff --git a/trimoligos.cpp b/trimoligos.cpp

index ddc1053f50d3eba28064907e4c6f0a60a958b3ee..0f2a9cbb0e584ddfa393a53015b02b37ccfa3903 100644 (file)
--- a/trimoligos.cpp
+++ b/trimoligos.cpp
@@ -368,6 +368,8 @@ int TrimOligos::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
                  temp = temp.substr(0,alnLength);
                  int numDiff = countDiffs(oligo, temp);
                  
+                if (m->debug) { m->mothurOut("[DEBUG]: " + seq.getName() + " aligned fragment =" + temp + ", barcode =" + oligo + ", numDiffs = " + toString(numDiff) + "\n"); }
+                
                  if(numDiff < minDiff){
                      minDiff = numDiff;
                      minCount = 1;
@@ -487,6 +489,8 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, int& gr
                  temp = temp.substr(0,alnLength);
                  int numDiff = countDiffs(oligo, temp);
                  
+                if (m->debug) { m->mothurOut("[DEBUG]: forward " + forwardSeq.getName() + " aligned fragment=" + temp + ", barcode=" + oligo + ", numDiffs=" + toString(numDiff) + ".\n");  }
+                
                  if (alnLength == 0) { numDiff = bdiffs + 100; }
                  //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
                  
@@ -548,6 +552,9 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, int& gr
                      oligo = oligo.substr(0,alnLength);
                      temp = temp.substr(0,alnLength);
                      int numDiff = countDiffs(oligo, temp);
+                    
+                    if (m->debug) { m->mothurOut("[DEBUG]: reverse " + forwardSeq.getName() + " aligned fragment=" + temp + ", barcode=" + oligo + ", numDiffs=" + toString(numDiff) + ".\n");  }
+                    
                      if (alnLength == 0) { numDiff = bdiffs + 100; }
                      
                      //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
@@ -700,7 +707,7 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, Quality
                  int numDiff = countDiffs(oligo, temp);
                  
                  if (alnLength == 0) { numDiff = bdiffs + 100; }
-                //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
+                if (m->debug) { m->mothurOut("[DEBUG]: forward " + forwardSeq.getName() + " aligned fragment=" + temp + ", barcode=" + oligo + ", numDiffs=" + toString(numDiff) + ".\n");  }
                  
                  if(numDiff < minDiff){
                      minDiff = numDiff;
@@ -762,6 +769,8 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, Quality
                      int numDiff = countDiffs(oligo, temp);
                      if (alnLength == 0) { numDiff = bdiffs + 100; }
                      
+                    if (m->debug) { m->mothurOut("[DEBUG]: reverse " + reverseSeq.getName() + " aligned fragment=" + temp + ", barcode=" + oligo + ", numDiffs=" + toString(numDiff) + ".\n");  }
+                    
                      //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
                      if(numDiff < minDiff){
                          minDiff = numDiff;
@@ -1370,6 +1379,8 @@ int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, Quality
                  if (alnLength == 0) { numDiff = pdiffs + 100; }
                  //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
                  
+                if (m->debug) { m->mothurOut("[DEBUG]: forward " + forwardSeq.getName() + " aligned fragment=" + temp + ", primer=" + oligo + ", numDiffs=" + toString(numDiff) + ".\n");  }
+                
                  if(numDiff < minDiff){
                      minDiff = numDiff;
                      minCount = 1;
@@ -1430,6 +1441,8 @@ int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, Quality
                      int numDiff = countDiffs(oligo, temp);
                      if (alnLength == 0) { numDiff = pdiffs + 100; }
                      
+                    if (m->debug) { m->mothurOut("[DEBUG]: reverse " + forwardSeq.getName() + " aligned fragment=" + temp + ", primer=" + oligo + ", numDiffs=" + toString(numDiff) + ".\n");  }
+                    
                      //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
                      if(numDiff < minDiff){
                          minDiff = numDiff;
@@ -1579,6 +1592,8 @@ int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, int& gr
                  temp = temp.substr(0,alnLength);
                  int numDiff = countDiffs(oligo, temp);
                  
+                if (m->debug) { m->mothurOut("[DEBUG]: forward " + forwardSeq.getName() + " aligned fragment=" + temp + ", primer=" + oligo + ", numDiffs=" + toString(numDiff) + ".\n");  }
+                
                  if (alnLength == 0) { numDiff = pdiffs + 100; }
                  //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
                  
@@ -1640,6 +1655,9 @@ int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, int& gr
                      oligo = oligo.substr(0,alnLength);
                      temp = temp.substr(0,alnLength);
                      int numDiff = countDiffs(oligo, temp);
+                    
+                    if (m->debug) { m->mothurOut("[DEBUG]: reverse " + forwardSeq.getName() + " aligned fragment=" + temp + ", primer=" + oligo + ", numDiffs=" + toString(numDiff) + ".\n");  }
+                    
                      if (alnLength == 0) { numDiff = pdiffs + 100; }
                      
                      //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
@@ -1773,6 +1791,8 @@ int TrimOligos::stripBarcode(Sequence& seq, int& group){
                  
                  int numDiff = countDiffs(oligo, temp);
                  
+                if (m->debug) { m->mothurOut("[DEBUG]: " + seq.getName() + " aligned fragment =" + temp + ", barcode =" + oligo + ", numDiffs = " + toString(numDiff) + "\n"); }
+                
                  if(numDiff < minDiff){
                      minDiff = numDiff;
                      minCount = 1;
diff --git a/unifracunweightedcommand.cpp b/unifracunweightedcommand.cpp

index 272ae8255b6566e0f2c0ff04489580a9a36acdfd..d2c459d9381a7d28481d8808f31f08cf7e30d577 100644 (file)
--- a/unifracunweightedcommand.cpp
+++ b/unifracunweightedcommand.cpp
@@ -736,7 +736,7 @@ int UnifracUnweightedCommand::runRandomCalcs(Tree* thisTree, vector<double> user
              float rcumul = 1.0000;
      
              //this loop fills the cumulative maps and put 0.0000 in the score freq map to make it easier to print.
-            for (map<float,float>::iterator it = validScores.begin(); it != validScores.end(); it++) { 
+            for (map<float,float>::iterator it = validScores.begin(); it != validScores.end(); it++) {
                  //make rscoreFreq map and rCumul
                  map<float,float>::iterator it2 = rscoreFreq[a].find(it->first);
                  rCumul[a][it->first] = rcumul;
diff --git a/unifracweightedcommand.cpp b/unifracweightedcommand.cpp

index 3b0c53b7437111cfe5045663a8c78370274b939e..f96264f4af2ad46a478ec9ada9b26ec03cea8865 100644 (file)
--- a/unifracweightedcommand.cpp
+++ b/unifracweightedcommand.cpp
@@ -915,7 +915,7 @@ void UnifracWeightedCommand::printWeightedFile() {
                 for(int a = 0; a < numComp; a++) {
                         output->initFile(groupComb[a], tags);
                         //print each line
-                       for (map<float,float>::iterator it = validScores.begin(); it != validScores.end(); it++) { 
+                       for (map<double,double>::iterator it = validScores.begin(); it != validScores.end(); it++) {
                                 data.push_back(it->first);  data.push_back(rScoreFreq[a][it->first]); data.push_back(rCumul[a][it->first]); 
                                 output->output(data);
                                 data.clear();
@@ -1085,7 +1085,7 @@ void UnifracWeightedCommand::calculateFreqsCumuls() {
                 for (int f = 0; f < numComp; f++) {
                         for (int i = 0; i < rScores[f].size(); i++) { //looks like 0,0,1,1,1,2,4,7...  you want to make a map that say rScoreFreq[0] = 2, rScoreFreq[1] = 3...
                                 validScores[rScores[f][i]] = rScores[f][i];
-                               map<float,float>::iterator it = rScoreFreq[f].find(rScores[f][i]);
+                               map<double,double>::iterator it = rScoreFreq[f].find(rScores[f][i]);
                                 if (it != rScoreFreq[f].end()) {
                                         rScoreFreq[f][rScores[f][i]]++;
                                 }else{
@@ -1098,9 +1098,9 @@ void UnifracWeightedCommand::calculateFreqsCumuls() {
                 for(int a = 0; a < numComp; a++) {
                         float rcumul = 1.0000;
                         //this loop fills the cumulative maps and put 0.0000 in the score freq map to make it easier to print.
-                       for (map<float,float>::iterator it = validScores.begin(); it != validScores.end(); it++) {
+                       for (map<double,double>::iterator it = validScores.begin(); it != validScores.end(); it++) {
                                 //make rscoreFreq map and rCumul
-                               map<float,float>::iterator it2 = rScoreFreq[a].find(it->first);
+                               map<double,double>::iterator it2 = rScoreFreq[a].find(it->first);
                                 rCumul[a][it->first] = rcumul;
                                 //get percentage of random trees with that info
                                 if (it2 != rScoreFreq[a].end()) {  rScoreFreq[a][it->first] /= iters; rcumul-= it2->second;  }
diff --git a/unifracweightedcommand.h b/unifracweightedcommand.h

index 10cbc7b83f51d70a11b42ae761fe66864b8f7898..a5507cdb6cc2d95152eec0241f386acd66de1aab 100644 (file)
--- a/unifracweightedcommand.h
+++ b/unifracweightedcommand.h
@@ -54,9 +54,9 @@ class UnifracWeightedCommand : public Command {
                 int iters, numGroups, numComp, counter;
                 vector< vector<double> > rScores;  //vector<weighted scores for random trees.> each group comb has an entry
                 vector< vector<double> > uScores;  //vector<weighted scores for user trees.> each group comb has an entry
-               vector< map<float, float> > rScoreFreq;  //map <weighted score, number of random trees with that score.> -vector entry for each combination.
-               vector< map<float, float> > rCumul;  //map <weighted score, cumulative percentage of number of random trees with that score or higher.> -vector entry for each c                                                                
-               map<float, float>  validScores;  //map contains scores from random
+               vector< map<double, double> > rScoreFreq;  //map <weighted score, number of random trees with that score.> -vector entry for each combination.
+               vector< map<double, double> > rCumul;  //map <weighted score, cumulative percentage of number of random trees with that score or higher.> -vector entry for each c                                                              
+               map<double, double>  validScores;  //map contains scores from random
                 
                 bool abort, phylip, random, includeRoot, subsample, consensus;
                 string groups, itersString, outputForm, treefile, groupfile, namefile, countfile;
author	Sarah Westcott <mothur.westcott@gmail.com>
	Wed, 18 Sep 2013 16:53:47 +0000 (12:53 -0400)
committer	Sarah Westcott <mothur.westcott@gmail.com>
	Wed, 18 Sep 2013 16:53:47 +0000 (12:53 -0400)
Mothur.xcodeproj/project.pbxproj		patch \| blob \| history
Mothur.xcodeproj/xcuserdata/SarahsWork.xcuserdatad/xcschemes/Mothur.xcscheme		patch \| blob \| history
aligncommand.cpp		patch \| blob \| history
counttable.cpp		patch \| blob \| history
filters.h		patch \| blob \| history
lefsecommand.cpp		patch \| blob \| history
lefsecommand.h		patch \| blob \| history
linearalgebra.cpp		patch \| blob \| history
linearalgebra.h		patch \| blob \| history
makebiomcommand.cpp		patch \| blob \| history
makecontigscommand.cpp		patch \| blob \| history
makecontigscommand.h		patch \| blob \| history
makefile		patch \| blob \| history
makelefsecommand.cpp		patch \| blob \| history
mothurout.cpp		patch \| blob \| history
mothurout.h		patch \| blob \| history
pairwiseseqscommand.cpp		patch \| blob \| history
pairwiseseqscommand.h		patch \| blob \| history
qualityscores.cpp		patch \| blob \| history
seqsummarycommand.cpp		patch \| blob \| history
sffinfocommand.cpp		patch \| blob \| history
shhhercommand.cpp		patch \| blob \| history
splitgroupscommand.cpp		patch \| blob \| history
subsamplecommand.cpp		patch \| blob \| history
trimflowscommand.cpp		patch \| blob \| history
trimoligos.cpp		patch \| blob \| history
unifracunweightedcommand.cpp		patch \| blob \| history
unifracweightedcommand.cpp		patch \| blob \| history
unifracweightedcommand.h		patch \| blob \| history