From 3a5dd9e428ab93a6dcdce7912e8ebb977be0b893 Mon Sep 17 00:00:00 2001 From: Sarah Westcott Date: Wed, 18 Sep 2013 12:53:47 -0400 Subject: [PATCH] added lefse command. added check for hard filter length. fixed bug in make.biom when multiple labels in shared file. added index file to make.contigs. fixed weird output in summary.seqs for mean values. added checks for edianness in sffinfo when oligos file is used. shhh.flows fix for reads longer than 1024. fixed bug in split.groups with count file. added several functions to linearalegra for lefse. --- Mothur.xcodeproj/project.pbxproj | 8 +- .../xcschemes/Mothur.xcscheme | 6 +- aligncommand.cpp | 1 - counttable.cpp | 3 + filters.h | 2 + lefsecommand.cpp | 943 +++++++++++++++--- lefsecommand.h | 25 +- linearalgebra.cpp | 605 ++++++++++- linearalgebra.h | 11 +- makebiomcommand.cpp | 9 +- makecontigscommand.cpp | 396 ++++++-- makecontigscommand.h | 39 +- makefile | 7 +- makelefsecommand.cpp | 17 +- mothurout.cpp | 85 +- mothurout.h | 4 + pairwiseseqscommand.cpp | 17 +- pairwiseseqscommand.h | 4 + qualityscores.cpp | 4 +- seqsummarycommand.cpp | 2 +- sffinfocommand.cpp | 60 +- shhhercommand.cpp | 53 +- splitgroupscommand.cpp | 2 +- subsamplecommand.cpp | 2 +- trimflowscommand.cpp | 3 +- trimoligos.cpp | 22 +- unifracunweightedcommand.cpp | 2 +- unifracweightedcommand.cpp | 8 +- unifracweightedcommand.h | 6 +- 29 files changed, 2059 insertions(+), 287 deletions(-) diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index a17c97b..27607ff 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -2413,7 +2413,7 @@ GCC_OPTIMIZATION_LEVEL = 3; "INSTALL_PATH[sdk=*]" = TARGET_BUILD_DIR; PRODUCT_NAME = mothur; - SDKROOT = macosx10.8; + SDKROOT = macosx; SKIP_INSTALL = NO; }; name = Debug; @@ -2449,6 +2449,7 @@ "VERSION=\"\\\"1.31.0\\\"\"", "RELEASE_DATE=\"\\\"5/24/2013\\\"\"", ); + GCC_VERSION = ""; "GCC_VERSION[arch=*]" = ""; GCC_WARN_ABOUT_MISSING_NEWLINE = YES; GCC_WARN_ABOUT_RETURN_TYPE = YES; @@ -2466,7 +2467,7 @@ "-lncurses", "-lreadline", ); - SDKROOT = macosx; + SDKROOT = macosx10.8; SKIP_INSTALL = NO; USER_HEADER_SEARCH_PATHS = ""; }; @@ -2485,6 +2486,7 @@ "VERSION=\"\\\"1.31.0\\\"\"", "RELEASE_DATE=\"\\\"5/24/2013\\\"\"", ); + GCC_VERSION = ""; GCC_WARN_ABOUT_MISSING_NEWLINE = YES; GCC_WARN_ABOUT_RETURN_TYPE = YES; GCC_WARN_MISSING_PARENTHESES = YES; @@ -2504,7 +2506,7 @@ "-lncurses", "-lreadline", ); - SDKROOT = macosx; + SDKROOT = macosx10.8; SKIP_INSTALL = NO; }; name = Release; diff --git a/Mothur.xcodeproj/xcuserdata/SarahsWork.xcuserdatad/xcschemes/Mothur.xcscheme b/Mothur.xcodeproj/xcuserdata/SarahsWork.xcuserdatad/xcschemes/Mothur.xcscheme index 3ced6e4..2ce2b0e 100644 --- a/Mothur.xcodeproj/xcuserdata/SarahsWork.xcuserdatad/xcschemes/Mothur.xcscheme +++ b/Mothur.xcodeproj/xcuserdata/SarahsWork.xcuserdatad/xcschemes/Mothur.xcscheme @@ -1,5 +1,6 @@ @@ -39,11 +40,12 @@ diff --git a/aligncommand.cpp b/aligncommand.cpp index f757a79..f9c0436 100644 --- a/aligncommand.cpp +++ b/aligncommand.cpp @@ -558,7 +558,6 @@ int AlignCommand::driver(linePair* filePos, string alignFName, string reportFNam if (m->control_pressed) { break; } Sequence* candidateSeq = new Sequence(inFASTA); m->gobble(inFASTA); - cout << candidateSeq->getAligned() << endl; report.setCandidate(candidateSeq); int origNumBases = candidateSeq->getNumBases(); diff --git a/counttable.cpp b/counttable.cpp index 6eca8ee..0e9eec3 100644 --- a/counttable.cpp +++ b/counttable.cpp @@ -246,6 +246,9 @@ int CountTable::readTable(string file, bool readGroups) { in >> name; m->gobble(in); in >> thisTotal; m->gobble(in); if (m->debug) { m->mothurOut("[DEBUG]: " + name + '\t' + toString(thisTotal) + "\n"); } + if (thisTotal == 0) { error=true; m->mothurOut("[ERROR]: Your count table contains a sequence named " + name + " with a total=0. Please correct."); m->mothurOutEndLine(); + } + //if group info, then read it vector groupCounts; groupCounts.resize(numGroups, 0); if (columnHeaders.size() > 2) { //file contains groups diff --git a/filters.h b/filters.h index 21bc8bb..b00cb35 100644 --- a/filters.h +++ b/filters.h @@ -82,6 +82,8 @@ public: fileHandle >> filter; fileHandle.close(); + + if (filter.length() != alignmentLength) { m->mothurOut("[ERROR]: Sequences are not all the same length as the filter, please correct.\n"); m->control_pressed = true; } } void getFreqs(Sequence seq) { diff --git a/lefsecommand.cpp b/lefsecommand.cpp index c478a4d..3575e02 100644 --- a/lefsecommand.cpp +++ b/lefsecommand.cpp @@ -18,8 +18,28 @@ vector LefseCommand::setParameters(){ CommandParameter psubclass("subclass", "String", "", "", "", "", "","",false,false); parameters.push_back(psubclass); CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel); CommandParameter pclasses("classes", "String", "", "", "", "", "","",false,false); parameters.push_back(pclasses); - CommandParameter palpha("anova_alpha", "Number", "", "0.05", "", "", "","",false,false); parameters.push_back(palpha); - CommandParameter pwalpha("wilcoxon_alpha", "Number", "", "0.05", "", "", "","",false,false); parameters.push_back(pwalpha); + CommandParameter palpha("aalpha", "Number", "", "0.05", "", "", "","",false,false); parameters.push_back(palpha); + CommandParameter pwalpha("walpha", "Number", "", "0.05", "", "", "","",false,false); parameters.push_back(pwalpha); + + CommandParameter plda("lda", "Number", "", "2.0", "", "", "","",false,false); parameters.push_back(plda); + CommandParameter pwilc("wilc", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pwilc); + CommandParameter pnormmillion("norm", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pnormmillion); + CommandParameter piters("iters", "Number", "", "30", "", "", "","",false,false); parameters.push_back(piters); + //CommandParameter pwilcsamename("wilcsamename", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pwilcsamename); + CommandParameter pcurv("curv", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pcurv); + CommandParameter pfiters("fboots", "Number", "", "0.67", "", "", "","",false,false); parameters.push_back(pfiters); + CommandParameter pstrict("strict", "Multiple", "0-1-2", "0", "", "", "","",false,false); parameters.push_back(pstrict); + CommandParameter pminc("minc", "Number", "", "10", "", "", "","",false,false); parameters.push_back(pminc); + CommandParameter pmulticlass_strat("multiclass", "Multiple", "onevone-onevall", "onevall", "", "", "","",false,false); parameters.push_back(pmulticlass_strat); + //CommandParameter psubject("subject", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(psubject); + + + //not used in their current code, but in parameters + //CommandParameter pnlogs("nlogs", "Number", "", "3", "", "", "","",false,false); parameters.push_back(pnlogs); + //CommandParameter pranktec("ranktec", "Multiple", "lda-svm", "lda", "", "", "","",false,false); parameters.push_back(pranktec); // svm not implemented in their source yet. + //CommandParameter psvmnorm("svmnorm", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(psvmnorm); //not used because svm not implemented yet. + + //every command must have inputdir and outputdir. This allows mothur users to redirect input and output files. CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); @@ -38,10 +58,22 @@ string LefseCommand::getHelpString(){ try { string helpString = ""; helpString += "The lefse command allows you to ....\n"; - helpString += "The lefse command parameters are: shared, design, class, subclass, label, classes, wilcoxon_alpha, anova_alpha.\n"; + helpString += "The lefse command parameters are: shared, design, class, subclass, label, classes, walpha, aalpha, lda, wilc, iters, curv, fboots, strict, minc, multiclass and norm.\n"; helpString += "The class parameter is used to indicate the which category you would like used for the Kruskal Wallis analysis. If none is provided first category is used.\n"; - helpString += "The subclass parameter is used to indicate the .....If none is provided second category is used, or if only one category subclass is ignored. \n"; - helpString += "The classes parameter is used to indicate the classes you would like to use. Clases should be inputted in the following format: classes=label-label. For example to include groups from treatment with value early or late and age= young or old. class=treatment-age.\n"; + helpString += "The subclass parameter is used to indicate the .....If none is provided, second category is used, or if only one category subclass is ignored. \n"; + helpString += "The aalpha parameter is used to set the alpha value for the Krukal Wallis Anova test Default=0.05. \n"; + helpString += "The walpha parameter is used to set the alpha value for the Wilcoxon test. Default=0.05. \n"; + helpString += "The lda parameter is used to set the threshold on the absolute value of the logarithmic LDA score. Default=2.0. \n"; + helpString += "The wilc parameter is used to indicate whether to perform the Wilcoxon test. Default=T. \n"; + helpString += "The iters parameter is used to set the number of bootstrap iteration for LDA. Default=30. \n"; + //helpString += "The wilcsamename parameter is used to indicate whether perform the wilcoxon test only among the subclasses with the same name. Default=F. \n"; + helpString += "The curv parameter is used to set whether perform the wilcoxon test ing the Curtis's approach [BETA VERSION] Default=F. \n"; + helpString += "The norm parameter is used to multiply relative abundances by 1000000. Recommended when very low values are present. Default=T. \n"; + helpString += "The fboots parameter is used to set the subsampling fraction value for each bootstrap iteration. Default=0.67. \n"; + helpString += "The strict parameter is used to set the multiple testing correction options. 0 no correction (more strict, default), 1 correction for independent comparisons, 2 correction for independent comparison. Options = 0,1,2. Default=0. \n"; + helpString += "The minc parameter is used to minimum number of samples per subclass for performing wilcoxon test. Default=10. \n"; + helpString += "The multiclass parameter is used to (for multiclass tasks) set whether the test is performed in a one-against-one ( onevone - more strict!) or in a one-against-all setting ( onevall - less strict). Default=onevall. \n"; + helpString += "The classes parameter is used to indicate the classes you would like to use. Classes should be inputted in the following format: classes=label-label. For example to include groups from treatment with value early or late and age= young or old. class=treatment-age.\n"; helpString += "The label parameter is used to indicate which distances in the shared file you would like to use. labels are separated by dashes.\n"; helpString += "The lefse command should be in the following format: lefse(shared=final.an.shared, design=final.design, class=treatment, subclass=age).\n"; return helpString; @@ -57,8 +89,6 @@ string LefseCommand::getOutputPattern(string type) { string pattern = ""; if (type == "summary") { pattern = "[filename],[distance],lefse_summary"; } - else if (type == "kruskall-wallis") { pattern = "[filename],[distance],kruskall_wallis"; } - else if (type == "wilcoxon") { pattern = "[filename],[distance],wilcoxon"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } return pattern; @@ -75,8 +105,6 @@ LefseCommand::LefseCommand(){ setParameters(); vector tempOutNames; outputTypes["summary"] = tempOutNames; - outputTypes["kruskall-wallis"] = tempOutNames; - outputTypes["wilcoxon"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "LefseCommand", "LefseCommand"); @@ -109,8 +137,6 @@ LefseCommand::LefseCommand(string option) { vector tempOutNames; outputTypes["summary"] = tempOutNames; - outputTypes["kruskall-wallis"] = tempOutNames; - outputTypes["wilcoxon"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -171,19 +197,63 @@ LefseCommand::LefseCommand(string option) { if (mclass == "not found") { mclass = ""; } subclass = validParameter.validFile(parameters, "subclass", false); - if (subclass == "not found") { subclass = ""; } + if (subclass == "not found") { subclass = mclass; } classes = validParameter.validFile(parameters, "classes", false); if (classes == "not found") { classes = ""; } - string temp = validParameter.validFile(parameters, "anova_alpha", false); + string temp = validParameter.validFile(parameters, "aalpha", false); if (temp == "not found") { temp = "0.05"; } m->mothurConvert(temp, anovaAlpha); - temp = validParameter.validFile(parameters, "wilcoxon_alpha", false); + temp = validParameter.validFile(parameters, "walpha", false); if (temp == "not found") { temp = "0.05"; } m->mothurConvert(temp, wilcoxonAlpha); + + temp = validParameter.validFile(parameters, "wilc", false); + if (temp == "not found") { temp = "T"; } + wilc = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "norm", false); + if (temp == "not found") { temp = "T"; } + normMillion = m->isTrue(temp); + + //temp = validParameter.validFile(parameters, "subject", false); + //if (temp == "not found") { temp = "F"; } + //subject = m->isTrue(temp); + temp = validParameter.validFile(parameters, "lda", false); + if (temp == "not found") { temp = "2.0"; } + m->mothurConvert(temp, ldaThreshold); + + temp = validParameter.validFile(parameters, "iters", false); + if (temp == "not found") { temp = "30"; } + m->mothurConvert(temp, iters); + + temp = validParameter.validFile(parameters, "fboots", false); + if (temp == "not found") { temp = "0.67"; } + m->mothurConvert(temp, fBoots); + + //temp = validParameter.validFile(parameters, "wilcsamename", false); + //if (temp == "not found") { temp = "F"; } + //wilcsamename = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "curv", false); + if (temp == "not found") { temp = "F"; } + curv = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "strict", false); + if (temp == "not found"){ temp = "0"; } + if ((temp != "0") && (temp != "1") && (temp != "2")) { m->mothurOut("Invalid strict option: choices are 0, 1 or 2."); m->mothurOutEndLine(); abort=true; } + else { m->mothurConvert(temp, strict); } + + temp = validParameter.validFile(parameters, "minc", false); + if (temp == "not found") { temp = "10"; } + m->mothurConvert(temp, minC); + + multiClassStrat = validParameter.validFile(parameters, "multiclass", false); + if (multiClassStrat == "not found"){ multiClassStrat = "onevall"; } + if ((multiClassStrat != "onevall") && (multiClassStrat != "onevone")) { m->mothurOut("Invalid multiclass option: choices are onevone or onevall."); m->mothurOutEndLine(); abort=true; } } } @@ -196,6 +266,8 @@ LefseCommand::LefseCommand(string option) { int LefseCommand::execute(){ try { + //for reading lefse formatted file and running in mothur for testing - pass number of rows used for design file + if (false) { makeShared(1); exit(1); } if (abort == true) { if (calledHelp) { return 0; } return 2; } @@ -212,7 +284,7 @@ int LefseCommand::execute(){ if (mclass == "") { mclass = designMap.getDefaultClass(); m->mothurOut("\nYou did not provide a class, using " + mclass +".\n\n"); } InputData input(sharedfile, "sharedfile"); - vector lookup = input.getSharedRAbundVectors(); + vector lookup = input.getSharedRAbundFloatVectors(); string lastLabel = lookup[0]->getLabel(); //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. @@ -239,7 +311,7 @@ int LefseCommand::execute(){ string saveLabel = lookup[0]->getLabel(); for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } - lookup = input.getSharedRAbundVectors(lastLabel); + lookup = input.getSharedRAbundFloatVectors(lastLabel); m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine(); process(lookup, designMap); @@ -258,7 +330,7 @@ int LefseCommand::execute(){ if (m->control_pressed) { return 0; } //get next line to process - lookup = input.getSharedRAbundVectors(); + lookup = input.getSharedRAbundFloatVectors(); } if (m->control_pressed) { return 0; } @@ -279,7 +351,7 @@ int LefseCommand::execute(){ //run last label if you need to if (needToRun == true) { for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } } - lookup = input.getSharedRAbundVectors(lastLabel); + lookup = input.getSharedRAbundFloatVectors(lastLabel); m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine(); process(lookup, designMap); @@ -303,13 +375,73 @@ int LefseCommand::execute(){ } //********************************************************************************************************************** -int LefseCommand::process(vector& lookup, DesignMap& designMap) { +int LefseCommand::process(vector& lookup, DesignMap& designMap) { try { + vector classes; + vector subclasses; + map subclass2Class; + map > class2SubClasses; //maps class name to vector of its subclasses + map > subClass2GroupIndex; //maps subclass name to vector of indexes in lookup from that subclass. old -> 1,2,3 means groups in location 1,2,3 of lookup are from old. Saves time below. + map > class2GroupIndex; //maps subclass name to vector of indexes in lookup from that class. old -> 1,2,3 means groups in location 1,2,3 of lookup are from old. Saves time below. + for (int j = 0; j < lookup.size(); j++) { + if (normMillion) { normalize(lookup); } + string group = lookup[j]->getGroup(); + string treatment = designMap.get(group, mclass); //get value for this group in this category + string thisSub = designMap.get(group, subclass); + map::iterator it = subclass2Class.find(thisSub); + if (it == subclass2Class.end()) { + subclass2Class[thisSub] = treatment; + vector temp; temp.push_back(j); + subClass2GroupIndex[thisSub] = temp; + } + else { + if (it->second != treatment) { + //m->mothurOut("[WARNING]: subclass " + thisSub + " has members in " + it->second + " and " + treatment + ". Subclass members must be from the same class for Wilcoxon. Changing " + thisSub + " to " + treatment + "_" + thisSub + ".\n"); + thisSub = treatment + "_" + thisSub; + subclass2Class[thisSub] = treatment; + vector temp; temp.push_back(j); + subClass2GroupIndex[thisSub] = temp; + }else { subClass2GroupIndex[thisSub].push_back(j); } + } + + map >::iterator itClass = class2SubClasses.find(treatment); + if (itClass == class2SubClasses.end()) { + set temp; temp.insert(thisSub); + class2SubClasses[treatment] = temp; + vector temp2; temp2.push_back(j); + class2GroupIndex[treatment] = temp2; + classes.push_back(treatment); + }else{ + class2SubClasses[treatment].insert(thisSub); + class2GroupIndex[treatment].push_back(j); + } + } + //sort classes so order is right + sort(classes.begin(), classes.end()); + + vector< vector > means = getMeans(lookup, class2GroupIndex); //[numOTUs][classes] - classes in same order as class2GroupIndex + //run kruskal wallis on each otu - vector significantOtuLabels = runKruskalWallis(lookup, designMap); + map significantOtuLabels = runKruskalWallis(lookup, designMap); + + int numSigBeforeWilcox = significantOtuLabels.size(); //check for subclass - if (subclass != "") { significantOtuLabels = runWilcoxon(lookup, designMap, significantOtuLabels); } + string wilcoxString = ""; + if ((subclass != "") && wilc) { significantOtuLabels = runWilcoxon(lookup, designMap, significantOtuLabels, class2SubClasses, subClass2GroupIndex, subclass2Class); wilcoxString += " ( " + toString(numSigBeforeWilcox) + " ) before internal wilcoxon"; } + + int numSigAfterWilcox = significantOtuLabels.size(); + + m->mothurOut("\nNumber of significantly discriminative features: " + toString(numSigAfterWilcox) + wilcoxString + ".\n"); + + map sigOTUSLDA; + if (numSigAfterWilcox > 0) { + sigOTUSLDA = testLDA(lookup, significantOtuLabels, class2GroupIndex, subClass2GroupIndex); + m->mothurOut("Number of discriminative features with abs LDA score > " + toString(ldaThreshold) + " : " + toString(significantOtuLabels.size()) + ".\n"); + } + else { m->mothurOut("No features with significant differences between the classes.\n"); } + + printResults(means, significantOtuLabels, sigOTUSLDA, lookup[0]->getLabel(), classes); return 0; } @@ -319,20 +451,30 @@ int LefseCommand::process(vector& lookup, DesignMap& design } } //********************************************************************************************************************** - -vector LefseCommand::runKruskalWallis(vector& lookup, DesignMap& designMap) { +int LefseCommand::normalize(vector& lookup) { try { - map variables; - variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile)); - variables["[distance]"] = lookup[0]->getLabel(); - string outputFileName = getOutputFileName("kruskall-wallis",variables); + vector mul; + for (int i = 0; i < lookup.size(); i++) { + double sum = 0.0; + for (int j = 0; j < lookup[i]->getNumBins(); j++) { sum += lookup[i]->getAbundance(j); } + mul.push_back(1000000.0/sum); + } - ofstream out; - m->openOutputFile(outputFileName, out); - outputNames.push_back(outputFileName); outputTypes["kruskall-wallis"].push_back(outputFileName); - out << "OTULabel\tKW\tPvalue\n"; + for (int i = 0; i < lookup.size(); i++) { + for (int j = 0; j < lookup[i]->getNumBins(); j++) { lookup[i]->set(j, lookup[i]->getAbundance(j)*mul[i], lookup[i]->getGroup()); } + } - vector significantOtuLabels; + return 0; + } + catch(exception& e) { + m->errorOut(e, "LefseCommand", "normalize"); + exit(1); + } +} +//********************************************************************************************************************** +map LefseCommand::runKruskalWallis(vector& lookup, DesignMap& designMap) { + try { + map significantOtuLabels; int numBins = lookup[0]->getNumBins(); //sanity check to make sure each treatment has a group in the shared file set treatments; @@ -357,13 +499,9 @@ vector LefseCommand::runKruskalWallis(vector& lookup, double pValue = 0.0; double H = linear.calcKruskalWallis(values, pValue); - - //output H and signifigance - out << m->currentBinLabels[i] << '\t' << H << '\t' << pValue << endl; - - if (pValue < anovaAlpha) { significantOtuLabels.push_back(i); } + + if (pValue < anovaAlpha) { significantOtuLabels[i] = pValue; } } - out.close(); return significantOtuLabels; } @@ -374,110 +512,691 @@ vector LefseCommand::runKruskalWallis(vector& lookup, } //********************************************************************************************************************** //assumes not neccessarily paired -vector LefseCommand::runWilcoxon(vector& lookup, DesignMap& designMap, vector bins) { +map LefseCommand::runWilcoxon(vector& lookup, DesignMap& designMap, map bins, map >& class2SubClasses, map >& subClass2GroupIndex, map subclass2Class) { try { - LinearAlgebra linear; - vector significantOtuLabels; + map significantOtuLabels; + map::iterator it; //if it exists and meets the following requirements run Wilcoxon /* 1. Subclass members all belong to same main class anything else */ - vector subclasses; - map subclass2Class; - map subclassCounts; - map > subClass2GroupIndex; //maps subclass name to vector of indexes in lookup from that subclass. old -> 1,2,3 means groups in location 1,2,3 of lookup are from old. Saves time below. - bool error = false; - for (int j = 0; j < lookup.size(); j++) { - string group = lookup[j]->getGroup(); - string treatment = designMap.get(group, mclass); //get value for this group in this category - string thisSub = designMap.get(group, subclass); - map::iterator it = subclass2Class.find(thisSub); - if (it == subclass2Class.end()) { - subclass2Class[thisSub] = treatment; - subclassCounts[thisSub] = 1; - vector temp; temp.push_back(j); - subClass2GroupIndex[thisSub] = temp; + + int numBins = lookup[0]->getNumBins(); + for (int i = 0; i < numBins; i++) { + if (m->control_pressed) { break; } + + it = bins.find(i); + if (it != bins.end()) { //flagged in Kruskal Wallis + + vector abunds; for (int j = 0; j < lookup.size(); j++) { abunds.push_back(lookup[j]->getAbundance(i)); } + + bool sig = testOTUWilcoxon(class2SubClasses, abunds, subClass2GroupIndex, subclass2Class); + if (sig) { significantOtuLabels[i] = it->second; } + + }//bins flagged from kw + }//for bins + + return significantOtuLabels; + } + catch(exception& e) { + m->errorOut(e, "LefseCommand", "runWilcoxon"); + exit(1); + } +} +//********************************************************************************************************************** +//lefse.py - test_rep_wilcoxon_r function +bool LefseCommand::testOTUWilcoxon(map >& class2SubClasses, vector abunds, map >& subClass2GroupIndex, map subclass2Class) { + try { + int totalOk = 0; + double alphaMtc = wilcoxonAlpha; + vector< set > allDiffs; + LinearAlgebra linear; + + //for each subclass comparision + map >::iterator itB; + for(map >::iterator it=class2SubClasses.begin();it!=class2SubClasses.end();it++){ + itB = it;itB++; + for(itB;itB!=class2SubClasses.end();itB++){ + if (m->control_pressed) { return false; } + bool first = true; + int dirCmp = 0; // not set?? dir_cmp = "not_set" # 0=notset or none, 1=true, 2=false. + int curv_sign = 0; + int ok = 0; + int count = 0; + for (set::iterator itClass1 = (it->second).begin(); itClass1 != (it->second).end(); itClass1++) { + bool br = false; + for (set::iterator itClass2 = (itB->second).begin(); itClass2 != (itB->second).end(); itClass2++) { + string subclass1 = *itClass1; + string subclass2 = *itClass2; + count++; + + if (m->debug) { m->mothurOut( "[DEBUG comparing " + it->first + "-" + *itClass1 + " to " + itB->first + "-" + *itClass2 + "\n"); } + + string treatment1 = subclass2Class[subclass1]; + string treatment2 = subclass2Class[subclass2]; + int numSubs1 = class2SubClasses[treatment1].size(); + int numSubs2 = class2SubClasses[treatment2].size(); + + //if mul_cor != 0: alpha_mtc = th*l_subcl1*l_subcl2 if mul_cor == 2 else 1.0-math.pow(1.0-th,l_subcl1*l_subcl2) + if (strict != 0) { alphaMtc = wilcoxonAlpha * numSubs1 * numSubs2 ; } + if (strict == 2) {}else{ alphaMtc = 1.0-pow((1.0-wilcoxonAlpha),(double)(numSubs1 * numSubs2)); } + + //fill x and y with this comparisons data + vector x; vector y; + + //fill x and y + vector xIndexes = subClass2GroupIndex[subclass1]; //indexes in lookup for this subclass + vector yIndexes = subClass2GroupIndex[subclass2]; //indexes in lookup for this subclass + for (int k = 0; k < yIndexes.size(); k++) { y.push_back(abunds[yIndexes[k]]); } + for (int k = 0; k < xIndexes.size(); k++) { x.push_back(abunds[xIndexes[k]]); } + + // med_comp = False + //if len(cl1) < min_c or len(cl2) < min_c: + //med_comp = True + bool medComp = false; // are there enough samples per subclass + if ((xIndexes.size() < minC) || (yIndexes.size() < minC)) { medComp = true; } + + double sx = m->median(x); + double sy = m->median(y); + + //if cl1[0] == cl2[0] and len(set(cl1)) == 1 and len(set(cl2)) == 1: + //tres, first = False, False + double pValue = 0.0; + double H = 0.0; + bool tres = true; //don't think this is set in the python source. Not sure how that is handled, but setting it here. + if ((x[0] == y[0]) && (x.size() == 1) && (y.size() == 1)) { tres = false; first = false; } + else if (!medComp) { + H = linear.calcWilcoxon(x, y, pValue); + if (pValue < (alphaMtc*2.0)) { tres = true; } + else { tres = false; } + } + /*if first: + first = False + if not curv and ( med_comp or tres ): + dir_cmp = sx < sy + if sx == sy: br = True + elif curv: + dir_cmp = None + if med_comp or tres: + curv_sign += 1 + dir_cmp = sx < sy + else: br = True + elif not curv and med_comp: + if ((sx < sy) != dir_cmp or sx == sy): br = True + elif curv: + if tres and dir_cmp == None: + curv_sign += 1 + dir_cmp = sx < sy + if tres and dir_cmp != (sx < sy): + br = True + curv_sign = -1 + elif not tres or (sx < sy) != dir_cmp or sx == sy: br = True + */ + int sxSy = 2; //false + if (sx 0) { diff = true; } } //if curv: diff = curv_sign > 0 + else { //else: diff = (ok == len(cl_hie[pair[1]])*len(cl_hie[pair[0]])) + diff = false; + if (ok == count) { diff = true; } + } + if (diff) { totalOk++; } + if (!diff && (multiClassStrat == "onevone")) { return false; } + if (diff && (multiClassStrat == "onevall")) { //all_diff.append(pair) + set pair; pair.insert(it->first); pair.insert(itB->first); + allDiffs.push_back(pair); + } + }//classes + }//classes + + if (multiClassStrat == "onevall") { + int tot_k = class2SubClasses.size(); + for(map >::iterator it=class2SubClasses.begin();it!=class2SubClasses.end();it++){ + if (m->control_pressed) { return false; } + int nk = 0; + //is this class okay in all comparisons + for (int h = 0; h < allDiffs.size(); h++) { + if (allDiffs[h].count(it->first) != 0) { nk++; } + } + if (nk == (tot_k-1)) { return true; }//if nk == tot_k-1: return True } - else { - subclassCounts[thisSub]++; - subClass2GroupIndex[thisSub].push_back(j); - if (it->second != treatment) { - error = true; - m->mothurOut("[ERROR]: subclass " + thisSub + " has members in " + it->second + " and " + treatment + ". Subclass members must be from the same class. Ignoring wilcoxon.\n"); + return false; + } + + return true; + } + catch(exception& e) { + m->errorOut(e, "LefseCommand", "testOTUWilcoxon"); + exit(1); + } +} +//********************************************************************************************************************** +//modelled after lefse.py test_lda_r function +map LefseCommand::testLDA(vector& lookup, map bins, map >& class2GroupIndex, map >& subClass2GroupIndex) { + try { + map sigOTUS; + map::iterator it; + LinearAlgebra linear; + + int numBins = lookup[0]->getNumBins(); + vector< vector > adjustedLookup; + + for (int i = 0; i < numBins; i++) { + if (m->control_pressed) { break; } + + it = bins.find(i); + if (it != bins.end()) { //flagged in Kruskal Wallis and Wilcoxon(if we ran it) + + //fill x with this OTUs abundances + vector x; + for (int j = 0; j < lookup.size(); j++) { x.push_back(lookup[j]->getAbundance(i)); } + + //go through classes + for (map >::iterator it = class2GroupIndex.begin(); it != class2GroupIndex.end(); it++) { + //max(float(feats['class'].count(c))*0.5,4) + //max(numGroups in this class*0.5, 4.0) + double necessaryNum = ((double)((it->second).size())*0.5); + if (4.0 > necessaryNum) { necessaryNum = 4.0; } + + set uniques; + for (int j = 0; j < (it->second).size(); j++) { uniques.insert(x[(it->second)[j]]); } + + //if len(set([float(v[1]) for v in ff if v[0] == c])) > max(float(feats['class'].count(c))*0.5,4): continue + if ((double)(uniques.size()) > necessaryNum) { } + else { + //feats[k][i] = math.fabs(feats[k][i] + lrand.normalvariate(0.0,max(feats[k][i]*0.05,0.01))) + for (int j = 0; j < (it->second).size(); j++) { //(it->second) contains indexes of abundance for this class + double sigma = max((x[(it->second)[j]]*0.05), 0.01); + x[(it->second)[j]] = abs(x[(it->second)[j]] + linear.normalvariate(0.0, sigma)); + } + } } + adjustedLookup.push_back(x); + } + } + + //go through classes + int minCl = 1e6; + map indexToClass; + vector classes; + for (map >::iterator it = class2GroupIndex.begin(); it != class2GroupIndex.end(); it++) { + //class with minimum number of groups + if ((it->second).size() < minCl) { minCl = (it->second).size(); } + for (int i = 0; i < (it->second).size(); i++) { indexToClass[(it->second)[i]] = it->first; } + classes.push_back(it->first); + } + + int numGroups = lookup.size(); //lfk + int fractionNumGroups = numGroups * fBoots; //rfk + minCl = (int)((float)(minCl*fBoots*fBoots*0.05)); + minCl = max(minCl, 1); + + vector< vector< vector > > results;//[iters][numComparison][numOTUs] + for (int j = 0; j < iters; j++) { + if (m->control_pressed) { return sigOTUS; } + + //find "good" random vector + vector rand_s; + for (int h = 0; h < 1000; h++) { //generate a vector of length fractionNumGroups with range 0 to numGroups-1 + rand_s.clear(); + for (int k = 0; k < fractionNumGroups; k++) { rand_s.push_back(m->getRandomIndex(numGroups-1)); } + if (!contastWithinClassesOrFewPerClass(adjustedLookup, rand_s, minCl, class2GroupIndex, indexToClass)) { h+=1000; } //break out of loop } + if (m->control_pressed) { return sigOTUS; } + + //print data in R input format for testing + if (false) { + vector groups; for (int h = 0; h < rand_s.size(); h++) { groups.push_back(lookup[rand_s[h]]->getGroup()); } + printToCoutForRTesting(adjustedLookup, rand_s, class2GroupIndex, bins, subClass2GroupIndex, groups); + } + + //for each pair of classes + vector< vector > temp = lda(adjustedLookup, rand_s, indexToClass, classes); //[numComparison][numOTUs] + if (temp.size() != 0) { results.push_back(temp); } } - if (error) { return significantOtuLabels; } + if (m->control_pressed) { return sigOTUS; } + //m = max([numpy.mean([means[k][kk][p] for kk in range(boots)]) for p in range(len(pairs))]) + int k = 0; + for (it = bins.begin(); it != bins.end(); it++) { //[numOTUs] - need to go through bins so we can tie adjustedLookup back to the binNumber. adjustedLookup[0] ->bins entry[0]. + vector averageForEachComparison; averageForEachComparison.resize(results[0].size(), 0.0); + double maxM = 0.0; //max of averages for each comparison + for (int j = 0; j < results[0].size(); j++) { //numComparisons + for (int i = 0; i < results.size(); i++) { //iters + averageForEachComparison[j]+= results[i][j][k]; + } + averageForEachComparison[j] /= (double) results.size(); + if (averageForEachComparison[j] > maxM) { maxM = averageForEachComparison[j]; } + } + //res[k] = math.copysign(1.0,m)*math.log(1.0+math.fabs(m),10) + double multiple = 1.0; if (maxM < 0.0) { multiple = -1.0; } + double resK = multiple * log10(1.0+abs(maxM)); + if (resK > ldaThreshold) { sigOTUS[it->first] = resK; } + k++; + } + + return sigOTUS; + } + catch(exception& e) { + m->errorOut(e, "LefseCommand", "testLDA"); + exit(1); + } +} +//********************************************************************************************************************** +vector< vector > LefseCommand::getMeans(vector& lookup, map >& class2GroupIndex) { + try { int numBins = lookup[0]->getNumBins(); - vector comp; - //find comparisons and fill comp - map::iterator itB; - for(map::iterator it=subclassCounts.begin();it!=subclassCounts.end();it++){ - itB = it;itB++; - for(itB;itB!=subclassCounts.end();itB++){ - compGroup temp(it->first,itB->first); - comp.push_back(temp); - } + int numClasses = class2GroupIndex.size(); + vector< vector > means; //[numOTUS][classes] + means.resize(numBins); + for (int i = 0; i < means.size(); i++) { means[i].resize(numClasses, 0.0); } + + map indexToClass; + int count = 0; + //shortcut for vectors below + map quickIndex; + vector classCounts; + for (map >::iterator it = class2GroupIndex.begin(); it != class2GroupIndex.end(); it++) { + for (int i = 0; i < (it->second).size(); i++) { indexToClass[(it->second)[i]] = it->first; } + quickIndex[it->first] = count; count++; + classCounts.push_back((it->second).size()); + } + + for (int i = 0; i < numBins; i++) { + for (int j = 0; j < lookup.size(); j++) { + if (m->control_pressed) { return means; } + means[i][quickIndex[indexToClass[j]]] += lookup[j]->getAbundance(i); + } + } + + for (int i = 0; i < numBins; i++) { + for (int j = 0; j < numClasses; j++) { means[i][j] /= (double) classCounts[j]; } + } + + return means; + } + catch(exception& e) { + m->errorOut(e, "LefseCommand", "getMeans"); + exit(1); + } +} +//********************************************************************************************************************** +vector< vector > LefseCommand::lda(vector< vector >& adjustedLookup, vector rand_s, map& indexToClass, vector classes) { + try { + //shortcut for vectors below + map quickIndex; + for (int i = 0; i < classes.size(); i++) { quickIndex[classes[i]] = i; } + + vector randClass; //classes for rand sample + vector counts; counts.resize(classes.size(), 0); + for (int i = 0; i < rand_s.size(); i++) { + string thisClass = indexToClass[rand_s[i]]; + randClass.push_back(thisClass); + counts[quickIndex[thisClass]]++; } - int numComp = comp.size(); - if (numComp < 2) { m->mothurOut("[ERROR]: Need at least 2 subclasses, Ignoring Wilcoxon.\n"); - return significantOtuLabels; } + vector< vector > a; //[numOTUs][numSampled] + for (int i = 0; i < adjustedLookup.size(); i++) { + vector temp; + for (int j = 0; j < rand_s.size(); j++) { + temp.push_back(adjustedLookup[i][rand_s[j]]); + } + a.push_back(temp); + } + + LinearAlgebra linear; + vector< vector > means; bool ignore; + vector< vector > scaling = linear.lda(a, randClass, means, ignore); //means are returned sorted, quickIndex sorts as well since it uses a map. means[class][otu] = + if (ignore) { scaling.clear(); return scaling; } + if (m->control_pressed) { return scaling; } + + vector< vector > w; w.resize(a.size()); //w.unit <- w/sqrt(sum(w^2)) + double denom = 0.0; + for (int i = 0; i < scaling.size(); i++) { w[i].push_back(scaling[i][0]); denom += (w[i][0]*w[i][0]); } + denom = sqrt(denom); + for (int i = 0; i < w.size(); i++) { w[i][0] /= denom; } //[numOTUs][1] - w.unit + + //robjects.r('LD <- xy.matrix%*%w.unit') [numSampled][numOtus] * [numOTUs][1] + vector< vector > LD = linear.matrix_mult(linear.transpose(a), w); + //find means for each groups LDs + vector LDMeans; LDMeans.resize(classes.size(), 0.0); //means[0] -> average for [group0]. + for (int i = 0; i < LD.size(); i++) { LDMeans[quickIndex[randClass[i]]] += LD[i][0]; } + for (int i = 0; i < LDMeans.size(); i++) { LDMeans[i] /= (double) counts[i]; } + + //calculate for each comparisons i.e. with groups A,B,C = AB, AC, BC = 3; + vector< vector > results;// [numComparison][numOTUs] + for (int i = 0; i < LDMeans.size(); i++) { + for (int l = 0; l < i; l++) { + if (m->control_pressed) { return scaling; } + //robjects.r('effect.size <- abs(mean(LD[sub_d[,"class"]=="'+p[0]+'"]) - mean(LD[sub_d[,"class"]=="'+p[1]+'"]))') + double effectSize = abs(LDMeans[i] - LDMeans[l]); + //scal = robjects.r('wfinal <- w.unit * effect.size') + vector compResults; + for (int j = 0; j < w.size(); j++) { //[numOTUs][1] + //coeff = [abs(float(v)) if not math.isnan(float(v)) else 0.0 for v in scal] + double coeff = abs(w[j][0]*effectSize); if (isnan(coeff) || isinf(coeff)) { coeff = 0.0; } + //gm = abs(res[p[0]][j] - res[p[1]][j]) - res is the means for each group for each otu + double gm = abs(means[i][j] - means[l][j]); + //means[k][i].append((gm+coeff[j])*0.5) + compResults.push_back((gm+coeff)*0.5); + } + results.push_back(compResults); + } + } + + return results; + } + catch(exception& e) { + m->errorOut(e, "LefseCommand", "lda"); + exit(1); + } +} + +//********************************************************************************************************************** +//modelled after lefse.py contast_within_classes_or_few_per_class function +bool LefseCommand::contastWithinClassesOrFewPerClass(vector< vector >& lookup, vector rands, int minCl, map > class2GroupIndex, map indexToClass) { + try { + set cls; + int countFound = 0; + + for (int i = 0; i < rands.size(); i++) { //fill cls with the classes represented in the random selection + for (map >::iterator it = class2GroupIndex.begin(); it != class2GroupIndex.end(); it++) { + if (m->inUsersGroups(rands[i], (it->second))) { + cls.insert(it->first); + countFound++; + } + } + } + + //sanity check + if (rands.size() != countFound) { m->mothurOut("oops, should never get here, missing something.\n"); } + + if (cls.size() < class2GroupIndex.size()) { return true; } //some classes are not present in sampling + + for (set::iterator it = cls.begin(); it != cls.end(); it++) { + if (cls.count(*it) < minCl) { return true; } //this sampling has class count below minimum + } + + //for this otu + int numBins = lookup.size(); + for (int i = 0; i < numBins; i++) { + if (m->control_pressed) { break; } + + //break up random sampling by class + map > class2Values; //maps class name -> set of abunds present in random sampling. F003Early -> 0.001, 0.003... + for (int j = 0; j < rands.size(); j++) { + class2Values[indexToClass[rands[j]]].insert(lookup[i][rands[j]]); + //rands[j] = index of randomly selected group in lookup, randIndex2Class[rands[j]] = class this group belongs to. lookup[rands[j]]->getAbundance(i) = abundance of this group for this OTU. + } + //are the unique values less than we want + //if (len(set(col)) <= min_cl and min_cl > 1) or (min_cl == 1 and len(set(col)) <= 1): + for (map >::iterator it = class2Values.begin(); it != class2Values.end(); it++) { + if (((it->second).size() <= minCl && minCl > 1) || (minCl == 1 && (it->second).size() <= 1)) { return true; } + } + } + + return false; + } + catch(exception& e) { + m->errorOut(e, "LefseCommand", "contastWithinClassesOrFewPerClass"); + exit(1); + } +} +//********************************************************************************************************************** +int LefseCommand::printResults(vector< vector > means, map sigKW, map sigLDA, string label, vector classes) { + try { map variables; variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile)); - variables["[distance]"] = lookup[0]->getLabel(); - string outputFileName = getOutputFileName("wilcoxon",variables); + variables["[distance]"] = label; + string outputFileName = getOutputFileName("summary",variables); + ofstream out; + m->openOutputFile(outputFileName, out); + outputNames.push_back(outputFileName); outputTypes["summary"].push_back(outputFileName); - ofstream out; - m->openOutputFile(outputFileName, out); - outputNames.push_back(outputFileName); outputTypes["wilcoxon"].push_back(outputFileName); - out << "OTULabel\tComparision\tWilcoxon\tPvalue\n"; + //output headers + out << "OTU\tLogMaxMean\tClass\tLDA\tpValue\n"; - for (int i = 0; i < numBins; i++) { + string temp = ""; + for (int i = 0; i < means.size(); i++) { //[numOTUs][classes] + //find max mean of classes + double maxMean = -1.0; string maxClass = "none"; + for (int j = 0; j < means[i].size(); j++) { if (means[i][j] > maxMean) { maxMean = means[i][j]; maxClass = classes[j]; } } + + //str(math.log(max(max(v),1.0),10.0)) + double logMaxMean = 1.0; + if (maxMean > logMaxMean) { logMaxMean = maxMean; } + logMaxMean = log10(logMaxMean); + + out << m->currentBinLabels[i] << '\t' << logMaxMean << '\t'; + if (m->debug) { temp = m->currentBinLabels[i] + '\t' + toString(logMaxMean) + '\t'; } + + map::iterator it = sigLDA.find(i); + if (it != sigLDA.end()) { + out << maxClass << '\t' << it->second << '\t' << sigKW[i] << endl; //sigLDA is a subset of sigKW so no need to look + if (m->debug) { temp += maxClass + '\t' + toString(it->second) + '\t' + toString(sigKW[i]) + '\n'; m->mothurOut(temp); temp = ""; } + }else { out << '-' << endl; } + } + + out.close(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "LefseCommand", "printResults"); + exit(1); + } +} +//********************************************************************************************************************** +//printToCoutForRTesting(adjustedLookup, rand_s, class2GroupIndex, numBins); +bool LefseCommand::printToCoutForRTesting(vector< vector >& adjustedLookup, vector rand_s, map >& class2GroupIndex, map bins, map >& subClass2GroupIndex, vector groups) { + try { + cout << "rand_s = "; + for (int h = 0; h < rand_s.size(); h++) { cout << rand_s[h] << '\t'; } cout << endl; + + //print otu data + int count = 0; + for (map::iterator it = bins.begin(); it != bins.end(); it++) { + if (m->control_pressed) { break; } + + cout << m->currentBinLabels[it->first] << " <- c("; + for (int h = 0; h < rand_s.size()-1; h++) { cout << (adjustedLookup[count][rand_s[h]]) << ", "; } + cout << (adjustedLookup[count][rand_s[rand_s.size()-1]]) << ")\n"; + count++; + } + /* + string tempOutput = ""; + for (int h = 0; h < rand_s.size(); h++) { + //find class this index is in + for (map >::iterator it = class2GroupIndex.begin(); it!= class2GroupIndex.end(); it++) { + if (m->inUsersGroups(rand_s[h], (it->second)) ) { cout << (h+1) << " <- c(\"" +it->first + "\")\n" ; } + } + }*/ + + + string tempOutput = "treatments <- c("; + for (int h = 0; h < rand_s.size(); h++) { + //find class this index is in + for (map >::iterator it = class2GroupIndex.begin(); it!= class2GroupIndex.end(); it++) { + if (m->inUsersGroups(rand_s[h], (it->second)) ) { tempOutput += "\"" +it->first + "\"" + ","; } //"\"" +it->first + "\"" + } + } + tempOutput = tempOutput.substr(0, tempOutput.length()-1); + tempOutput += ")\n"; + cout << tempOutput; + + /* + if (subclass != "") { + string tempOutput = "sub <- c("; + for (int h = 0; h < rand_s.size(); h++) { + //find class this index is in + for (map >::iterator it = subClass2GroupIndex.begin(); it!= subClass2GroupIndex.end(); it++) { + if (m->inUsersGroups(rand_s[h], (it->second)) ) { tempOutput += "\"" +it->first + "\"" + ','; } + } + } + tempOutput = tempOutput.substr(0, tempOutput.length()-1); + tempOutput += ")\n"; + cout << tempOutput; + } + + if (subject) { + string tempOutput = "group <- c("; + for (int h = 0; h < groups.size(); h++) { + tempOutput += "\"" +groups[h] + "\"" + ','; + } + tempOutput = tempOutput.substr(0, tempOutput.length()-1); + tempOutput += ")\n"; + cout << tempOutput; + }*/ + + + //print data frame + tempOutput = "dat <- data.frame("; + for (map::iterator it = bins.begin(); it != bins.end(); it++) { if (m->control_pressed) { break; } - if (m->inUsersGroups(i, bins)) { //flagged in Kruskal Wallis + tempOutput += "\"" + m->currentBinLabels[it->first] + "\"=" + m->currentBinLabels[it->first] + ","; + } + //tempOutput = tempOutput.substr(0, tempOutput.length()-1); + tempOutput += " class=treatments"; + //if (subclass != "") { tempOutput += ", subclass=sub"; } + //if (subject) { tempOutput += ", subject=group"; } + tempOutput += ")\n"; + cout << tempOutput; - bool sig = false; - //for each subclass comparision - for (int j = 0; j < numComp; j++) { - //fill x and y with this comparisons data - vector x; vector y; - - cout << m->currentBinLabels[i] << '\t' << comp[j].getCombo() << " x <- ("; - //fill x and y - vector xIndexes = subClass2GroupIndex[comp[j].group1]; //indexes in lookup for this subclass - for (int k = 0; k < xIndexes.size(); k++) { x.push_back(lookup[xIndexes[k]]->getAbundance(i)); cout << lookup[xIndexes[k]]->getAbundance(i) << ", "; } - cout << ")\n"; - - cout << m->currentBinLabels[i] << '\t' << comp[j].getCombo() << " y <- ("; - - vector yIndexes = subClass2GroupIndex[comp[j].group2]; //indexes in lookup for this subclass - for (int k = 0; k < yIndexes.size(); k++) { y.push_back(lookup[yIndexes[k]]->getAbundance(i)); cout << lookup[yIndexes[k]]->getAbundance(i) << ", ";} - cout << ")\n"; - - double pValue = 0.0; - double H = linear.calcWilcoxon(x, y, pValue); + tempOutput = "z <- suppressWarnings(mylda(as.formula(class ~ "; + for (map::iterator it = bins.begin(); it != bins.end(); it++) { + if (m->control_pressed) { break; } - //output H and signifigance - if (!isnan(pValue)) { out << m->currentBinLabels[i] << '\t' << comp[j].getCombo() << '\t' << H << '\t' << pValue << endl; } - else { out << m->currentBinLabels[i] << '\t' << comp[j].getCombo() << '\t' << H << '\t' << "NA" << endl; } - - //set sig - not sure how yet - } - if (sig) { significantOtuLabels.push_back(i); } + tempOutput += m->currentBinLabels[it->first] + "+"; + } + tempOutput = tempOutput.substr(0, tempOutput.length()-1); //rip off extra plus sign + tempOutput += "), data = dat, tol = 1e-10))"; + cout << tempOutput + "\nz\n"; + cout << "w <- z$scaling[,1]\n"; //robjects.r('w <- z$scaling[,1]') + cout << "w.unit <- w/sqrt(sum(w^2))\n"; //robjects.r('w.unit <- w/sqrt(sum(w^2))') + cout << "ss <- dat[,-match(\"class\",colnames(dat))]\n"; //robjects.r('ss <- sub_d[,-match("class",colnames(sub_d))]') + //if (subclass != "") { cout << "ss <- ss[,-match(\"subclass\",colnames(ss))]\n"; }//robjects.r('ss <- ss[,-match("subclass",colnames(ss))]') + //if (subject) { cout << "ss <- ss[,-match(\"subject\",colnames(ss))]\n"; }//robjects.r('ss <- ss[,-match("subject",colnames(ss))]') + cout << "xy.matrix <- as.matrix(ss)\n"; //robjects.r('xy.matrix <- as.matrix(ss)') + cout << "LD <- xy.matrix%*%w.unit\n"; //robjects.r('LD <- xy.matrix%*%w.unit') + cout << "effect.size <- abs(mean(LD[dat[,\"class\"]==\"'+p[0]+'\"]) - mean(LD[dat[,\"class\"]==\"'+p[1]+'\"]))\n"; //robjects.r('effect.size <- abs(mean(LD[sub_d[,"class"]=="'+p[0]+'"]) - mean(LD[sub_d[,"class"]=="'+p[1]+'"]))') + cout << "wfinal <- w.unit * effect.size\n"; //scal = robjects.r('wfinal <- w.unit * effect.size') + cout << "mm <- z$means\n"; //rres = robjects.r('mm <- z$means') + + return true; + } + catch(exception& e) { + m->errorOut(e, "LefseCommand", "printToCoutForRTesting"); + exit(1); + } +} +//********************************************************************************************************************** +int LefseCommand::makeShared(int numDesignLines) { + try { + ifstream in; + m->openInputFile(sharedfile, in); + vector< vector > lines; + for(int i = 0; i < numDesignLines; i++) { + if (m->control_pressed) { return 0; } + + string line = m->getline(in); + cout << line << endl; + vector pieces = m->splitWhiteSpace(line); + lines.push_back(pieces); + } + + ofstream out; + m->openOutputFile(sharedfile+".design", out); out << "group" << '\t'; + for (int j = 0; j < lines.size(); j++) { out << lines[j][0] << '\t'; } out << endl; + for (int j = 1; j < lines[0].size(); j++) { + out <<(j-1) << '\t'; + for (int i = 0; i < lines.size(); i++) { + out << lines[i][j] << '\t'; } + out << endl; } out.close(); + DesignMap design(sharedfile+".design"); - return significantOtuLabels; + vector lookup; + for (int k = 0; k < lines[0].size()-1; k++) { + SharedRAbundFloatVector* temp = new SharedRAbundFloatVector(); + temp->setLabel("0.03"); + temp->setGroup(toString(k)); + lookup.push_back(temp); + } + + m->currentBinLabels.clear(); + int count = 0; + while (!in.eof()) { + if (m->control_pressed) { return 0; } + + string line = m->getline(in); + vector pieces = m->splitWhiteSpace(line); + + float sum = 0.0; + for (int i = 1; i < pieces.size(); i++) { + float value; m->mothurConvert(pieces[i], value); + sum += value; + } + + if (sum != 0.0) { + //cout << count << '\t'; + for (int i = 1; i < pieces.size(); i++) { + float value; m->mothurConvert(pieces[i], value); + lookup[i-1]->push_back(value, toString(i-1)); + //cout << pieces[i] << '\t'; + } + m->currentBinLabels.push_back(toString(count)); + //m->currentBinLabels.push_back(pieces[0]); + //cout << line<< endl; + //cout << endl; + } + count++; + } + in.close(); + + for (int k = 0; k < lookup.size(); k++) { + //cout << "0.03" << '\t' << toString(k) << endl; lookup[k]->print(cout); + } + + process(lookup, design); + + return 0; } catch(exception& e) { - m->errorOut(e, "LefseCommand", "runWilcoxon"); + m->errorOut(e, "LefseCommand", "printToCoutForRTesting"); exit(1); } } diff --git a/lefsecommand.h b/lefsecommand.h index c9c4092..1c5ac6f 100644 --- a/lefsecommand.h +++ b/lefsecommand.h @@ -48,16 +48,27 @@ public: void help() { m->mothurOut(getHelpString()); } private: - bool abort, allLines; - string outputDir, sharedfile, designfile, mclass, subclass, classes; + bool abort, allLines, wilc, wilcsamename, curv, subject, normMillion; + string outputDir, sharedfile, designfile, mclass, subclass, classes, rankTec, multiClassStrat; vector outputNames; set labels; - float anovaAlpha, wilcoxonAlpha; + double anovaAlpha, wilcoxonAlpha, fBoots, ldaThreshold; + int nlogs, iters, strict, minC; - int process(vector&, DesignMap&); - vector runKruskalWallis(vector&, DesignMap&); - vector runWilcoxon(vector&, DesignMap&, vector); - + int process(vector&, DesignMap&); + int normalize(vector&); + map runKruskalWallis(vector&, DesignMap&); + map runWilcoxon(vector&, DesignMap&, map, map >& class2SubClasses, map >& subClass2GroupIndex, map); + bool testOTUWilcoxon(map >& class2SubClasses, vector abunds, map >& subClass2GroupIndex, map); + map testLDA(vector&, map, map >& class2GroupIndex, map >&); + bool contastWithinClassesOrFewPerClass(vector< vector >&, vector rands, int minCl, map > class2GroupIndex, map indexToClass); + vector< vector > lda(vector< vector >& adjustedLookup, vector rand_s, map& indexToClass, vector); + vector< vector > getMeans(vector& lookup, map >& class2GroupIndex); + int printResults(vector< vector >, map, map, string, vector); + + //for testing + bool printToCoutForRTesting(vector< vector >& adjustedLookup, vector rand_s, map >& class2GroupIndex, map bins, map >&, vector); + int makeShared(int); }; /**************************************************************************************************/ diff --git a/linearalgebra.cpp b/linearalgebra.cpp index cd2ca03..fdd95ec 100644 --- a/linearalgebra.cpp +++ b/linearalgebra.cpp @@ -253,7 +253,7 @@ double LinearAlgebra::betacf(const double a, const double b, const double x) { } } /*********************************************************************************************************************************/ - +//[3][4] * [4][5] - columns in first must match rows in second, returns matrix[3][5] vector > LinearAlgebra::matrix_mult(vector > first, vector > second){ try { vector > product; @@ -287,7 +287,23 @@ vector > LinearAlgebra::matrix_mult(vector > first } } +/*********************************************************************************************************************************/ +vector > LinearAlgebra::transpose(vector >matrix){ + try { + vector > trans; trans.resize(matrix[0].size()); + for (int i = 0; i < trans.size(); i++) { + for (int j = 0; j < matrix.size(); j++) { trans[i].push_back(matrix[j][i]); } + } + + return trans; + } + catch(exception& e) { + m->errorOut(e, "LinearAlgebra", "transpose"); + exit(1); + } + +} /*********************************************************************************************************************************/ void LinearAlgebra::recenter(double offset, vector > D, vector >& G){ @@ -1304,6 +1320,31 @@ double LinearAlgebra::calcKruskalWallis(vector& values, double& pV } } /*********************************************************************************************************************************/ +//python random.normalvariate - thanks http://madssj.com/blog/2008/05/07/porting-normalvariate-from-python-to-c/ +double LinearAlgebra::normalvariate(double mu, double sigma) { + try { + double NV_MAGICCONST = 1.7155277699214135; /* (4 * exp(-0.5) / sqrt(2.0)); */ + unsigned long int MAX_RANDOM = 2147483647; /* (2 ** 31) - 1; */ + + double u1, u2, z, zz; + for (;;) { + if (m->control_pressed) { break; } + u1 = ((float)random()) / MAX_RANDOM; + u2 = 1.0 - (((float)random()) / MAX_RANDOM); + z = NV_MAGICCONST * (u1 - 0.5) / u2; + zz = z * z / 4.0; + if (zz <= -(log(u2))) { + break; + } + } + return mu + z * sigma; + } + catch(exception& e) { + m->errorOut(e, "LinearAlgebra", "normalvariate"); + exit(1); + } +} +/*********************************************************************************************************************************/ //thanks http://www.johndcook.com/cpp_phi.html double LinearAlgebra::pnorm(double x){ try { @@ -2014,3 +2055,565 @@ vector > LinearAlgebra::getInverse(vector > matrix exit(1); } } +/*********************************************************************************************************************************/ +//modelled R lda function - MASS:::lda.default +vector< vector > LinearAlgebra::lda(vector< vector >& a, vector groups, vector< vector >& means, bool& ignore) { + try { + + set uniqueGroups; + for (int i = 0; i < groups.size(); i++) { uniqueGroups.insert(groups[i]); } + int numGroups = uniqueGroups.size(); + + map quickIndex; //className to index. hoping to save counts, proportions and means in vectors to save time. This map will allow us to know index 0 in counts refers to group1. + int count = 0; + for (set::iterator it = uniqueGroups.begin(); it != uniqueGroups.end(); it++) { quickIndex[*it] = count; count++; } + + int numSampled = groups.size(); //number of sampled groups + int numOtus = a.size(); //number of flagged bins + + //counts <- as.vector(table(g)) //number of samples from each class in random sampling + vector counts; counts.resize(numGroups, 0); + for (int i = 0; i < groups.size(); i++) { + counts[quickIndex[groups[i]]]++; + } + + vector proportions; proportions.resize(numGroups, 0.0); + for (int i = 0; i < numGroups; i++) { proportions[i] = counts[i] / (double) numSampled; } + + means.clear(); //means[0] -> means[0][0] average for [group0][OTU0]. + means.resize(numGroups); for (int i = 0; i < means.size(); i++) { means[i].resize(numOtus, 0.0); } + for (int j = 0; j < numSampled; j++) { //total for each class for each OTU + for (int i = 0; i < numOtus; i++) { means[quickIndex[groups[j]]][i] += a[i][j]; } + } + //average for each class for each OTU + for (int j = 0; j < numGroups; j++) { for (int i = 0; i < numOtus; i++) { means[j][i] /= counts[j]; } } + + //randCov <- x - group.means[g, ] + vector< vector > randCov; //randCov[0][0] -> (random sample value0 for OTU0 - average for samples group in OTU0). example OTU0, random sample 0.01 from class early. average of class early for OTU0 is 0.005. randCov[0][0] = (0.01-0.005) + for (int i = 0; i < numOtus; i++) { //for each flagged OTU + vector tempRand; + for (int j = 0; j < numSampled; j++) { tempRand.push_back(a[i][j] - means[quickIndex[groups[j]]][i]); } + randCov.push_back(tempRand); + } + + //find variance and std for each OTU + //f1 <- sqrt(diag(var(x - group.means[g, ]))) + vector stdF1; + vector ave; + for (int i = 0; i < numOtus; i++) { + stdF1.push_back(0.0); + ave.push_back(m->getAverage(randCov[i])); + } + + for (int i = 0; i < numOtus; i++) { + for (int j = 0; j < numSampled; j++) { stdF1[i] += ((randCov[i][j] - ave[i]) * (randCov[i][j] - ave[i])); } + } + + //fac <- 1/(n - ng) + double fac = 1 / (double) (numSampled-numGroups); + + for (int i = 0; i < stdF1.size(); i++) { + stdF1[i] /= (double) (numSampled-1); + stdF1[i] = sqrt(stdF1[i]); + } + + vector< vector > scaling; //[numOTUS][numOTUS] + for (int i = 0; i < numOtus; i++) { + vector temp; + for (int j = 0; j < numOtus; j++) { + if (i == j) { temp.push_back(1.0/stdF1[i]); } + else { temp.push_back(0.0); } + + } + scaling.push_back(temp); + } + /* + cout << "scaling = " << endl; + for (int i = 0; i < scaling.size(); i++) { + for (int j = 0; j < scaling[i].size(); j++) { cout << scaling[i][j] << '\t'; } + cout << endl; + }*/ + + //X <- sqrt(fac) * ((x - group.means[g, ]) %*% scaling) + vector< vector > X = randCov; //[numOTUS][numSampled] + //((x - group.means[g, ]) %*% scaling) + //matrix multiplication of randCov and scaling + LinearAlgebra linear; + X = linear.matrix_mult(scaling, randCov); //[numOTUS][numOTUS] * [numOTUS][numSampled] = [numOTUS][numSampled] + fac = sqrt(fac); + + for (int i = 0; i < X.size(); i++) { + for (int j = 0; j < X[i].size(); j++) { X[i][j] *= fac; } + } + + vector d; + vector< vector > v; + vector< vector > Xcopy; //X = [numOTUS][numSampled] + bool transpose = false; //svd requires rows < columns, so if they are not then I need to transpose and look for the results in v. + if (X.size() < X[0].size()) { Xcopy = linear.transpose(X); transpose=true; } + else { Xcopy = X; } + linear.svd(Xcopy, d, v); //Xcopy gets the results we want for v below, because R's version is [numSampled][numOTUS] + + /*cout << "Xcopy = " << endl; + for (int i = 0; i < Xcopy.size(); i++) { + for (int j = 0; j < Xcopy[i].size(); j++) { cout << Xcopy[i][j] << '\t'; } + cout << endl; + } + cout << "v = " << endl; + for (int i = 0; i < v.size(); i++) { + for (int j = 0; j < v[i].size(); j++) { cout << v[i][j] << '\t'; } + cout << endl; + } + */ + + int rank = 0; + set goodColumns; + //cout << "d = " << endl; + for (int i = 0; i < d.size(); i++) { if (d[i] > 0.0000000001) { rank++; goodColumns.insert(i); } } //cout << d[i] << endl; + + if (rank == 0) { + ignore=true; //m->mothurOut("[ERROR]: rank = 0: variables are numerically const\n"); m->control_pressed = true; + return scaling; } + + //scaling <- scaling %*% X.s$v[, 1L:rank] %*% diag(1/X.s$d[1L:rank], , rank) + //X.s$v[, 1L:rank] = columns in Xcopy that correspond to "good" d values + //diag(1/X.s$d[1L:rank], , rank) = matrix size rank * rank where the diagonal is 1/"good" dvalues + /*example: + d + [1] 3.721545e+00 3.034607e+00 2.296649e+00 7.986927e-16 6.922408e-16 + [6] 5.471102e-16 + + $v + [,1] [,2] [,3] [,4] [,5] [,6] + [1,] 0.31122175 0.10944725 0.20183340 -0.30136820 0.60786235 -0.13537095 + [2,] -0.29563726 -0.20568893 0.11233366 -0.05073289 0.48234270 0.21965978 + ... + + [1] "X.s$v[, 1L:rank]" + [,1] [,2] [,3] + [1,] 0.31122175 0.10944725 0.20183340 + [2,] -0.29563726 -0.20568893 0.11233366 + ... + [1] "1/X.s$d[1L:rank]" + [1] 0.2687056 0.3295320 0.4354170 + + [1] "diag(1/X.s$d[1L:rank], , rank)" + [,1] [,2] [,3] + [1,] 0.2687056 0.000000 0.000000 + [2,] 0.0000000 0.329532 0.000000 + [3,] 0.0000000 0.000000 0.435417 + */ + if (transpose) { + Xcopy = linear.transpose(v); + /* + cout << "Xcopy = " << endl; + for (int i = 0; i < Xcopy.size(); i++) { + for (int j = 0; j < Xcopy[i].size(); j++) { cout << Xcopy[i][j] << '\t'; } + cout << endl; + }*/ + } + v.clear(); //store "good" columns - X.s$v[, 1L:rank] + v.resize(Xcopy.size()); //[numOTUS]["good" columns] + for (set::iterator it = goodColumns.begin(); it != goodColumns.end(); it++) { + for (int i = 0; i < Xcopy.size(); i++) { + v[i].push_back(Xcopy[i][*it]); + } + } + + vector< vector > diagRanks; diagRanks.resize(rank); + for (int i = 0; i < rank; i++) { diagRanks[i].resize(rank, 0.0); } + count = 0; + for (set::iterator it = goodColumns.begin(); it != goodColumns.end(); it++) { diagRanks[count][count] = 1.0 / d[*it]; count++; } + + scaling = linear.matrix_mult(linear.matrix_mult(scaling, v), diagRanks); //([numOTUS][numOTUS]*[numOTUS]["good" columns]) = [numOTUS]["good" columns] then ([numOTUS]["good" columns] * ["good" columns]["good" columns] = scaling = [numOTUS]["good" columns] + + /*cout << "scaling = " << endl; + for (int i = 0; i < scaling.size(); i++) { + for (int j = 0; j < scaling[i].size(); j++) { cout << scaling[i][j] << '\t'; } + cout << endl; + }*/ + + //Note: linear.matrix_mult [1][numGroups] * [numGroups][numOTUs] - columns in first must match rows in second, returns matrix[1][numOTUs] + vector< vector > prior; prior.push_back(proportions); + vector< vector > xbar = linear.matrix_mult(prior, means); + vector xBar = xbar[0]; //length numOTUs + + /*cout << "xbar" << endl; + for (int j = 0; j < numOtus; j++) { cout << xBar[j] <<'\t'; }cout << endl;*/ + //fac <- 1/(ng - 1) + fac = 1 / (double) (numGroups-1); + //scale(group.means, center = xbar, scale = FALSE) %*% scaling + vector< vector > scaledMeans = means; //[numGroups][numOTUs] + for (int i = 0; i < numGroups; i++) { + for (int j = 0; j < numOtus; j++) { scaledMeans[i][j] -= xBar[j]; } + } + scaledMeans = linear.matrix_mult(scaledMeans, scaling); //[numGroups][numOTUS]*[numOTUS]["good"columns] = [numGroups]["good"columns] + + + //sqrt((n * prior) * fac) + vector temp = proportions; //[numGroups] + for (int i = 0; i < temp.size(); i++) { temp[i] *= numSampled * fac; temp[i] = sqrt(temp[i]); } + + //X <- sqrt((n * prior) * fac) * (scale(group.means, center = xbar, scale = FALSE) %*% scaling) + //X <- temp * scaledMeans + X.clear(); X = scaledMeans; //[numGroups]["good"columns] + for (int i = 0; i < X.size(); i++) { + for (int j = 0; j < X[i].size(); j++) { X[i][j] *= temp[j]; } + } + /* + cout << "X = " << endl; + for (int i = 0; i < X.size(); i++) { + for (int j = 0; j < X[i].size(); j++) { cout << X[i][j] << '\t'; } + cout << endl; + } + */ + + d.clear(); v.clear(); + //we want to transpose so results are in Xcopy, but if that makes rows > columns then we don't since svd requires rows < cols. + transpose=false; + if (X.size() > X[0].size()) { Xcopy = X; transpose=true; } + else { Xcopy = linear.transpose(X); } + linear.svd(Xcopy, d, v); //Xcopy gets the results we want for v below + /*cout << "Xcopy = " << endl; + for (int i = 0; i < Xcopy.size(); i++) { + for (int j = 0; j < Xcopy[i].size(); j++) { cout << Xcopy[i][j] << '\t'; } + cout << endl; + } + + cout << "v = " << endl; + for (int i = 0; i < v.size(); i++) { + for (int j = 0; j < v[i].size(); j++) { cout << v[i][j] << '\t'; } + cout << endl; + } + + cout << "d = " << endl; + for (int i = 0; i < d.size(); i++) { cout << d[i] << endl; }*/ + + //rank <- sum(X.s$d > tol * X.s$d[1L]) + //X.s$d[1L] = larger value in d vector + double largeD = m->max(d); + rank = 0; goodColumns.clear(); + for (int i = 0; i < d.size(); i++) { if (d[i] > (0.0000000001*largeD)) { rank++; goodColumns.insert(i); } } + + if (rank == 0) { + ignore=true;//m->mothurOut("[ERROR]: rank = 0: class means are numerically identical.\n"); m->control_pressed = true; + return scaling; } + + if (transpose) { Xcopy = linear.transpose(v); } + //scaling <- scaling %*% X.s$v[, 1L:rank] - scaling * "good" columns + v.clear(); //store "good" columns - X.s$v[, 1L:rank] + v.resize(Xcopy.size()); //Xcopy = ["good"columns][numGroups] + for (set::iterator it = goodColumns.begin(); it != goodColumns.end(); it++) { + for (int i = 0; i < Xcopy.size(); i++) { + v[i].push_back(Xcopy[i][*it]); + } + } + + scaling = linear.matrix_mult(scaling, v); //[numOTUS]["good" columns] * ["good"columns][new "good" columns] + + /*cout << "scaling = " << endl; + for (int i = 0; i < scaling.size(); i++) { + for (int j = 0; j < scaling[i].size(); j++) { cout << scaling[i][j] << '\t'; } + cout << endl; + }*/ + ignore=false; + return scaling; + } + catch(exception& e) { + m->errorOut(e, "LinearAlgebra", "lda"); + exit(1); + } +} +/*********************************************************************************************************************************/ +//Singular value decomposition (SVD) - adapted from http://svn.lirec.eu/libs/magicsquares/src/SVD.cpp +/* + * svdcomp - SVD decomposition routine. + * Takes an mxn matrix a and decomposes it into udv, where u,v are + * left and right orthogonal transformation matrices, and d is a + * diagonal matrix of singular values. + * + * This routine is adapted from svdecomp.c in XLISP-STAT 2.1 which is + * code from Numerical Recipes adapted by Luke Tierney and David Betz. + * + * Input to dsvd is as follows: + * a = mxn matrix to be decomposed, gets overwritten with u + * m = row dimension of a + * n = column dimension of a + * w = returns the vector of singular values of a + * v = returns the right orthogonal transformation matrix + */ + +int LinearAlgebra::svd(vector< vector >& a, vector& w, vector< vector >& v) { + try { + int flag, i, its, j, jj, k, l, nm; + double c, f, h, s, x, y, z; + double anorm = 0.0, g = 0.0, scale = 0.0; + + int numRows = a.size(); if (numRows == 0) { return 0; } + int numCols = a[0].size(); + w.resize(numCols, 0.0); + v.resize(numCols); for (int i = 0; i < numCols; i++) { v[i].resize(numRows, 0.0); } + + vector rv1; rv1.resize(numCols, 0.0); + if (numRows < numCols){ m->mothurOut("[ERROR]: numRows < numCols\n"); m->control_pressed = true; return 0; } + + /* Householder reduction to bidiagonal form */ + for (i = 0; i < numCols; i++) + { + /* left-hand reduction */ + l = i + 1; + rv1[i] = scale * g; + g = s = scale = 0.0; + if (i < numRows) + { + for (k = i; k < numRows; k++) + scale += fabs((double)a[k][i]); + if (scale) + { + for (k = i; k < numRows; k++) + { + a[k][i] = (double)((double)a[k][i]/scale); + s += ((double)a[k][i] * (double)a[k][i]); + } + f = (double)a[i][i]; + g = -SIGN(sqrt(s), f); + h = f * g - s; + a[i][i] = (double)(f - g); + if (i != numCols - 1) + { + for (j = l; j < numCols; j++) + { + for (s = 0.0, k = i; k < numRows; k++) + s += ((double)a[k][i] * (double)a[k][j]); + f = s / h; + for (k = i; k < numRows; k++) + a[k][j] += (double)(f * (double)a[k][i]); + } + } + for (k = i; k < numRows; k++) + a[k][i] = (double)((double)a[k][i]*scale); + } + } + w[i] = (double)(scale * g); + + /* right-hand reduction */ + g = s = scale = 0.0; + if (i < numRows && i != numCols - 1) + { + for (k = l; k < numCols; k++) + scale += fabs((double)a[i][k]); + if (scale) + { + for (k = l; k < numCols; k++) + { + a[i][k] = (double)((double)a[i][k]/scale); + s += ((double)a[i][k] * (double)a[i][k]); + } + f = (double)a[i][l]; + g = -SIGN(sqrt(s), f); + h = f * g - s; + a[i][l] = (double)(f - g); + for (k = l; k < numCols; k++) + rv1[k] = (double)a[i][k] / h; + if (i != numRows - 1) + { + for (j = l; j < numRows; j++) + { + for (s = 0.0, k = l; k < numCols; k++) + s += ((double)a[j][k] * (double)a[i][k]); + for (k = l; k < numCols; k++) + a[j][k] += (double)(s * rv1[k]); + } + } + for (k = l; k < numCols; k++) + a[i][k] = (double)((double)a[i][k]*scale); + } + } + anorm = max(anorm, (fabs((double)w[i]) + fabs(rv1[i]))); + } + + /* accumulate the right-hand transformation */ + for (i = numCols - 1; i >= 0; i--) + { + if (i < numCols - 1) + { + if (g) + { + for (j = l; j < numCols; j++) + v[j][i] = (double)(((double)a[i][j] / (double)a[i][l]) / g); + /* double division to avoid underflow */ + for (j = l; j < numCols; j++) + { + for (s = 0.0, k = l; k < numCols; k++) + s += ((double)a[i][k] * (double)v[k][j]); + for (k = l; k < numCols; k++) + v[k][j] += (double)(s * (double)v[k][i]); + } + } + for (j = l; j < numCols; j++) + v[i][j] = v[j][i] = 0.0; + } + v[i][i] = 1.0; + g = rv1[i]; + l = i; + } + + /* accumulate the left-hand transformation */ + for (i = numCols - 1; i >= 0; i--) + { + l = i + 1; + g = (double)w[i]; + if (i < numCols - 1) + for (j = l; j < numCols; j++) + a[i][j] = 0.0; + if (g) + { + g = 1.0 / g; + if (i != numCols - 1) + { + for (j = l; j < numCols; j++) + { + for (s = 0.0, k = l; k < numRows; k++) + s += ((double)a[k][i] * (double)a[k][j]); + f = (s / (double)a[i][i]) * g; + for (k = i; k < numRows; k++) + a[k][j] += (double)(f * (double)a[k][i]); + } + } + for (j = i; j < numRows; j++) + a[j][i] = (double)((double)a[j][i]*g); + } + else + { + for (j = i; j < numRows; j++) + a[j][i] = 0.0; + } + ++a[i][i]; + } + + /* diagonalize the bidiagonal form */ + for (k = numCols - 1; k >= 0; k--) + { /* loop over singular values */ + for (its = 0; its < 30; its++) + { /* loop over allowed iterations */ + flag = 1; + for (l = k; l >= 0; l--) + { /* test for splitting */ + nm = l - 1; + if (fabs(rv1[l]) + anorm == anorm) + { + flag = 0; + break; + } + if (fabs((double)w[nm]) + anorm == anorm) + break; + } + if (flag) + { + c = 0.0; + s = 1.0; + for (i = l; i <= k; i++) + { + f = s * rv1[i]; + if (fabs(f) + anorm != anorm) + { + g = (double)w[i]; + h = pythag(f, g); + w[i] = (double)h; + h = 1.0 / h; + c = g * h; + s = (- f * h); + for (j = 0; j < numRows; j++) + { + y = (double)a[j][nm]; + z = (double)a[j][i]; + a[j][nm] = (double)(y * c + z * s); + a[j][i] = (double)(z * c - y * s); + } + } + } + } + z = (double)w[k]; + if (l == k) + { /* convergence */ + if (z < 0.0) + { /* make singular value nonnegative */ + w[k] = (double)(-z); + for (j = 0; j < numCols; j++) + v[j][k] = (-v[j][k]); + } + break; + } + if (its >= 30) { + m->mothurOut("No convergence after 30,000! iterations \n"); m->control_pressed = true; + return(0); + } + + /* shift from bottom 2 x 2 minor */ + x = (double)w[l]; + nm = k - 1; + y = (double)w[nm]; + g = rv1[nm]; + h = rv1[k]; + f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y); + g = pythag(f, 1.0); + f = ((x - z) * (x + z) + h * ((y / (f + SIGN(g, f))) - h)) / x; + + /* next QR transformation */ + c = s = 1.0; + for (j = l; j <= nm; j++) + { + i = j + 1; + g = rv1[i]; + y = (double)w[i]; + h = s * g; + g = c * g; + z = pythag(f, h); + rv1[j] = z; + c = f / z; + s = h / z; + f = x * c + g * s; + g = g * c - x * s; + h = y * s; + y = y * c; + for (jj = 0; jj < numCols; jj++) + { + x = (double)v[jj][j]; + z = (double)v[jj][i]; + v[jj][j] = (float)(x * c + z * s); + v[jj][i] = (float)(z * c - x * s); + } + z = pythag(f, h); + w[j] = (float)z; + if (z) + { + z = 1.0 / z; + c = f * z; + s = h * z; + } + f = (c * g) + (s * y); + x = (c * y) - (s * g); + for (jj = 0; jj < numRows; jj++) + { + y = (double)a[jj][j]; + z = (double)a[jj][i]; + a[jj][j] = (double)(y * c + z * s); + a[jj][i] = (double)(z * c - y * s); + } + } + rv1[l] = 0.0; + rv1[k] = f; + w[k] = (double)x; + } + } + + return(0); + + } + catch(exception& e) { + m->errorOut(e, "LinearAlgebra", "svd"); + exit(1); + } +} + + diff --git a/linearalgebra.h b/linearalgebra.h index 3c86cb1..7453f4e 100644 --- a/linearalgebra.h +++ b/linearalgebra.h @@ -20,9 +20,12 @@ public: ~LinearAlgebra() {} vector > matrix_mult(vector >, vector >); + vector >transpose(vector >); void recenter(double, vector >, vector >&); - int tred2(vector >&, vector&, vector&); + //eigenvectors + int tred2(vector >&, vector&, vector&); int qtli(vector&, vector&, vector >&); + vector< vector > calculateEuclidianDistance(vector >&, int); //pass in axes and number of dimensions vector< vector > calculateEuclidianDistance(vector >&); //pass in axes vector > getObservedEuclideanDistance(vector >&); @@ -44,7 +47,9 @@ public: vector solveEquations(vector >, vector); vector > getInverse(vector >); double choose(double, double); - + double normalvariate(double mu, double sigma); + vector< vector > lda(vector< vector >& a, vector groups, vector< vector >& means, bool&); //Linear discriminant analysis - a is [features][valuesFromGroups] groups indicates which group each sampling comes from. For example if groups = early, late, mid, early, early. a[0][0] = value for feature0 from groupEarly. + int svd(vector< vector >& a, vector& w, vector< vector >& v); //Singular value decomposition private: MothurOut* m; @@ -72,8 +77,6 @@ private: void ludcmp(vector >&, vector&, float&); void lubksb(vector >&, vector&, vector&); - - }; #endif diff --git a/makebiomcommand.cpp b/makebiomcommand.cpp index a19e2e3..a101883 100644 --- a/makebiomcommand.cpp +++ b/makebiomcommand.cpp @@ -405,6 +405,7 @@ int MakeBiomCommand::getBiom(vector& lookup){ out << "{\n" + spaces + "\"id\":\"" + sharedfile + "-" + lookup[0]->getLabel() + "\",\n" + spaces + "\"format\": \"Biological Observation Matrix 0.9.1\",\n" + spaces + "\"format_url\": \"http://biom-format.org\",\n"; out << spaces + "\"type\": \"OTU table\",\n" + spaces + "\"generated_by\": \"" << mothurString << "\",\n" + spaces + "\"date\": \"" << dateString << "\",\n"; + int numBins = lookup[0]->getNumBins(); vector metadata = getMetaData(lookup); if (m->control_pressed) { out.close(); return 0; } @@ -420,11 +421,11 @@ int MakeBiomCommand::getBiom(vector& lookup){ out << spaces + "\"rows\":[\n"; string rowFront = spaces + spaces + "{\"id\":\""; string rowBack = "\", \"metadata\":"; - for (int i = 0; i < m->currentBinLabels.size()-1; i++) { + for (int i = 0; i < numBins-1; i++) { if (m->control_pressed) { out.close(); return 0; } out << rowFront << m->currentBinLabels[i] << rowBack << metadata[i] << "},\n"; } - out << rowFront << m->currentBinLabels[(m->currentBinLabels.size()-1)] << rowBack << metadata[(m->currentBinLabels.size()-1)] << "}\n" + spaces + "],\n"; + out << rowFront << m->currentBinLabels[(numBins-1)] << rowBack << metadata[(numBins-1)] << "}\n" + spaces + "],\n"; //get column info /*"columns": [ @@ -624,8 +625,8 @@ vector MakeBiomCommand::getMetaData(vector& lookup) //********************************************************************************************************************** int MakeBiomCommand::getSampleMetaData(vector& lookup){ try { - - if (metadatafile == "") { for (int i = 0; i < lookup.size(); i++) { sampleMetadata.push_back("null"); } } + sampleMetadata.clear(); + if (metadatafile == "") { for (int i = 0; i < lookup.size(); i++) { sampleMetadata.push_back("null"); } } else { ifstream in; m->openInputFile(metadatafile, in); diff --git a/makecontigscommand.cpp b/makecontigscommand.cpp index f888fbe..0bf56e7 100644 --- a/makecontigscommand.cpp +++ b/makecontigscommand.cpp @@ -19,6 +19,8 @@ vector MakeContigsCommand::setParameters(){ CommandParameter prqual("rqfile", "InputTypes", "", "", "none", "none", "qfileGroup","",false,false,true); parameters.push_back(prqual); CommandParameter pfile("file", "InputTypes", "", "", "FastaFastqFile", "FastaFastqFile", "none","fasta-qfile",false,false,true); parameters.push_back(pfile); CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none","group",false,false,true); parameters.push_back(poligos); + CommandParameter pfindex("findex", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(pfindex); + CommandParameter prindex("rindex", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(prindex); CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(ppdiffs); CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(pbdiffs); CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(ptdiffs); @@ -50,15 +52,17 @@ vector MakeContigsCommand::setParameters(){ string MakeContigsCommand::getHelpString(){ try { string helpString = ""; - helpString += "The make.contigs command reads a file, forward fastq file and a reverse fastq file or forward fasta and reverse fasta files and outputs new fasta. It will also provide new quality files if the fastq or file parameter is used.\n"; + helpString += "The make.contigs command reads a file, forward fastq file and a reverse fastq file or forward fasta and reverse fasta files and outputs new fasta. \n"; helpString += "If an oligos file is provided barcodes and primers will be trimmed, and a group file will be created.\n"; - helpString += "The make.contigs command parameters are file, ffastq, rfastq, ffasta, rfasta, fqfile, rqfile, oligos, format, tdiffs, bdiffs, pdiffs, align, match, mismatch, gapopen, gapextend, insert, deltaq, allfiles and processors.\n"; + helpString += "If a forward index or reverse index file is provided barcodes be trimmed, and a group file will be created. The oligos parameter is required if an index file is given.\n"; + helpString += "The make.contigs command parameters are file, ffastq, rfastq, ffasta, rfasta, fqfile, rqfile, oligos, findex, rindex, format, tdiffs, bdiffs, pdiffs, align, match, mismatch, gapopen, gapextend, insert, deltaq, allfiles and processors.\n"; helpString += "The ffastq and rfastq, file, or ffasta and rfasta parameters are required.\n"; - helpString += "The file parameter is 2 or 3 column file containing the forward fastq files in the first column and their matching reverse fastq files in the second column, or a groupName then forward fastq file and reverse fastq file. Mothur will process each pair and create a combined fasta and report file with all the sequences.\n"; + helpString += "The file parameter is 2, 3 or 4 column file containing the forward fastq files in the first column and their matching reverse fastq files in the second column, or a groupName then forward fastq file and reverse fastq file, or forward fastq file then reverse fastq then forward index and reverse index file. If you only have one index file add 'none' for the other one. Mothur will process each pair and create a combined fasta and report file with all the sequences.\n"; helpString += "The ffastq and rfastq parameters are used to provide a forward fastq and reverse fastq file to process. If you provide one, you must provide the other.\n"; helpString += "The ffasta and rfasta parameters are used to provide a forward fasta and reverse fasta file to process. If you provide one, you must provide the other.\n"; helpString += "The fqfile and rqfile parameters are used to provide a forward quality and reverse quality files to process with the ffasta and rfasta parameters. If you provide one, you must provide the other.\n"; helpString += "The format parameter is used to indicate whether your sequences are sanger, solexa, illumina1.8+ or illumina, default=illumina1.8+.\n"; + helpString += "The findex and rindex parameters are used to provide a forward index and reverse index files to process. \n"; helpString += "The align parameter allows you to specify the alignment method to use. Your options are: gotoh and needleman. The default is needleman.\n"; helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n"; helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n"; @@ -216,6 +220,22 @@ MakeContigsCommand::MakeContigsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["oligos"] = inputDir + it->second; } } + + it = parameters.find("findex"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["findex"] = inputDir + it->second; } + } + + it = parameters.find("rindex"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["rindex"] = inputDir + it->second; } + } } ffastqfile = validParameter.validFile(parameters, "ffastq", true); @@ -264,6 +284,29 @@ MakeContigsCommand::MakeContigsCommand(string option) { else if(oligosfile == "not open") { abort = true; } else { m->setOligosFile(oligosfile); } + findexfile = validParameter.validFile(parameters, "findex", true); + if (findexfile == "not found") { findexfile = ""; } + else if(findexfile == "not open") { abort = true; } + + rindexfile = validParameter.validFile(parameters, "rindex", true); + if (rindexfile == "not found") { rindexfile = ""; } + else if(rindexfile == "not open") { abort = true; } + + if ((rindexfile != "") || (findexfile != "")) { + if (oligosfile == ""){ + oligosfile = m->getOligosFile(); + if (oligosfile != "") { m->mothurOut("Using " + oligosfile + " as input file for the oligos parameter.\n"); } + else { + m->mothurOut("You need to provide an oligos file if you are going to use an index file.\n"); abort = true; + } + } + + //can only use an index file with the fastq parameters not fasta and qual + if ((ffastafile != "") || (rfastafile != "")) { + m->mothurOut("[ERROR]: You can only use an index file with the fastq parameters or the file option.\n"); abort = true; + } + } + //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; @@ -421,7 +464,6 @@ int MakeContigsCommand::execute(){ m->mothurOut("Making contigs...\n"); createProcesses(filesToProcess[l], outFastaFile, outScrapFastaFile, outMisMatchFile, fastaFileNames, l); - m->mothurOut("Here...\n"); //remove temp fasta and qual files for (int i = 0; i < processors; i++) { for(int j = 0; j < filesToProcess[l][i].size(); j++) { m->mothurRemove(filesToProcess[l][i][j]); } } @@ -572,7 +614,7 @@ vector< vector< vector > > MakeContigsCommand::preProcessData(unsigned l vector< vector< vector > > filesToProcess; if (ffastqfile != "") { //reading one file - vector< vector > files = readFastqFiles(numReads, ffastqfile, rfastqfile); + vector< vector > files = readFastqFiles(numReads, ffastqfile, rfastqfile, findexfile, rindexfile); //adjust for really large processors or really small files if (numReads == 0) { m->mothurOut("[ERROR]: no good reads.\n"); m->control_pressed = true; } if (numReads < processors) { @@ -593,12 +635,20 @@ vector< vector< vector > > MakeContigsCommand::preProcessData(unsigned l if (m->control_pressed) { for (int l = 0; l < filesToProcess.size(); l++) { for (int k = 0; k < filesToProcess[l].size(); k++) { for(int j = 0; j < filesToProcess[l][k].size(); j++) { m->mothurRemove(filesToProcess[l][k][j]); } filesToProcess[l][k].clear(); } return filesToProcess; } } unsigned long int thisFilesReads; - vector< vector > files = readFastqFiles(thisFilesReads, filePairsToProcess[i][0], filePairsToProcess[i][1]); + vector< vector > files = readFastqFiles(thisFilesReads, filePairsToProcess[i][0], filePairsToProcess[i][1], filePairsToProcess[i][2], filePairsToProcess[i][3]); //adjust for really large processors or really small files if (thisFilesReads < processors) { m->mothurOut("[ERROR]: " + filePairsToProcess[i][0] + " has less than " + toString(processors) + " good reads, skipping\n"); for (int k = 0; k < files.size(); k++) { for(int j = 0; j < files[k].size(); j++) { m->mothurRemove(files[k][j]); } files[k].clear(); } + //remove from file2Group if necassary + map cFile2Group; + for (map::iterator it = file2Group.begin(); it != file2Group.end(); it++) { + if ((it->first) < i) { cFile2Group[it->first] = it->second; } + else if ((it->first) == i) { } //do nothing, we removed files for i + else { cFile2Group[(it->first-1)] = it->second; } //adjust files because i was removed + } + file2Group = cFile2Group; }else { filesToProcess.push_back(files); numReads += thisFilesReads; @@ -870,10 +920,12 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string string thisfqualfile = files[1]; string thisrfastafile = files[2]; string thisrqualfile = files[3]; + string thisfindexfile = files[4]; + string thisrindexfile = files[5]; - if (m->debug) { m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n"); } + if (m->debug) { m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n[DEBUG]: findex = " + thisfindexfile + ".\n[DEBUG]: rindex = " + thisrindexfile + ".\n"); } - ifstream inFFasta, inRFasta, inFQual, inRQual; + ifstream inFFasta, inRFasta, inFQual, inRQual, inFIndex, inRIndex; ofstream outFasta, outMisMatch, outScrapFasta; m->openInputFile(thisffastafile, inFFasta); m->openInputFile(thisrfastafile, inRFasta); @@ -881,6 +933,10 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string m->openInputFile(thisfqualfile, inFQual); m->openInputFile(thisrqualfile, inRQual); } + + if (thisfindexfile != "") { m->openInputFile(thisfindexfile, inFIndex); } + if (thisrindexfile != "") { m->openInputFile(thisrindexfile, inRIndex); } + m->openOutputFile(outputFasta, outFasta); m->openOutputFile(outputScrapFasta, outScrapFasta); m->openOutputFile(outputMisMatches, outMisMatch); @@ -904,13 +960,27 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string fQual = new QualityScores(inFQual); m->gobble(inFQual); rQual = new QualityScores(inRQual); m->gobble(inRQual); } + Sequence findexBarcode("findex", "NONE"); Sequence rindexBarcode("rindex", "NONE"); + if (thisfindexfile != "") { + Sequence temp(inFIndex); m->gobble(inFIndex); + findexBarcode.setAligned(temp.getAligned()); + } + + if (thisrindexfile != "") { + Sequence temp(inRIndex); m->gobble(inRIndex); + rindexBarcode.setAligned(temp.getAligned()); + } int barcodeIndex = 0; int primerIndex = 0; - + if(barcodes.size() != 0){ if (thisfqualfile != "") { - success = trimOligos.stripBarcode(fSeq, rSeq, *fQual, *rQual, barcodeIndex); + if ((thisfindexfile != "") || (thisrindexfile != "")) { + success = trimOligos.stripBarcode(findexBarcode, rindexBarcode, *fQual, *rQual, barcodeIndex); + }else { + success = trimOligos.stripBarcode(fSeq, rSeq, *fQual, *rQual, barcodeIndex); + } }else { success = trimOligos.stripBarcode(fSeq, rSeq, barcodeIndex); } @@ -1097,11 +1167,11 @@ int MakeContigsCommand::driver(vector files, string outputFasta, string } } //********************************************************************************************************************** -vector< vector > MakeContigsCommand::readFastqFiles(unsigned long int& count, string ffastq, string rfastq){ +vector< vector > MakeContigsCommand::readFastqFiles(unsigned long int& count, string ffastq, string rfastq, string findex, string rindex){ try { vector< vector > files; //maps processors number to file pointer - map > tempfiles; //tempfiles[0] = forwardFasta, [1] = forwardQual, [2] = reverseFasta, [3] = reverseQual + map > tempfiles; //tempfiles[0] = forwardFasta, [1] = forwardQual, [2] = reverseFasta, [3] = reverseQual, tempfiles[4] = forwardIndex, [4] = forwardReverse map >::iterator it; //create files to write to @@ -1111,6 +1181,8 @@ vector< vector > MakeContigsCommand::readFastqFiles(unsigned long int& c ofstream* outFQ = new ofstream; temp.push_back(outFQ); ofstream* outRF = new ofstream; temp.push_back(outRF); ofstream* outRQ = new ofstream; temp.push_back(outRQ); + ofstream* outFI = new ofstream; temp.push_back(outFI); + ofstream* outRI = new ofstream; temp.push_back(outRI); tempfiles[i] = temp; vector names; @@ -1120,8 +1192,13 @@ vector< vector > MakeContigsCommand::readFastqFiles(unsigned long int& c string rfastafilename = thisOutputDir + m->getRootName(m->getSimpleName(rfastq)) + toString(i) + "rfastatemp"; string fqualfilename = thisOutputDir + m->getRootName(m->getSimpleName(ffastq)) + toString(i) + "fqualtemp"; string rqualfilename = thisOutputDir + m->getRootName(m->getSimpleName(rfastq)) + toString(i) + "rqualtemp"; + string findexfilename = ""; string rindexfilename = ""; + noneOk = false; //flag to oligos file read that its okay to allow for non paired barcodes + if (findex != "") { findexfilename = thisOutputDir + m->getRootName(m->getSimpleName(findex)) + toString(i) + "findextemp"; m->openOutputFile(findexfilename, *outFI); noneOk = true; } + if (rindex != "") { rindexfilename = thisOutputDir + m->getRootName(m->getSimpleName(rindex)) + toString(i) + "rindextemp"; m->openOutputFile(rindexfilename, *outRI); noneOk = true; } names.push_back(ffastafilename); names.push_back(fqualfilename); names.push_back(rfastafilename); names.push_back(rqualfilename); + names.push_back(findexfilename); names.push_back(rindexfilename); files.push_back(names); m->openOutputFile(ffastafilename, *outFF); @@ -1135,7 +1212,7 @@ vector< vector > MakeContigsCommand::readFastqFiles(unsigned long int& c for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } //remove files for (int i = 0; i < files.size(); i++) { - for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } + for(int j = 0; j < files[i].size(); j++) { if (files[i][j] != "") { m->mothurRemove(files[i][j]); } } } } @@ -1145,31 +1222,52 @@ vector< vector > MakeContigsCommand::readFastqFiles(unsigned long int& c ifstream inReverse; m->openInputFile(rfastq, inReverse); + ifstream infIndex, inrIndex; + bool findexIsGood = false; + bool rindexIsGood = false; + if (findex != "") { m->openInputFile(findex, infIndex); findexIsGood = true; } + if (rindex != "") { m->openInputFile(rindex, inrIndex); rindexIsGood = true; } + count = 0; map uniques; + map iUniques; + map pairUniques; map::iterator itUniques; - while ((!inForward.eof()) || (!inReverse.eof())) { + while ((!inForward.eof()) || (!inReverse.eof()) || (findexIsGood) || (rindexIsGood)) { - if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } } inForward.close(); inReverse.close(); return files; } + if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { if (files[i][j] != "") { m->mothurRemove(files[i][j]); } } } inForward.close(); inReverse.close(); if (findex != "") { infIndex.close(); } if (findex != "") { inrIndex.close(); } return files; } //get a read from forward and reverse fastq files - bool ignoref, ignorer; - fastqRead thisFread, thisRread; + bool ignoref, ignorer, ignorefi, ignoreri; + fastqRead thisFread, thisRread, thisFIread, thisRIread; if (!inForward.eof()) { thisFread = readFastq(inForward, ignoref); } else { ignoref = true; } if (!inReverse.eof()) { thisRread = readFastq(inReverse, ignorer); } else { ignorer = true; } + if (findexIsGood) { thisFIread = readFastq(infIndex, ignorefi); if (infIndex.eof()) { findexIsGood = false; } } + else { ignorefi = true; } + if (rindexIsGood) { thisRIread = readFastq(inrIndex, ignoreri); if (inrIndex.eof()) { rindexIsGood = false; } } + else { ignoreri = true; } + + bool allowOne = false; + if ((findex == "") || (rindex == "")) { allowOne = true; } + vector frReads = getReads(ignoref, ignorer, thisFread, thisRread, uniques, false); + vector friReads = getReads(ignorefi, ignoreri, thisFIread, thisRIread, iUniques, allowOne); + + //add in index info if provided + vector reads = frReads; + if ((findex != "") || (rindex != "")) { reads = mergeReads(frReads, friReads, pairUniques); } - vector reads = getReads(ignoref, ignorer, thisFread, thisRread, uniques); - for (int i = 0; i < reads.size(); i++) { fastqRead fread = reads[i].forward; fastqRead rread = reads[i].reverse; + fastqRead firead = reads[i].findex; + fastqRead riread = reads[i].rindex; - if (m->debug) { m->mothurOut(toString(count) + '\t' + fread.name + '\t' + rread.name + '\n'); } + if (m->debug) { m->mothurOut(toString(count) + '\t' + fread.name + '\t' + rread.name + '\n'); if (findex != "") { m->mothurOut(toString(count) + '\t' + firead.name + '\n'); } if (rindex != "") { m->mothurOut(toString(count) + '\t' + riread.name + '\n'); } } //if (checkReads(fread, rread, ffastq, rfastq)) { - if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } } inForward.close(); inReverse.close(); return files; } + if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { if (files[i][j] != "") { m->mothurRemove(files[i][j]); } } } inForward.close(); inReverse.close(); if (findex != "") { infIndex.close(); } if (findex != "") { inrIndex.close(); } return files; } //if the reads are okay write to output files int process = count % processors; @@ -1182,7 +1280,9 @@ vector< vector > MakeContigsCommand::readFastqFiles(unsigned long int& c *(tempfiles[process][3]) << ">" << rread.name << endl; for (int i = 0; i < rread.scores.size(); i++) { *(tempfiles[process][3]) << rread.scores[i] << " "; } *(tempfiles[process][3]) << endl; - + if (findex != "") { *(tempfiles[process][4]) << ">" << firead.name << endl << firead.sequence << endl; } + if (rindex != "") { *(tempfiles[process][5]) << ">" << riread.name << endl << riread.sequence << endl; } + count++; //report progress @@ -1197,6 +1297,9 @@ vector< vector > MakeContigsCommand::readFastqFiles(unsigned long int& c for (itUniques = uniques.begin(); itUniques != uniques.end(); itUniques++) { m->mothurOut("[WARNING]: did not find paired read for " + itUniques->first + ", ignoring.\n"); } + for (map:: iterator it = pairUniques.begin(); it != pairUniques.end(); it++) { + m->mothurOut("[WARNING]: did not find paired read for " + (it->first).substr(1) + ", ignoring.\n"); + } m->mothurOutEndLine(); } @@ -1204,7 +1307,9 @@ vector< vector > MakeContigsCommand::readFastqFiles(unsigned long int& c for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } inForward.close(); inReverse.close(); - + if (findex != "") { infIndex.close(); } + if (rindex != "") { inrIndex.close(); } + return files; } catch(exception& e) { @@ -1238,8 +1343,10 @@ vector< vector > MakeContigsCommand::readFastaFiles(unsigned long int& c if (fqualfile != "") { fqualfilename = thisOutputDir + m->getRootName(m->getSimpleName(fqualfile)) + toString(i) + "fqual.temp"; m->openOutputFile(fqualfilename, *outFQ); } string rqualfilename = ""; if (rqualfile != "") { rqualfilename = thisOutputDir + m->getRootName(m->getSimpleName(rqualfile)) + toString(i) + "rqual.temp"; m->openOutputFile(rqualfilename, *outRQ); } + string findexfilename = ""; string rindexfilename = ""; names.push_back(ffastafilename); names.push_back(fqualfilename); names.push_back(rfastafilename); names.push_back(rqualfilename); + names.push_back(findexfilename); names.push_back(rindexfilename); files.push_back(names); m->openOutputFile(ffastafilename, *outFF); @@ -1303,7 +1410,7 @@ vector< vector > MakeContigsCommand::readFastaFiles(unsigned long int& c }else { ignorer = true; } } - vector reads = getReads(ignoref, ignorer, thisFread, thisRread, uniques); + vector reads = getReads(ignoref, ignorer, thisFread, thisRread, uniques, false); for (int i = 0; i < reads.size(); i++) { fastqRead fread = reads[i].forward; @@ -1358,7 +1465,7 @@ vector< vector > MakeContigsCommand::readFastaFiles(unsigned long int& c } } //********************************************************************************************************************** -vector MakeContigsCommand::getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map& uniques){ +vector MakeContigsCommand::getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map& uniques, bool allowOne){ try { vector reads; map::iterator itUniques; @@ -1406,25 +1513,36 @@ vector MakeContigsCommand::getReads(bool ignoref, bool ignorer, f } }else if (!ignoref && ignorer) { //ignore reverse keep forward - //look for forward pair - itUniques = uniques.find(forward.name); - if (itUniques != uniques.end()) { //we have the pair for this read - pairFastqRead temp(forward, itUniques->second); + if (allowOne) { + fastqRead dummy; + pairFastqRead temp(forward, dummy); reads.push_back(temp); - uniques.erase(itUniques); - }else { //save this read for later - uniques[forward.name] = forward; + }else { + //look for forward pair + itUniques = uniques.find(forward.name); + if (itUniques != uniques.end()) { //we have the pair for this read + pairFastqRead temp(forward, itUniques->second); + reads.push_back(temp); + uniques.erase(itUniques); + }else { //save this read for later + uniques[forward.name] = forward; + } } - }else if (ignoref && !ignorer) { //ignore forward keep reverse - //look for reverse pair - itUniques = uniques.find(reverse.name); - if (itUniques != uniques.end()) { //we have the pair for this read - pairFastqRead temp(itUniques->second, reverse); + if (allowOne) { + fastqRead dummy; + pairFastqRead temp(dummy, reverse); reads.push_back(temp); - uniques.erase(itUniques); - }else { //save this read for later - uniques[reverse.name] = reverse; + }else { + //look for reverse pair + itUniques = uniques.find(reverse.name); + if (itUniques != uniques.end()) { //we have the pair for this read + pairFastqRead temp(itUniques->second, reverse); + reads.push_back(temp); + uniques.erase(itUniques); + }else { //save this read for later + uniques[reverse.name] = reverse; + } } }//else ignore both and do nothing @@ -1436,6 +1554,73 @@ vector MakeContigsCommand::getReads(bool ignoref, bool ignorer, f } } //********************************************************************************************************************** +//look through the reads from the forward and reverse files and try to find matching reads from index files. +vector MakeContigsCommand::mergeReads(vector thisReads, vector indexes, map& uniques){ + try { + vector reads; + map::iterator itUniques; + + set foundIndexes; + for (int i = 0; i < thisReads.size(); i++) { + bool found = false; + for (int j = 0; j < indexes.size(); j++) { + + //incase only one index + string indexName = indexes[j].forward.name; + if (indexName == "") { indexName = indexes[j].reverse.name; } + + if (thisReads[i].forward.name == indexName){ + thisReads[i].findex = indexes[j].forward; + thisReads[i].rindex = indexes[j].reverse; + reads.push_back(thisReads[i]); + found = true; + foundIndexes.insert(j); + } + } + + if (!found) { + //look for forward pair + itUniques = uniques.find('i'+thisReads[i].forward.name); + if (itUniques != uniques.end()) { //we have the pair for this read + thisReads[i].findex = itUniques->second.forward; + thisReads[i].rindex = itUniques->second.reverse; + reads.push_back(thisReads[i]); + uniques.erase(itUniques); + }else { //save this read for later + uniques['r'+thisReads[i].forward.name] = thisReads[i]; + } + } + } + + if (foundIndexes.size() != indexes.size()) { //if we didnt match all the indexes look for them in uniques + for (int j = 0; j < indexes.size(); j++) { + if (foundIndexes.count(j) == 0) { //we didnt find this one + //incase only one index + string indexName = indexes[j].forward.name; + if (indexName == "") { indexName = indexes[j].reverse.name; } + + //look for forward pair + itUniques = uniques.find('r'+indexName); + if (itUniques != uniques.end()) { //we have the pair for this read + pairFastqRead temp(itUniques->second.forward, itUniques->second.reverse, indexes[j].forward, indexes[j].reverse); + reads.push_back(temp); + uniques.erase(itUniques); + }else { //save this read for later + uniques['i'+indexName] = indexes[j]; + } + } + } + } + + + return reads; + } + catch(exception& e) { + m->errorOut(e, "MakeContigsCommand", "mergeReads"); + exit(1); + } +} +//********************************************************************************************************************** fastqRead MakeContigsCommand::readFastq(ifstream& in, bool& ignore){ try { fastqRead read; @@ -1475,6 +1660,8 @@ fastqRead MakeContigsCommand::readFastq(ifstream& in, bool& ignore){ read.name = name; read.sequence = sequence; read.scores = qualScores; + + if (m->debug) { m->mothurOut("[DEBUG]: " + read.name + " " + read.sequence + " " + quality + "\n"); } return read; } @@ -1508,10 +1695,16 @@ bool MakeContigsCommand::checkReads(fastqRead& forward, fastqRead& reverse, stri } }*/ //*************************************************************************************************************** +//lines can be 2, 3, or 4 columns +// forward.fastq reverse.fastq -> 2 column +// groupName forward.fastq reverse.fastq -> 3 column +// forward.fastq reverse.fastq forward.index.fastq reverse.index.fastq -> 4 column +// forward.fastq reverse.fastq none reverse.index.fastq -> 4 column +// forward.fastq reverse.fastq forward.index.fastq none -> 4 column vector< vector > MakeContigsCommand::readFileNames(string filename){ try { vector< vector > files; - string forward, reverse; + string forward, reverse, findex, rindex; ifstream in; m->openInputFile(filename, in); @@ -1520,28 +1713,35 @@ vector< vector > MakeContigsCommand::readFileNames(string filename){ if (m->control_pressed) { return files; } - in >> forward; m->gobble(in); - in >> reverse; + string line = m->getline(in); m->gobble(in); + vector pieces = m->splitWhiteSpace(line); string group = ""; - while (!in.eof()) { //do we have a group assigned to this pair - char c = in.get(); - if (c == 10 || c == 13 || c == -1){ break; } - else if (c == 32 || c == 9){;} //space or tab - else { group += c; } - } - - if (group != "") { - //line in file look like: group forward reverse - string temp = forward; - forward = reverse; - reverse = group; - group = temp; + if (pieces.size() == 2) { + forward = pieces[0]; + reverse = pieces[1]; + group = ""; + findex = ""; + rindex = ""; + }else if (pieces.size() == 3) { + group = pieces[0]; + forward = pieces[1]; + reverse = pieces[2]; + findex = ""; + rindex = ""; createFileGroup = true; + }else if (pieces.size() == 4) { + forward = pieces[0]; + reverse = pieces[1]; + findex = pieces[2]; + rindex = pieces[3]; + if ((findex == "none") || (findex == "NONE")){ findex = ""; } + if ((rindex == "none") || (rindex == "NONE")){ rindex = ""; } + }else { + m->mothurOut("[ERROR]: file lines can be 2, 3, or 4 columns. The forward fastq files in the first column and their matching reverse fastq files in the second column, or a groupName then forward fastq file and reverse fastq file, or forward fastq file then reverse fastq then forward index and reverse index file. If you only have one index file add 'none' for the other one. \n"); m->control_pressed = true; } - m->gobble(in); - if (m->debug) { m->mothurOut("[DEBUG]: group = " + group + ", forward = " + forward + ", reverse = " + reverse + ".\n"); } + if (m->debug) { m->mothurOut("[DEBUG]: group = " + group + ", forward = " + forward + ", reverse = " + reverse + ", forwardIndex = " + findex + ", reverseIndex = " + rindex + ".\n"); } //check to make sure both are able to be opened ifstream in2; @@ -1606,11 +1806,83 @@ vector< vector > MakeContigsCommand::readFileNames(string filename){ m->mothurOut("[WARNING]: can't find " + reverse + ", ignoring pair.\n"); }else{ in3.close(); } - if ((openForward != 1) && (openReverse != 1)) { //good pair + int openFindex = 0; + if (findex != "") { + ifstream in4; + openFindex = m->openInputFile(findex, in4, "noerror"); in4.close(); + + //if you can't open it, try default location + if (openFindex == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(findex); + m->mothurOut("Unable to open " + findex + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in5; + openFindex = m->openInputFile(tryPath, in5, "noerror"); + in5.close(); + findex = tryPath; + } + } + + //if you can't open it, try output location + if (openFindex == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(findex); + m->mothurOut("Unable to open " + findex + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in6; + openFindex = m->openInputFile(tryPath, in6, "noerror"); + findex = tryPath; + in6.close(); + } + } + + if (openFindex == 1) { //can't find it + m->mothurOut("[WARNING]: can't find " + findex + ", ignoring pair.\n"); + } + } + + int openRindex = 0; + if (rindex != "") { + ifstream in7; + openRindex = m->openInputFile(rindex, in7, "noerror"); in7.close(); + + //if you can't open it, try default location + if (openRindex == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(rindex); + m->mothurOut("Unable to open " + rindex + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in8; + openRindex = m->openInputFile(tryPath, in8, "noerror"); + in8.close(); + rindex = tryPath; + } + } + + //if you can't open it, try output location + if (openRindex == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(rindex); + m->mothurOut("Unable to open " + rindex + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in9; + openRindex = m->openInputFile(tryPath, in9, "noerror"); + rindex = tryPath; + in9.close(); + } + } + + if (openRindex == 1) { //can't find it + m->mothurOut("[WARNING]: can't find " + rindex + ", ignoring pair.\n"); + } + } + + + if ((openForward != 1) && (openReverse != 1) && (openFindex != 1) && (openRindex != 1)) { //good pair file2Group[files.size()] = group; vector pair; pair.push_back(forward); pair.push_back(reverse); + pair.push_back(findex); + pair.push_back(rindex); + if (((findex != "") || (rindex != "")) && (oligosfile == "")) { m->mothurOut("[ERROR]: You need to provide an oligos file if you are going to use an index file.\n"); m->control_pressed = true; } files.push_back(pair); } } @@ -1619,7 +1891,7 @@ vector< vector > MakeContigsCommand::readFileNames(string filename){ return files; } catch(exception& e) { - m->errorOut(e, "MakeContigsCommand", "checkReads"); + m->errorOut(e, "MakeContigsCommand", "readFileNames"); exit(1); } } @@ -1692,7 +1964,7 @@ bool MakeContigsCommand::getOligos(vector >& fastaFileNames, stri oligosPair newPrimer(foligo, roligo); if (m->debug) { m->mothurOut("[DEBUG]: primer pair " + newPrimer.forward + " " + newPrimer.reverse + ", and group = " + group + ".\n"); } - + //check for repeat barcodes string tempPair = foligo+roligo; if (uniquePrimers.count(tempPair) != 0) { m->mothurOut("primer pair " + newPrimer.forward + " " + newPrimer.reverse + " is in your oligos file already."); m->mothurOutEndLine(); } @@ -1715,6 +1987,8 @@ bool MakeContigsCommand::getOligos(vector >& fastaFileNames, stri oligosPair newPair(foligo, roligo); + if ((foligo == "NONE") || (roligo == "NONE")) { if (!noneOk) { m->mothurOut("[ERROR]: barcodes must be paired unless you are using an index file.\n"); m->control_pressed = true; } } + group = ""; while (!in.eof()) { char c = in.get(); diff --git a/makecontigscommand.h b/makecontigscommand.h index b45501e..1ea3835 100644 --- a/makecontigscommand.h +++ b/makecontigscommand.h @@ -32,9 +32,12 @@ struct fastqRead { struct pairFastqRead { fastqRead forward; fastqRead reverse; + fastqRead findex; + fastqRead rindex; pairFastqRead() {}; pairFastqRead(fastqRead f, fastqRead r) : forward(f), reverse(r){}; + pairFastqRead(fastqRead f, fastqRead r, fastqRead fi, fastqRead ri) : forward(f), reverse(r), findex(fi), rindex(ri) {}; ~pairFastqRead() {}; }; /**************************************************************************************************/ @@ -59,8 +62,8 @@ public: void help() { m->mothurOut(getHelpString()); } private: - bool abort, allFiles, trimOverlap, createFileGroup, createOligosGroup, makeCount; - string outputDir, ffastqfile, rfastqfile, align, oligosfile, rfastafile, ffastafile, rqualfile, fqualfile, file, format; + bool abort, allFiles, trimOverlap, createFileGroup, createOligosGroup, makeCount, noneOk; + string outputDir, ffastqfile, rfastqfile, align, oligosfile, rfastafile, ffastafile, rqualfile, fqualfile, findexfile, rindexfile, file, format; float match, misMatch, gapOpen, gapExtend; int processors, longestBase, insert, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, deltaq; vector outputNames; @@ -81,14 +84,15 @@ private: fastqRead readFastq(ifstream&, bool&); vector< vector< vector > > preProcessData(unsigned long int&); vector< vector > readFileNames(string); - vector< vector > readFastqFiles(unsigned long int&, string, string); + vector< vector > readFastqFiles(unsigned long int&, string, string, string, string); vector< vector > readFastaFiles(unsigned long int&, string, string); //bool checkReads(fastqRead&, fastqRead&, string, string); int createProcesses(vector< vector >, string, string, string, vector >, int); int driver(vector, string, string, string, vector >, int, string); bool getOligos(vector >&, string); string reverseOligo(string); - vector getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map& uniques); + vector getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map& uniques, bool); + vector mergeReads(vector frReads, vector friReads, map& pairUniques); }; /**************************************************************************************************/ @@ -166,8 +170,10 @@ static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){ string thisfqualfile = pDataArray->files[1]; string thisrfastafile = pDataArray->files[2]; string thisrqualfile = pDataArray->files[3]; + string thisfindexfile = pDataArray->files[4]; + string thisrindexfile = pDataArray->files[5]; - if (pDataArray->m->debug) { pDataArray->m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n"); } + if (pDataArray->m->debug) { pDataArray->m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n[DEBUG]: findex = " + thisfindexfile + ".\n[DEBUG]: rindex = " + thisrindexfile + ".\n"); } if(pDataArray->allFiles){ for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file @@ -180,7 +186,7 @@ static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){ } } - ifstream inFFasta, inRFasta, inFQual, inRQual; + ifstream inFFasta, inRFasta, inFQual, inRQual, inFIndex, inRIndex; ofstream outFasta, outMisMatch, outScrapFasta; pDataArray->m->openInputFile(thisffastafile, inFFasta); pDataArray->m->openInputFile(thisrfastafile, inRFasta); @@ -188,6 +194,10 @@ static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){ pDataArray->m->openInputFile(thisfqualfile, inFQual); pDataArray->m->openInputFile(thisrqualfile, inRQual); } + + if (thisfindexfile != "") { pDataArray->m->openInputFile(thisfindexfile, inFIndex); } + if (thisrindexfile != "") { pDataArray->m->openInputFile(thisrindexfile, inRIndex); } + pDataArray->m->openOutputFile(pDataArray->outputFasta, outFasta); pDataArray->m->openOutputFile(pDataArray->outputMisMatches, outMisMatch); pDataArray->m->openOutputFile(pDataArray->outputScrapFasta, outScrapFasta); @@ -213,12 +223,27 @@ static DWORD WINAPI MyContigsThreadFunction(LPVOID lpParam){ rQual = new QualityScores(inRQual); pDataArray->m->gobble(inRQual); } + Sequence findexBarcode("findex", "NONE"); Sequence rindexBarcode("rindex", "NONE"); + if (thisfindexfile != "") { + Sequence temp(inFIndex); pDataArray->m->gobble(inFIndex); + findexBarcode.setAligned(temp.getAligned()); + } + + if (thisrindexfile != "") { + Sequence temp(inRIndex); pDataArray->m->gobble(inRIndex); + rindexBarcode.setAligned(temp.getAligned()); + } + int barcodeIndex = 0; int primerIndex = 0; if(pDataArray->barcodes.size() != 0){ if (thisfqualfile != "") { - success = trimOligos.stripBarcode(fSeq, rSeq, *fQual, *rQual, barcodeIndex); + if ((thisfindexfile != "") || (thisrindexfile != "")) { + success = trimOligos.stripBarcode(findexBarcode, rindexBarcode, *fQual, *rQual, barcodeIndex); + }else { + success = trimOligos.stripBarcode(fSeq, rSeq, *fQual, *rQual, barcodeIndex); + } }else { success = trimOligos.stripBarcode(fSeq, rSeq, barcodeIndex); } diff --git a/makefile b/makefile index 163db6b..04de533 100644 --- a/makefile +++ b/makefile @@ -15,8 +15,8 @@ USEREADLINE ?= yes CYGWIN_BUILD ?= no USECOMPRESSION ?= no MOTHUR_FILES="\"Enter_your_default_path_here\"" -RELEASE_DATE = "\"5/29/2013\"" -VERSION = "\"1.31.1\"" +RELEASE_DATE = "\"8/19/2013\"" +VERSION = "\"1.31.2\"" FORTAN_COMPILER = gfortran FORTRAN_FLAGS = @@ -37,10 +37,9 @@ ifeq ($(strip $(64BIT_VERSION)),yes) #TARGET_ARCH += -m64 -static #if you are a linux user use the following line - #CXXFLAGS += -mtune=native -march=native -m64 + #CXXFLAGS += -mtune=native -march=native CXXFLAGS += -DBIT_VERSION - FORTRAN_FLAGS = -m64 endif diff --git a/makelefsecommand.cpp b/makelefsecommand.cpp index dd44249..9f895f6 100644 --- a/makelefsecommand.cpp +++ b/makelefsecommand.cpp @@ -264,22 +264,28 @@ int MakeLefseCommand::runRelabund(map& consTax, vector categories = designMap->getNamesOfCategories(); + if (categories.size() > 3) { m->mothurOut("\n[NOTE]: LEfSe input files allow for a class, subclass and subject. More than 3 categories can cause formatting errors.\n\n"); } + for (int j = 0; j < categories.size(); j++) { out << categories[j] << "\t"; - for (int i = 0; i < lookup.size(); i++) { + for (int i = 0; i < lookup.size()-1; i++) { if (m->control_pressed) { out.close(); delete designMap; return 0; } string value = designMap->get(lookup[i]->getGroup(), categories[j]); if (value == "not found") { m->mothurOut("[ERROR]: " + lookup[i]->getGroup() + " is not in your design file, please correct.\n"); m->control_pressed = true; }else { out << value << '\t'; } } + string value = designMap->get(lookup[lookup.size()-1]->getGroup(), categories[j]); + if (value == "not found") { + m->mothurOut("[ERROR]: " + lookup[lookup.size()-1]->getGroup() + " is not in your design file, please correct.\n"); m->control_pressed = true; + }else { out << value; } out << endl; } } out << "group\t"; - for (int i = 0; i < lookup.size(); i++) { out << lookup[i]->getGroup() << '\t'; } - out << endl; + for (int i = 0; i < lookup.size()-1; i++) { out << lookup[i]->getGroup() << '\t'; } + out << lookup[lookup.size()-1]->getGroup() << endl; for (int i = 0; i < lookup[0]->getNumBins(); i++) { //process each otu if (m->control_pressed) { break; } @@ -306,8 +312,9 @@ int MakeLefseCommand::runRelabund(map& consTax, vectorgetAbundance(i) << '\t'; } - out << endl; + for (int j = 0; j < lookup.size()-1; j++) { out << lookup[j]->getAbundance(i) << '\t'; } + + out << lookup[lookup.size()-1]->getAbundance(i)<< endl; } out.close(); diff --git a/mothurout.cpp b/mothurout.cpp index bfd1cc1..713bafd 100644 --- a/mothurout.cpp +++ b/mothurout.cpp @@ -2864,7 +2864,50 @@ unsigned int MothurOut::fromBase36(string base36){ } } /***********************************************************************/ - +string MothurOut::findEdianness() { + try { + // find real endian type + unsigned char EndianTest[2] = {1,0}; + short x = *(short *)EndianTest; + + string endianType = "unknown"; + if(x == 1) { endianType = "BIG_ENDIAN"; } + else { endianType = "LITTLE_ENDIAN"; } + + return endianType; + } + catch(exception& e) { + errorOut(e, "MothurOut", "findEdianness"); + exit(1); + } +} +/***********************************************************************/ +double MothurOut::median(vector x) { + try { + double value = 0.0; + + if (x.size() == 0) { } //error + else { + //For example, if a < b < c, then the median of the list {a, b, c} is b, and, if a < b < c < d, then the median of the list {a, b, c, d} is the mean of b and c; i.e., it is (b + c)/2. + sort(x.begin(), x.end()); + //is x.size even? + if ((x.size()%2) == 0) { //size() is even. median = average of 2 midpoints + int midIndex1 = (x.size()/2)-1; + int midIndex2 = (x.size()/2); + value = (x[midIndex1]+ x[midIndex2]) / 2.0; + }else { + int midIndex = (x.size()/2); + value = x[midIndex]; + } + } + return value; + } + catch(exception& e) { + errorOut(e, "MothurOut", "median"); + exit(1); + } +} +/***********************************************************************/ int MothurOut::factorial(int num){ try { int total = 1; @@ -3499,7 +3542,7 @@ vector MothurOut::getStandardDeviation(vector< vector >& dists, return stdDev; } catch(exception& e) { - errorOut(e, "MothurOut", "getAverages"); + errorOut(e, "MothurOut", "getStandardDeviation"); exit(1); } } @@ -3782,6 +3825,44 @@ double MothurOut::getStandardDeviation(vector& featureVector){ } } /**************************************************************************************************/ +// returns largest value in vector +double MothurOut::max(vector& featureVector){ + try { + if (featureVector.size() == 0) { mothurOut("[ERROR]: vector size = 0!\n"); control_pressed=true; return 0.0; } + + //finds largest + double largest = featureVector[0]; + for (int i = 1; i < featureVector.size(); i++) { + if (featureVector[i] > largest) { largest = featureVector[i]; } + } + + return largest; + } + catch(exception& e) { + errorOut(e, "MothurOut", "max"); + exit(1); + } +} +/**************************************************************************************************/ +// returns smallest value in vector +double MothurOut::min(vector& featureVector){ + try { + if (featureVector.size() == 0) { mothurOut("[ERROR]: vector size = 0!\n"); control_pressed=true; return 0.0; } + + //finds smallest + double smallest = featureVector[0]; + for (int i = 1; i < featureVector.size(); i++) { + if (featureVector[i] < smallest) { smallest = featureVector[i]; } + } + + return smallest; + } + catch(exception& e) { + errorOut(e, "MothurOut", "min"); + exit(1); + } +} +/**************************************************************************************************/ diff --git a/mothurout.h b/mothurout.h index 3cf6fa4..9987374 100644 --- a/mothurout.h +++ b/mothurout.h @@ -138,6 +138,7 @@ class MothurOut { bool isTrue(string); bool isContainingOnlyDigits(string); bool isNumeric1(string); + string findEdianness(); //string manipulation @@ -158,11 +159,14 @@ class MothurOut { map > parseClasses(string); //math operation + double max(vector&); //returns largest value in vector + double min(vector&); //returns smallest value in vector int factorial(int num); vector > binomial(int); float ceilDist(float, int); float roundDist(float, int); unsigned int fromBase36(string); + double median(vector); int getRandomIndex(int); //highest double getStandardDeviation(vector&); vector getStandardDeviation(vector< vector >&); diff --git a/pairwiseseqscommand.cpp b/pairwiseseqscommand.cpp index 7d96e31..f874e9a 100644 --- a/pairwiseseqscommand.cpp +++ b/pairwiseseqscommand.cpp @@ -671,14 +671,11 @@ int PairwiseSeqsCommand::driver(int startLine, int endLine, string dFileName, fl alignment->align(seqI.getUnaligned(), seqJ.getUnaligned()); seqI.setAligned(alignment->getSeqAAln()); seqJ.setAligned(alignment->getSeqBAln()); - - //cout << seqI.getName() << '\t' << seqJ.getName() << endl; - //cout << alignment->getSeqAAln() << endl << alignment->getSeqBAln() << endl; distCalculator->calcDist(seqI, seqJ); double dist = distCalculator->getDist(); - //cout << "dist = " << dist << endl; + if (m->debug) { m->mothurOut("[DEBUG]: " + seqI.getName() + '\t' + alignment->getSeqAAln() + '\n' + seqJ.getName() + alignment->getSeqBAln() + '\n' + "distance = " + toString(dist) + "\n"); } if(dist <= cutoff){ if (output == "column") { outFile << alignDB.get(i).getName() << ' ' << alignDB.get(j).getName() << ' ' << dist << endl; } @@ -777,7 +774,9 @@ int PairwiseSeqsCommand::driver(int startLine, int endLine, string dFileName, st distCalculator->calcDist(seqI, seqJ); double dist = distCalculator->getDist(); - outFile << dist << '\t'; + outFile << dist << '\t'; + + if (m->debug) { m->mothurOut("[DEBUG]: " + seqI.getName() + '\t' + alignment->getSeqAAln() + '\n' + seqJ.getName() + alignment->getSeqBAln() + '\n' + "distance = " + toString(dist) + "\n"); } } outFile << endl; @@ -860,6 +859,8 @@ int PairwiseSeqsCommand::driverMPI(int startLine, int endLine, MPI_File& outMPI, distCalculator->calcDist(seqI, seqJ); double dist = distCalculator->getDist(); + + if (m->debug) { cout << ("[DEBUG]: " + seqI.getName() + '\t' + alignment->getSeqAAln() + '\n' + seqJ.getName() + alignment->getSeqBAln() + '\n' + "distance = " + toString(dist) + "\n"); } if(dist <= cutoff){ outputString += (alignDB.get(i).getName() + ' ' + alignDB.get(j).getName() + ' ' + toString(dist) + '\n'); @@ -968,6 +969,8 @@ int PairwiseSeqsCommand::driverMPI(int startLine, int endLine, string file, unsi distCalculator->calcDist(seqI, seqJ); double dist = distCalculator->getDist(); + + if (m->debug) { cout << ("[DEBUG]: " + seqI.getName() + '\t' + alignment->getSeqAAln() + '\n' + seqJ.getName() + alignment->getSeqBAln() + '\n' + "distance = " + toString(dist) + "\n"); } outputString += toString(dist) + "\t"; } @@ -1072,7 +1075,9 @@ int PairwiseSeqsCommand::driverMPI(int startLine, int endLine, string file, unsi distCalculator->calcDist(seqI, seqJ); double dist = distCalculator->getDist(); - outputString += toString(dist) + "\t"; + outputString += toString(dist) + "\t"; + + if (m->debug) { cout << ("[DEBUG]: " + seqI.getName() + '\t' + alignment->getSeqAAln() + '\n' + seqJ.getName() + alignment->getSeqBAln() + '\n' + "distance = " + toString(dist) + "\n"); } } outputString += "\n"; diff --git a/pairwiseseqscommand.h b/pairwiseseqscommand.h index cb68e7f..90c21b2 100644 --- a/pairwiseseqscommand.h +++ b/pairwiseseqscommand.h @@ -193,6 +193,8 @@ static DWORD WINAPI MyPairwiseSquareThreadFunction(LPVOID lpParam){ distCalculator->calcDist(seqI, seqJ); double dist = distCalculator->getDist(); + if (pDataArray->m->debug) { pDataArray->m->mothurOut("[DEBUG]: " + seqI.getName() + '\t' + alignment->getSeqAAln() + '\n' + seqJ.getName() + alignment->getSeqBAln() + '\n' + "distance = " + toString(dist) + "\n"); } + outFile << dist << '\t'; } @@ -293,6 +295,8 @@ static DWORD WINAPI MyPairwiseThreadFunction(LPVOID lpParam){ distCalculator->calcDist(seqI, seqJ); double dist = distCalculator->getDist(); + if (pDataArray->m->debug) { pDataArray->m->mothurOut("[DEBUG]: " + seqI.getName() + '\t' + alignment->getSeqAAln() + '\n' + seqJ.getName() + alignment->getSeqBAln() + '\n' + "distance = " + toString(dist) + "\n"); } + if(dist <= pDataArray->cutoff){ if (pDataArray->output == "column") { outFile << pDataArray->alignDB.get(i).getName() << ' ' << pDataArray->alignDB.get(j).getName() << ' ' << dist << endl; } } diff --git a/qualityscores.cpp b/qualityscores.cpp index 0408b83..1f86efc 100644 --- a/qualityscores.cpp +++ b/qualityscores.cpp @@ -44,7 +44,7 @@ QualityScores::QualityScores(ifstream& qFile){ while(qFile.peek() != '>' && qFile.peek() != EOF){ if (m->control_pressed) { break; } string temp = m->getline(qFile); m->gobble(qFile); - if (m->debug) { m->mothurOut("[DEBUG]: scores = '" + temp + "'\n."); } + //if (m->debug) { m->mothurOut("[DEBUG]: scores = '" + temp + "'\n."); } qScoreString += ' ' + temp; } //cout << "done reading " << endl; @@ -55,7 +55,7 @@ QualityScores::QualityScores(ifstream& qFile){ string temp; qScoreStringStream >> temp; m->gobble(qScoreStringStream); - if (m->debug) { m->mothurOut("[DEBUG]: score " + toString(qScores.size()) + " = '" + temp + "'\n."); } + //if (m->debug) { m->mothurOut("[DEBUG]: score " + toString(qScores.size()) + " = '" + temp + "'\n."); } //check temp to make sure its a number if (!m->isContainingOnlyDigits(temp)) { m->mothurOut("[ERROR]: In sequence " + seqName + "'s quality scores, expected a number and got " + temp + ", setting score to 0."); m->mothurOutEndLine(); temp = "0"; } diff --git a/seqsummarycommand.cpp b/seqsummarycommand.cpp index e6b037b..0e3b70c 100644 --- a/seqsummarycommand.cpp +++ b/seqsummarycommand.cpp @@ -376,7 +376,7 @@ int SeqSummaryCommand::execute(){ double meanstartPosition, meanendPosition, meanseqLength, meanambigBases, meanlongHomoPolymer; - meanstartPosition /= (double) size; meanendPosition /= (double) size; meanlongHomoPolymer /= (double) size; meanseqLength /= (double) size; meanambigBases /= (double) size; + meanstartPosition = meanStartPosition / (double) size; meanendPosition = meanEndPosition /(double) size; meanlongHomoPolymer = meanLongHomoPolymer / (double) size; meanseqLength = meanSeqLength / (double) size; meanambigBases = meanAmbigBases /(double) size; int ptile0_25 = int(size * 0.025); int ptile25 = int(size * 0.250); diff --git a/sffinfocommand.cpp b/sffinfocommand.cpp index c80ac2e..399f253 100644 --- a/sffinfocommand.cpp +++ b/sffinfocommand.cpp @@ -559,6 +559,7 @@ int SffInfoCommand::extractSffInfo(string input, string accnos, string oligos){ if (m->control_pressed) { count = 0; break; } if (count >= header.numReads) { break; } + //if (count >= 100) { break; } } //report progress @@ -651,6 +652,8 @@ int SffInfoCommand::readCommonHeader(ifstream& in, CommonHeader& header){ char buffer4 [4]; in.read(buffer4, 4); header.numReads = be_int4(*(unsigned int *)(&buffer4)); + + if (m->debug) { m->mothurOut("[DEBUG]: numReads = " + toString(header.numReads) + "\n"); } //read header length char buffer5 [2]; @@ -716,7 +719,7 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){ for (int i = 0; i < filehandlesHeaders.size(); i++) { for (int j = 0; j < filehandlesHeaders[i].size(); j++) { ofstream out; - m->openOutputFileAppend(filehandlesHeaders[i][j], out); + m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); out.write(mybuffer, in.gcount()); out.close(); } @@ -729,7 +732,7 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){ for (int i = 0; i < filehandlesHeaders.size(); i++) { for (int j = 0; j < filehandlesHeaders[i].size(); j++) { ofstream out; - m->openOutputFileAppend(filehandlesHeaders[i][j], out); + m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); out.write(mybuffer, in.gcount()); out.close(); } @@ -752,7 +755,7 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){ thisbuffer[6] = (offset >> 8) & 0xFF; thisbuffer[7] = offset & 0xFF; ofstream out; - m->openOutputFileAppend(filehandlesHeaders[i][j], out); + m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); out.write(thisbuffer, 8); out.close(); } @@ -766,8 +769,8 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){ for (int i = 0; i < filehandlesHeaders.size(); i++) { for (int j = 0; j < filehandlesHeaders[i].size(); j++) { ofstream out; - m->openOutputFileAppend(filehandlesHeaders[i][j], out); - int offset = 0; + m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); + unsigned int offset = 0; char* thisbuffer = new char[4]; thisbuffer[0] = (offset >> 24) & 0xFF; thisbuffer[1] = (offset >> 16) & 0xFF; @@ -786,13 +789,20 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){ for (int i = 0; i < filehandlesHeaders.size(); i++) { for (int j = 0; j < filehandlesHeaders[i].size(); j++) { ofstream out; - m->openOutputFileAppend(filehandlesHeaders[i][j], out); + m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); //convert number of reads to 4 byte char* char* thisbuffer = new char[4]; - thisbuffer[0] = (numSplitReads[i][j] >> 24) & 0xFF; - thisbuffer[1] = (numSplitReads[i][j] >> 16) & 0xFF; - thisbuffer[2] = (numSplitReads[i][j] >> 8) & 0xFF; - thisbuffer[3] = numSplitReads[i][j] & 0xFF; + if ((m->findEdianness()) == "BIG_ENDIAN") { + thisbuffer[0] = (numSplitReads[i][j] >> 24) & 0xFF; + thisbuffer[1] = (numSplitReads[i][j] >> 16) & 0xFF; + thisbuffer[2] = (numSplitReads[i][j] >> 8) & 0xFF; + thisbuffer[3] = numSplitReads[i][j] & 0xFF; + }else { + thisbuffer[0] = numSplitReads[i][j] & 0xFF; + thisbuffer[1] = (numSplitReads[i][j] >> 8) & 0xFF; + thisbuffer[2] = (numSplitReads[i][j] >> 16) & 0xFF; + thisbuffer[3] = (numSplitReads[i][j] >> 24) & 0xFF; + } out.write(thisbuffer, 4); out.close(); delete[] thisbuffer; @@ -805,7 +815,7 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){ for (int i = 0; i < filehandlesHeaders.size(); i++) { for (int j = 0; j < filehandlesHeaders[i].size(); j++) { ofstream out; - m->openOutputFileAppend(filehandlesHeaders[i][j], out); + m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); out.write(mybuffer, in.gcount()); out.close(); } @@ -818,7 +828,7 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){ for (int i = 0; i < filehandlesHeaders.size(); i++) { for (int j = 0; j < filehandlesHeaders[i].size(); j++) { ofstream out; - m->openOutputFileAppend(filehandlesHeaders[i][j], out); + m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); out.write(mybuffer, in.gcount()); out.close(); } @@ -831,7 +841,7 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){ for (int i = 0; i < filehandlesHeaders.size(); i++) { for (int j = 0; j < filehandlesHeaders[i].size(); j++) { ofstream out; - m->openOutputFileAppend(filehandlesHeaders[i][j], out); + m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); out.write(mybuffer, in.gcount()); out.close(); } @@ -844,7 +854,7 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){ for (int i = 0; i < filehandlesHeaders.size(); i++) { for (int j = 0; j < filehandlesHeaders[i].size(); j++) { ofstream out; - m->openOutputFileAppend(filehandlesHeaders[i][j], out); + m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); out.write(mybuffer, in.gcount()); out.close(); } @@ -857,7 +867,7 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){ for (int i = 0; i < filehandlesHeaders.size(); i++) { for (int j = 0; j < filehandlesHeaders[i].size(); j++) { ofstream out; - m->openOutputFileAppend(filehandlesHeaders[i][j], out); + m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); out.write(mybuffer, in.gcount()); out.close(); } @@ -870,7 +880,7 @@ int SffInfoCommand::adjustCommonHeader(CommonHeader header){ for (int i = 0; i < filehandlesHeaders.size(); i++) { for (int j = 0; j < filehandlesHeaders[i].size(); j++) { ofstream out; - m->openOutputFileAppend(filehandlesHeaders[i][j], out); + m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); out.write(mybuffer, in.gcount()); out.close(); } @@ -913,7 +923,7 @@ int SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, H //read header length char buffer [2]; - in.read(buffer, 2); + in.read(buffer, 2); header.headerLength = be_int2(*(unsigned short *)(&buffer)); //read name length @@ -925,33 +935,39 @@ int SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, H char buffer3 [4]; in.read(buffer3, 4); header.numBases = be_int4(*(unsigned int *)(&buffer3)); + //read clip qual left char buffer4 [2]; in.read(buffer4, 2); header.clipQualLeft = be_int2(*(unsigned short *)(&buffer4)); - header.clipQualLeft = 5; + header.clipQualLeft = 5; + //read clip qual right char buffer5 [2]; in.read(buffer5, 2); header.clipQualRight = be_int2(*(unsigned short *)(&buffer5)); + //read clipAdapterLeft char buffer6 [2]; in.read(buffer6, 2); header.clipAdapterLeft = be_int2(*(unsigned short *)(&buffer6)); + //read clipAdapterRight char buffer7 [2]; in.read(buffer7, 2); header.clipAdapterRight = be_int2(*(unsigned short *)(&buffer7)); + //read name char* tempBuffer = new char[header.nameLength]; in.read(&(*tempBuffer), header.nameLength); header.name = tempBuffer; if (header.name.length() > header.nameLength) { header.name = header.name.substr(0, header.nameLength); } + delete[] tempBuffer; //extract info from name @@ -1014,11 +1030,12 @@ int SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, H int trashCodeLength = findGroup(header, read, barcodeIndex, primerIndex); if(trashCodeLength == 0){ + //cout << header.name << " length = " << spot << '\t' << startSpotInFile << '\t' << in2.gcount() << endl; + ofstream out; m->openOutputFileBinaryAppend(filehandles[barcodeIndex][primerIndex], out); out.write(mybuffer, in2.gcount()); out.close(); - delete[] mybuffer; numSplitReads[barcodeIndex][primerIndex]++; } else{ @@ -1026,9 +1043,8 @@ int SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, H m->openOutputFileBinaryAppend(noMatchFile, out); out.write(mybuffer, in2.gcount()); out.close(); - delete[] mybuffer; } - + delete[] mybuffer; } }else{ m->mothurOut("Error reading."); m->mothurOutEndLine(); @@ -1771,7 +1787,7 @@ bool SffInfoCommand::readOligos(string oligoFile){ } filehandles[itBar->second][itPrimer->second] = thisFilename; - m->openOutputFile(thisFilename, temp); temp.close(); + temp.open(thisFilename.c_str(), ios::binary); temp.close(); } } } diff --git a/shhhercommand.cpp b/shhhercommand.cpp index 4a6e5ff..c405e38 100644 --- a/shhhercommand.cpp +++ b/shhhercommand.cpp @@ -1690,10 +1690,8 @@ void ShhherCommand::writeQualities(vector otuCounts){ if (m->control_pressed) { break; } int index = 0; - int base = 0; if(nSeqsPerOTU[i] > 0){ - qualities[i].assign(1024, -1); while(index < numFlowCells){ double maxPrValue = 1e8; @@ -1742,8 +1740,7 @@ void ShhherCommand::writeQualities(vector otuCounts){ value = (int)floor(temp); if(value > 100){ value = 100; } - qualities[i][base] = (int)value; - base++; + qualities[i].push_back((int)value); } } @@ -1754,12 +1751,7 @@ void ShhherCommand::writeQualities(vector otuCounts){ if(otuCounts[i] > 0){ qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl; - - int j=4; //need to get past the first four bases - while(qualities[i][j] != -1){ - qualityFile << qualities[i][j] << ' '; - j++; - } + for (int j = 4; j < qualities[i].size(); j++) { qualityFile << qualities[i][j] << ' '; } qualityFile << endl; } } @@ -2394,16 +2386,22 @@ int ShhherCommand::driver(vector filenames, string thisCompositeFASTAFil m->mothurOut("\nFinalizing...\n"); fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI); + if (m->debug) { m->mothurOut("[DEBUG]: done fill().\n"); } + if (m->control_pressed) { break; } setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI); + if (m->debug) { m->mothurOut("[DEBUG]: done setOTUs().\n"); } + if (m->control_pressed) { break; } vector otuCounts(numOTUs, 0); for(int j=0;jdebug) { m->mothurOut("[DEBUG]: done calcCentroidsDriver().\n"); } if (m->control_pressed) { break; } @@ -3280,12 +3278,11 @@ void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string quality if (m->control_pressed) { break; } int index = 0; - int base = 0; - + if(nSeqsPerOTU[i] > 0){ - qualities[i].assign(1024, -1); while(index < numFlowCells){ + double maxPrValue = 1e8; short maxPrIndex = -1; double count = 0.0000; @@ -3304,7 +3301,7 @@ void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string quality pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity]; } } - + maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index]; maxPrValue = pr[maxPrIndex]; @@ -3315,7 +3312,7 @@ void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string quality for(int s=0;s 100){ value = 100; } - qualities[i][base] = (int)value; - base++; + qualities[i].push_back((int)value); } - } + }//end if index++; - } - } - + + }//end while + + }//end if + if(otuCounts[i] > 0){ qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl; - - int j=4; //need to get past the first four bases - while(qualities[i][j] != -1){ - qualityFile << qualities[i][j] << ' '; - if (j > qualities[i].size()) { break; } - j++; - } + //need to get past the first four bases + for (int j = 4; j < qualities[i].size(); j++) { qualityFile << qualities[i][j] << ' '; } qualityFile << endl; } - } + }//end for qualityFile.close(); outputNames.push_back(qualityFileName); outputTypes["qfile"].push_back(qualityFileName); diff --git a/splitgroupscommand.cpp b/splitgroupscommand.cpp index c15ba3a..5c01037 100644 --- a/splitgroupscommand.cpp +++ b/splitgroupscommand.cpp @@ -362,7 +362,7 @@ int SplitGroupCommand::runCount(){ for (int i = 0; i < thisSeqsGroups.size(); i++) { if (m->inUsersGroups(thisSeqsGroups[i], Groups)) { //if this sequence belongs to a group we want them print seq.printSequence(*(ffiles[thisSeqsGroups[i]])); - int numSeqs = ct.getGroupCount(seq.getName(), Groups[i]); + int numSeqs = ct.getGroupCount(seq.getName(), thisSeqsGroups[i]); (*(cfiles[thisSeqsGroups[i]])) << seq.getName() << '\t' << numSeqs << '\t' << numSeqs << endl; } } diff --git a/subsamplecommand.cpp b/subsamplecommand.cpp index fe79fbb..9e27fae 100644 --- a/subsamplecommand.cpp +++ b/subsamplecommand.cpp @@ -552,7 +552,7 @@ int SubSampleCommand::getSubSampleFasta() { else{ itGroupCounts = groupCounts.find(group); if (itGroupCounts != groupCounts.end()) { - if (groupCounts[group] < size) { subset.insert(names[j]); groupCounts[group]++; } + if (itGroupCounts->second < size) { subset.insert(names[j]); (itGroupCounts->second)++; } } } } diff --git a/trimflowscommand.cpp b/trimflowscommand.cpp index 2a16351..fd55edf 100644 --- a/trimflowscommand.cpp +++ b/trimflowscommand.cpp @@ -637,8 +637,7 @@ void TrimFlowsCommand::getOligos(vector >& outFlowFileNames){ primers[""] = 0; primerNameVector.push_back(""); } - - + outFlowFileNames.resize(barcodeNameVector.size()); for(int i=0;idebug) { m->mothurOut("[DEBUG]: " + seq.getName() + " aligned fragment =" + temp + ", barcode =" + oligo + ", numDiffs = " + toString(numDiff) + "\n"); } + if(numDiff < minDiff){ minDiff = numDiff; minCount = 1; @@ -487,6 +489,8 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, int& gr temp = temp.substr(0,alnLength); int numDiff = countDiffs(oligo, temp); + if (m->debug) { m->mothurOut("[DEBUG]: forward " + forwardSeq.getName() + " aligned fragment=" + temp + ", barcode=" + oligo + ", numDiffs=" + toString(numDiff) + ".\n"); } + if (alnLength == 0) { numDiff = bdiffs + 100; } //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl; @@ -548,6 +552,9 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, int& gr oligo = oligo.substr(0,alnLength); temp = temp.substr(0,alnLength); int numDiff = countDiffs(oligo, temp); + + if (m->debug) { m->mothurOut("[DEBUG]: reverse " + forwardSeq.getName() + " aligned fragment=" + temp + ", barcode=" + oligo + ", numDiffs=" + toString(numDiff) + ".\n"); } + if (alnLength == 0) { numDiff = bdiffs + 100; } //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl; @@ -700,7 +707,7 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, Quality int numDiff = countDiffs(oligo, temp); if (alnLength == 0) { numDiff = bdiffs + 100; } - //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl; + if (m->debug) { m->mothurOut("[DEBUG]: forward " + forwardSeq.getName() + " aligned fragment=" + temp + ", barcode=" + oligo + ", numDiffs=" + toString(numDiff) + ".\n"); } if(numDiff < minDiff){ minDiff = numDiff; @@ -762,6 +769,8 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, Quality int numDiff = countDiffs(oligo, temp); if (alnLength == 0) { numDiff = bdiffs + 100; } + if (m->debug) { m->mothurOut("[DEBUG]: reverse " + reverseSeq.getName() + " aligned fragment=" + temp + ", barcode=" + oligo + ", numDiffs=" + toString(numDiff) + ".\n"); } + //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl; if(numDiff < minDiff){ minDiff = numDiff; @@ -1370,6 +1379,8 @@ int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, Quality if (alnLength == 0) { numDiff = pdiffs + 100; } //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl; + if (m->debug) { m->mothurOut("[DEBUG]: forward " + forwardSeq.getName() + " aligned fragment=" + temp + ", primer=" + oligo + ", numDiffs=" + toString(numDiff) + ".\n"); } + if(numDiff < minDiff){ minDiff = numDiff; minCount = 1; @@ -1430,6 +1441,8 @@ int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, Quality int numDiff = countDiffs(oligo, temp); if (alnLength == 0) { numDiff = pdiffs + 100; } + if (m->debug) { m->mothurOut("[DEBUG]: reverse " + forwardSeq.getName() + " aligned fragment=" + temp + ", primer=" + oligo + ", numDiffs=" + toString(numDiff) + ".\n"); } + //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl; if(numDiff < minDiff){ minDiff = numDiff; @@ -1579,6 +1592,8 @@ int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, int& gr temp = temp.substr(0,alnLength); int numDiff = countDiffs(oligo, temp); + if (m->debug) { m->mothurOut("[DEBUG]: forward " + forwardSeq.getName() + " aligned fragment=" + temp + ", primer=" + oligo + ", numDiffs=" + toString(numDiff) + ".\n"); } + if (alnLength == 0) { numDiff = pdiffs + 100; } //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl; @@ -1640,6 +1655,9 @@ int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, int& gr oligo = oligo.substr(0,alnLength); temp = temp.substr(0,alnLength); int numDiff = countDiffs(oligo, temp); + + if (m->debug) { m->mothurOut("[DEBUG]: reverse " + forwardSeq.getName() + " aligned fragment=" + temp + ", primer=" + oligo + ", numDiffs=" + toString(numDiff) + ".\n"); } + if (alnLength == 0) { numDiff = pdiffs + 100; } //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl; @@ -1773,6 +1791,8 @@ int TrimOligos::stripBarcode(Sequence& seq, int& group){ int numDiff = countDiffs(oligo, temp); + if (m->debug) { m->mothurOut("[DEBUG]: " + seq.getName() + " aligned fragment =" + temp + ", barcode =" + oligo + ", numDiffs = " + toString(numDiff) + "\n"); } + if(numDiff < minDiff){ minDiff = numDiff; minCount = 1; diff --git a/unifracunweightedcommand.cpp b/unifracunweightedcommand.cpp index 272ae82..d2c459d 100644 --- a/unifracunweightedcommand.cpp +++ b/unifracunweightedcommand.cpp @@ -736,7 +736,7 @@ int UnifracUnweightedCommand::runRandomCalcs(Tree* thisTree, vector user float rcumul = 1.0000; //this loop fills the cumulative maps and put 0.0000 in the score freq map to make it easier to print. - for (map::iterator it = validScores.begin(); it != validScores.end(); it++) { + for (map::iterator it = validScores.begin(); it != validScores.end(); it++) { //make rscoreFreq map and rCumul map::iterator it2 = rscoreFreq[a].find(it->first); rCumul[a][it->first] = rcumul; diff --git a/unifracweightedcommand.cpp b/unifracweightedcommand.cpp index 3b0c53b..f96264f 100644 --- a/unifracweightedcommand.cpp +++ b/unifracweightedcommand.cpp @@ -915,7 +915,7 @@ void UnifracWeightedCommand::printWeightedFile() { for(int a = 0; a < numComp; a++) { output->initFile(groupComb[a], tags); //print each line - for (map::iterator it = validScores.begin(); it != validScores.end(); it++) { + for (map::iterator it = validScores.begin(); it != validScores.end(); it++) { data.push_back(it->first); data.push_back(rScoreFreq[a][it->first]); data.push_back(rCumul[a][it->first]); output->output(data); data.clear(); @@ -1085,7 +1085,7 @@ void UnifracWeightedCommand::calculateFreqsCumuls() { for (int f = 0; f < numComp; f++) { for (int i = 0; i < rScores[f].size(); i++) { //looks like 0,0,1,1,1,2,4,7... you want to make a map that say rScoreFreq[0] = 2, rScoreFreq[1] = 3... validScores[rScores[f][i]] = rScores[f][i]; - map::iterator it = rScoreFreq[f].find(rScores[f][i]); + map::iterator it = rScoreFreq[f].find(rScores[f][i]); if (it != rScoreFreq[f].end()) { rScoreFreq[f][rScores[f][i]]++; }else{ @@ -1098,9 +1098,9 @@ void UnifracWeightedCommand::calculateFreqsCumuls() { for(int a = 0; a < numComp; a++) { float rcumul = 1.0000; //this loop fills the cumulative maps and put 0.0000 in the score freq map to make it easier to print. - for (map::iterator it = validScores.begin(); it != validScores.end(); it++) { + for (map::iterator it = validScores.begin(); it != validScores.end(); it++) { //make rscoreFreq map and rCumul - map::iterator it2 = rScoreFreq[a].find(it->first); + map::iterator it2 = rScoreFreq[a].find(it->first); rCumul[a][it->first] = rcumul; //get percentage of random trees with that info if (it2 != rScoreFreq[a].end()) { rScoreFreq[a][it->first] /= iters; rcumul-= it2->second; } diff --git a/unifracweightedcommand.h b/unifracweightedcommand.h index 10cbc7b..a5507cd 100644 --- a/unifracweightedcommand.h +++ b/unifracweightedcommand.h @@ -54,9 +54,9 @@ class UnifracWeightedCommand : public Command { int iters, numGroups, numComp, counter; vector< vector > rScores; //vector each group comb has an entry vector< vector > uScores; //vector each group comb has an entry - vector< map > rScoreFreq; //map -vector entry for each combination. - vector< map > rCumul; //map -vector entry for each c - map validScores; //map contains scores from random + vector< map > rScoreFreq; //map -vector entry for each combination. + vector< map > rCumul; //map -vector entry for each c + map validScores; //map contains scores from random bool abort, phylip, random, includeRoot, subsample, consensus; string groups, itersString, outputForm, treefile, groupfile, namefile, countfile; -- 2.39.2