GCC_OPTIMIZATION_LEVEL = 3;
"INSTALL_PATH[sdk=*]" = TARGET_BUILD_DIR;
PRODUCT_NAME = mothur;
- SDKROOT = macosx10.8;
+ SDKROOT = macosx;
SKIP_INSTALL = NO;
};
name = Debug;
"VERSION=\"\\\"1.31.0\\\"\"",
"RELEASE_DATE=\"\\\"5/24/2013\\\"\"",
);
+ GCC_VERSION = "";
"GCC_VERSION[arch=*]" = "";
GCC_WARN_ABOUT_MISSING_NEWLINE = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES;
"-lncurses",
"-lreadline",
);
- SDKROOT = macosx;
+ SDKROOT = macosx10.8;
SKIP_INSTALL = NO;
USER_HEADER_SEARCH_PATHS = "";
};
"VERSION=\"\\\"1.31.0\\\"\"",
"RELEASE_DATE=\"\\\"5/24/2013\\\"\"",
);
+ GCC_VERSION = "";
GCC_WARN_ABOUT_MISSING_NEWLINE = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES;
GCC_WARN_MISSING_PARENTHESES = YES;
"-lncurses",
"-lreadline",
);
- SDKROOT = macosx;
+ SDKROOT = macosx10.8;
SKIP_INSTALL = NO;
};
name = Release;
<?xml version="1.0" encoding="UTF-8"?>
<Scheme
+ LastUpgradeVersion = "0460"
version = "1.3">
<BuildAction
parallelizeBuildables = "YES"
</BuildActionEntries>
</BuildAction>
<TestAction
- selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.GDB"
+ selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.GDB"
shouldUseLaunchSchemeArgsEnv = "YES"
buildConfiguration = "Debug">
</MacroExpansion>
</TestAction>
<LaunchAction
- selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.GDB"
+ selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.GDB"
launchStyle = "0"
useCustomWorkingDirectory = "NO"
buildConfiguration = "Debug"
+ ignoresPersistentStateOnLaunch = "NO"
debugDocumentVersioning = "YES"
allowLocationSimulation = "YES">
<BuildableProductRunnable>
if (m->control_pressed) { break; }
Sequence* candidateSeq = new Sequence(inFASTA); m->gobble(inFASTA);
- cout << candidateSeq->getAligned() << endl;
report.setCandidate(candidateSeq);
int origNumBases = candidateSeq->getNumBases();
in >> name; m->gobble(in); in >> thisTotal; m->gobble(in);
if (m->debug) { m->mothurOut("[DEBUG]: " + name + '\t' + toString(thisTotal) + "\n"); }
+ if (thisTotal == 0) { error=true; m->mothurOut("[ERROR]: Your count table contains a sequence named " + name + " with a total=0. Please correct."); m->mothurOutEndLine();
+ }
+
//if group info, then read it
vector<int> groupCounts; groupCounts.resize(numGroups, 0);
if (columnHeaders.size() > 2) { //file contains groups
fileHandle >> filter;
fileHandle.close();
+
+ if (filter.length() != alignmentLength) { m->mothurOut("[ERROR]: Sequences are not all the same length as the filter, please correct.\n"); m->control_pressed = true; }
}
void getFreqs(Sequence seq) {
CommandParameter psubclass("subclass", "String", "", "", "", "", "","",false,false); parameters.push_back(psubclass);
CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel);
CommandParameter pclasses("classes", "String", "", "", "", "", "","",false,false); parameters.push_back(pclasses);
- CommandParameter palpha("anova_alpha", "Number", "", "0.05", "", "", "","",false,false); parameters.push_back(palpha);
- CommandParameter pwalpha("wilcoxon_alpha", "Number", "", "0.05", "", "", "","",false,false); parameters.push_back(pwalpha);
+ CommandParameter palpha("aalpha", "Number", "", "0.05", "", "", "","",false,false); parameters.push_back(palpha);
+ CommandParameter pwalpha("walpha", "Number", "", "0.05", "", "", "","",false,false); parameters.push_back(pwalpha);
+
+ CommandParameter plda("lda", "Number", "", "2.0", "", "", "","",false,false); parameters.push_back(plda);
+ CommandParameter pwilc("wilc", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pwilc);
+ CommandParameter pnormmillion("norm", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pnormmillion);
+ CommandParameter piters("iters", "Number", "", "30", "", "", "","",false,false); parameters.push_back(piters);
+ //CommandParameter pwilcsamename("wilcsamename", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pwilcsamename);
+ CommandParameter pcurv("curv", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pcurv);
+ CommandParameter pfiters("fboots", "Number", "", "0.67", "", "", "","",false,false); parameters.push_back(pfiters);
+ CommandParameter pstrict("strict", "Multiple", "0-1-2", "0", "", "", "","",false,false); parameters.push_back(pstrict);
+ CommandParameter pminc("minc", "Number", "", "10", "", "", "","",false,false); parameters.push_back(pminc);
+ CommandParameter pmulticlass_strat("multiclass", "Multiple", "onevone-onevall", "onevall", "", "", "","",false,false); parameters.push_back(pmulticlass_strat);
+ //CommandParameter psubject("subject", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(psubject);
+
+
+ //not used in their current code, but in parameters
+ //CommandParameter pnlogs("nlogs", "Number", "", "3", "", "", "","",false,false); parameters.push_back(pnlogs);
+ //CommandParameter pranktec("ranktec", "Multiple", "lda-svm", "lda", "", "", "","",false,false); parameters.push_back(pranktec); // svm not implemented in their source yet.
+ //CommandParameter psvmnorm("svmnorm", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(psvmnorm); //not used because svm not implemented yet.
+
+
//every command must have inputdir and outputdir. This allows mothur users to redirect input and output files.
CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
try {
string helpString = "";
helpString += "The lefse command allows you to ....\n";
- helpString += "The lefse command parameters are: shared, design, class, subclass, label, classes, wilcoxon_alpha, anova_alpha.\n";
+ helpString += "The lefse command parameters are: shared, design, class, subclass, label, classes, walpha, aalpha, lda, wilc, iters, curv, fboots, strict, minc, multiclass and norm.\n";
helpString += "The class parameter is used to indicate the which category you would like used for the Kruskal Wallis analysis. If none is provided first category is used.\n";
- helpString += "The subclass parameter is used to indicate the .....If none is provided second category is used, or if only one category subclass is ignored. \n";
- helpString += "The classes parameter is used to indicate the classes you would like to use. Clases should be inputted in the following format: classes=label<value1|value2|value3>-label<value1|value2>. For example to include groups from treatment with value early or late and age= young or old. class=treatment<Early|Late>-age<young|old>.\n";
+ helpString += "The subclass parameter is used to indicate the .....If none is provided, second category is used, or if only one category subclass is ignored. \n";
+ helpString += "The aalpha parameter is used to set the alpha value for the Krukal Wallis Anova test Default=0.05. \n";
+ helpString += "The walpha parameter is used to set the alpha value for the Wilcoxon test. Default=0.05. \n";
+ helpString += "The lda parameter is used to set the threshold on the absolute value of the logarithmic LDA score. Default=2.0. \n";
+ helpString += "The wilc parameter is used to indicate whether to perform the Wilcoxon test. Default=T. \n";
+ helpString += "The iters parameter is used to set the number of bootstrap iteration for LDA. Default=30. \n";
+ //helpString += "The wilcsamename parameter is used to indicate whether perform the wilcoxon test only among the subclasses with the same name. Default=F. \n";
+ helpString += "The curv parameter is used to set whether perform the wilcoxon test ing the Curtis's approach [BETA VERSION] Default=F. \n";
+ helpString += "The norm parameter is used to multiply relative abundances by 1000000. Recommended when very low values are present. Default=T. \n";
+ helpString += "The fboots parameter is used to set the subsampling fraction value for each bootstrap iteration. Default=0.67. \n";
+ helpString += "The strict parameter is used to set the multiple testing correction options. 0 no correction (more strict, default), 1 correction for independent comparisons, 2 correction for independent comparison. Options = 0,1,2. Default=0. \n";
+ helpString += "The minc parameter is used to minimum number of samples per subclass for performing wilcoxon test. Default=10. \n";
+ helpString += "The multiclass parameter is used to (for multiclass tasks) set whether the test is performed in a one-against-one ( onevone - more strict!) or in a one-against-all setting ( onevall - less strict). Default=onevall. \n";
+ helpString += "The classes parameter is used to indicate the classes you would like to use. Classes should be inputted in the following format: classes=label<value1|value2|value3>-label<value1|value2>. For example to include groups from treatment with value early or late and age= young or old. class=treatment<Early|Late>-age<young|old>.\n";
helpString += "The label parameter is used to indicate which distances in the shared file you would like to use. labels are separated by dashes.\n";
helpString += "The lefse command should be in the following format: lefse(shared=final.an.shared, design=final.design, class=treatment, subclass=age).\n";
return helpString;
string pattern = "";
if (type == "summary") { pattern = "[filename],[distance],lefse_summary"; }
- else if (type == "kruskall-wallis") { pattern = "[filename],[distance],kruskall_wallis"; }
- else if (type == "wilcoxon") { pattern = "[filename],[distance],wilcoxon"; }
else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
return pattern;
setParameters();
vector<string> tempOutNames;
outputTypes["summary"] = tempOutNames;
- outputTypes["kruskall-wallis"] = tempOutNames;
- outputTypes["wilcoxon"] = tempOutNames;
}
catch(exception& e) {
m->errorOut(e, "LefseCommand", "LefseCommand");
vector<string> tempOutNames;
outputTypes["summary"] = tempOutNames;
- outputTypes["kruskall-wallis"] = tempOutNames;
- outputTypes["wilcoxon"] = tempOutNames;
//if the user changes the input directory command factory will send this info to us in the output parameter
string inputDir = validParameter.validFile(parameters, "inputdir", false);
if (mclass == "not found") { mclass = ""; }
subclass = validParameter.validFile(parameters, "subclass", false);
- if (subclass == "not found") { subclass = ""; }
+ if (subclass == "not found") { subclass = mclass; }
classes = validParameter.validFile(parameters, "classes", false);
if (classes == "not found") { classes = ""; }
- string temp = validParameter.validFile(parameters, "anova_alpha", false);
+ string temp = validParameter.validFile(parameters, "aalpha", false);
if (temp == "not found") { temp = "0.05"; }
m->mothurConvert(temp, anovaAlpha);
- temp = validParameter.validFile(parameters, "wilcoxon_alpha", false);
+ temp = validParameter.validFile(parameters, "walpha", false);
if (temp == "not found") { temp = "0.05"; }
m->mothurConvert(temp, wilcoxonAlpha);
+
+ temp = validParameter.validFile(parameters, "wilc", false);
+ if (temp == "not found") { temp = "T"; }
+ wilc = m->isTrue(temp);
+
+ temp = validParameter.validFile(parameters, "norm", false);
+ if (temp == "not found") { temp = "T"; }
+ normMillion = m->isTrue(temp);
+
+ //temp = validParameter.validFile(parameters, "subject", false);
+ //if (temp == "not found") { temp = "F"; }
+ //subject = m->isTrue(temp);
+ temp = validParameter.validFile(parameters, "lda", false);
+ if (temp == "not found") { temp = "2.0"; }
+ m->mothurConvert(temp, ldaThreshold);
+
+ temp = validParameter.validFile(parameters, "iters", false);
+ if (temp == "not found") { temp = "30"; }
+ m->mothurConvert(temp, iters);
+
+ temp = validParameter.validFile(parameters, "fboots", false);
+ if (temp == "not found") { temp = "0.67"; }
+ m->mothurConvert(temp, fBoots);
+
+ //temp = validParameter.validFile(parameters, "wilcsamename", false);
+ //if (temp == "not found") { temp = "F"; }
+ //wilcsamename = m->isTrue(temp);
+
+ temp = validParameter.validFile(parameters, "curv", false);
+ if (temp == "not found") { temp = "F"; }
+ curv = m->isTrue(temp);
+
+ temp = validParameter.validFile(parameters, "strict", false);
+ if (temp == "not found"){ temp = "0"; }
+ if ((temp != "0") && (temp != "1") && (temp != "2")) { m->mothurOut("Invalid strict option: choices are 0, 1 or 2."); m->mothurOutEndLine(); abort=true; }
+ else { m->mothurConvert(temp, strict); }
+
+ temp = validParameter.validFile(parameters, "minc", false);
+ if (temp == "not found") { temp = "10"; }
+ m->mothurConvert(temp, minC);
+
+ multiClassStrat = validParameter.validFile(parameters, "multiclass", false);
+ if (multiClassStrat == "not found"){ multiClassStrat = "onevall"; }
+ if ((multiClassStrat != "onevall") && (multiClassStrat != "onevone")) { m->mothurOut("Invalid multiclass option: choices are onevone or onevall."); m->mothurOutEndLine(); abort=true; }
}
}
int LefseCommand::execute(){
try {
+ //for reading lefse formatted file and running in mothur for testing - pass number of rows used for design file
+ if (false) { makeShared(1); exit(1); }
if (abort == true) { if (calledHelp) { return 0; } return 2; }
if (mclass == "") { mclass = designMap.getDefaultClass(); m->mothurOut("\nYou did not provide a class, using " + mclass +".\n\n"); }
InputData input(sharedfile, "sharedfile");
- vector<SharedRAbundVector*> lookup = input.getSharedRAbundVectors();
+ vector<SharedRAbundFloatVector*> lookup = input.getSharedRAbundFloatVectors();
string lastLabel = lookup[0]->getLabel();
//if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
string saveLabel = lookup[0]->getLabel();
for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
- lookup = input.getSharedRAbundVectors(lastLabel);
+ lookup = input.getSharedRAbundFloatVectors(lastLabel);
m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
process(lookup, designMap);
if (m->control_pressed) { return 0; }
//get next line to process
- lookup = input.getSharedRAbundVectors();
+ lookup = input.getSharedRAbundFloatVectors();
}
if (m->control_pressed) { return 0; }
//run last label if you need to
if (needToRun == true) {
for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }
- lookup = input.getSharedRAbundVectors(lastLabel);
+ lookup = input.getSharedRAbundFloatVectors(lastLabel);
m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
process(lookup, designMap);
}
//**********************************************************************************************************************
-int LefseCommand::process(vector<SharedRAbundVector*>& lookup, DesignMap& designMap) {
+int LefseCommand::process(vector<SharedRAbundFloatVector*>& lookup, DesignMap& designMap) {
try {
+ vector<string> classes;
+ vector<string> subclasses;
+ map<string, string> subclass2Class;
+ map<string, set<string> > class2SubClasses; //maps class name to vector of its subclasses
+ map<string, vector<int> > subClass2GroupIndex; //maps subclass name to vector of indexes in lookup from that subclass. old -> 1,2,3 means groups in location 1,2,3 of lookup are from old. Saves time below.
+ map<string, vector<int> > class2GroupIndex; //maps subclass name to vector of indexes in lookup from that class. old -> 1,2,3 means groups in location 1,2,3 of lookup are from old. Saves time below.
+ for (int j = 0; j < lookup.size(); j++) {
+ if (normMillion) { normalize(lookup); }
+ string group = lookup[j]->getGroup();
+ string treatment = designMap.get(group, mclass); //get value for this group in this category
+ string thisSub = designMap.get(group, subclass);
+ map<string, string>::iterator it = subclass2Class.find(thisSub);
+ if (it == subclass2Class.end()) {
+ subclass2Class[thisSub] = treatment;
+ vector<int> temp; temp.push_back(j);
+ subClass2GroupIndex[thisSub] = temp;
+ }
+ else {
+ if (it->second != treatment) {
+ //m->mothurOut("[WARNING]: subclass " + thisSub + " has members in " + it->second + " and " + treatment + ". Subclass members must be from the same class for Wilcoxon. Changing " + thisSub + " to " + treatment + "_" + thisSub + ".\n");
+ thisSub = treatment + "_" + thisSub;
+ subclass2Class[thisSub] = treatment;
+ vector<int> temp; temp.push_back(j);
+ subClass2GroupIndex[thisSub] = temp;
+ }else { subClass2GroupIndex[thisSub].push_back(j); }
+ }
+
+ map<string, set<string> >::iterator itClass = class2SubClasses.find(treatment);
+ if (itClass == class2SubClasses.end()) {
+ set<string> temp; temp.insert(thisSub);
+ class2SubClasses[treatment] = temp;
+ vector<int> temp2; temp2.push_back(j);
+ class2GroupIndex[treatment] = temp2;
+ classes.push_back(treatment);
+ }else{
+ class2SubClasses[treatment].insert(thisSub);
+ class2GroupIndex[treatment].push_back(j);
+ }
+ }
+ //sort classes so order is right
+ sort(classes.begin(), classes.end());
+
+ vector< vector<double> > means = getMeans(lookup, class2GroupIndex); //[numOTUs][classes] - classes in same order as class2GroupIndex
+
//run kruskal wallis on each otu
- vector<int> significantOtuLabels = runKruskalWallis(lookup, designMap);
+ map<int, double> significantOtuLabels = runKruskalWallis(lookup, designMap);
+
+ int numSigBeforeWilcox = significantOtuLabels.size();
//check for subclass
- if (subclass != "") { significantOtuLabels = runWilcoxon(lookup, designMap, significantOtuLabels); }
+ string wilcoxString = "";
+ if ((subclass != "") && wilc) { significantOtuLabels = runWilcoxon(lookup, designMap, significantOtuLabels, class2SubClasses, subClass2GroupIndex, subclass2Class); wilcoxString += " ( " + toString(numSigBeforeWilcox) + " ) before internal wilcoxon"; }
+
+ int numSigAfterWilcox = significantOtuLabels.size();
+
+ m->mothurOut("\nNumber of significantly discriminative features: " + toString(numSigAfterWilcox) + wilcoxString + ".\n");
+
+ map<int, double> sigOTUSLDA;
+ if (numSigAfterWilcox > 0) {
+ sigOTUSLDA = testLDA(lookup, significantOtuLabels, class2GroupIndex, subClass2GroupIndex);
+ m->mothurOut("Number of discriminative features with abs LDA score > " + toString(ldaThreshold) + " : " + toString(significantOtuLabels.size()) + ".\n");
+ }
+ else { m->mothurOut("No features with significant differences between the classes.\n"); }
+
+ printResults(means, significantOtuLabels, sigOTUSLDA, lookup[0]->getLabel(), classes);
return 0;
}
}
}
//**********************************************************************************************************************
-
-vector<int> LefseCommand::runKruskalWallis(vector<SharedRAbundVector*>& lookup, DesignMap& designMap) {
+int LefseCommand::normalize(vector<SharedRAbundFloatVector*>& lookup) {
try {
- map<string, string> variables;
- variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile));
- variables["[distance]"] = lookup[0]->getLabel();
- string outputFileName = getOutputFileName("kruskall-wallis",variables);
+ vector<double> mul;
+ for (int i = 0; i < lookup.size(); i++) {
+ double sum = 0.0;
+ for (int j = 0; j < lookup[i]->getNumBins(); j++) { sum += lookup[i]->getAbundance(j); }
+ mul.push_back(1000000.0/sum);
+ }
- ofstream out;
- m->openOutputFile(outputFileName, out);
- outputNames.push_back(outputFileName); outputTypes["kruskall-wallis"].push_back(outputFileName);
- out << "OTULabel\tKW\tPvalue\n";
+ for (int i = 0; i < lookup.size(); i++) {
+ for (int j = 0; j < lookup[i]->getNumBins(); j++) { lookup[i]->set(j, lookup[i]->getAbundance(j)*mul[i], lookup[i]->getGroup()); }
+ }
- vector<int> significantOtuLabels;
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "LefseCommand", "normalize");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+map<int, double> LefseCommand::runKruskalWallis(vector<SharedRAbundFloatVector*>& lookup, DesignMap& designMap) {
+ try {
+ map<int, double> significantOtuLabels;
int numBins = lookup[0]->getNumBins();
//sanity check to make sure each treatment has a group in the shared file
set<string> treatments;
double pValue = 0.0;
double H = linear.calcKruskalWallis(values, pValue);
-
- //output H and signifigance
- out << m->currentBinLabels[i] << '\t' << H << '\t' << pValue << endl;
-
- if (pValue < anovaAlpha) { significantOtuLabels.push_back(i); }
+
+ if (pValue < anovaAlpha) { significantOtuLabels[i] = pValue; }
}
- out.close();
return significantOtuLabels;
}
}
//**********************************************************************************************************************
//assumes not neccessarily paired
-vector<int> LefseCommand::runWilcoxon(vector<SharedRAbundVector*>& lookup, DesignMap& designMap, vector<int> bins) {
+map<int, double> LefseCommand::runWilcoxon(vector<SharedRAbundFloatVector*>& lookup, DesignMap& designMap, map<int, double> bins, map<string, set<string> >& class2SubClasses, map<string, vector<int> >& subClass2GroupIndex, map<string, string> subclass2Class) {
try {
- LinearAlgebra linear;
- vector<int> significantOtuLabels;
+ map<int, double> significantOtuLabels;
+ map<int, double>::iterator it;
//if it exists and meets the following requirements run Wilcoxon
/*
1. Subclass members all belong to same main class
anything else
*/
- vector<string> subclasses;
- map<string, string> subclass2Class;
- map<string, int> subclassCounts;
- map<string, vector<int> > subClass2GroupIndex; //maps subclass name to vector of indexes in lookup from that subclass. old -> 1,2,3 means groups in location 1,2,3 of lookup are from old. Saves time below.
- bool error = false;
- for (int j = 0; j < lookup.size(); j++) {
- string group = lookup[j]->getGroup();
- string treatment = designMap.get(group, mclass); //get value for this group in this category
- string thisSub = designMap.get(group, subclass);
- map<string, string>::iterator it = subclass2Class.find(thisSub);
- if (it == subclass2Class.end()) {
- subclass2Class[thisSub] = treatment;
- subclassCounts[thisSub] = 1;
- vector<int> temp; temp.push_back(j);
- subClass2GroupIndex[thisSub] = temp;
+
+ int numBins = lookup[0]->getNumBins();
+ for (int i = 0; i < numBins; i++) {
+ if (m->control_pressed) { break; }
+
+ it = bins.find(i);
+ if (it != bins.end()) { //flagged in Kruskal Wallis
+
+ vector<float> abunds; for (int j = 0; j < lookup.size(); j++) { abunds.push_back(lookup[j]->getAbundance(i)); }
+
+ bool sig = testOTUWilcoxon(class2SubClasses, abunds, subClass2GroupIndex, subclass2Class);
+ if (sig) { significantOtuLabels[i] = it->second; }
+
+ }//bins flagged from kw
+ }//for bins
+
+ return significantOtuLabels;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "LefseCommand", "runWilcoxon");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+//lefse.py - test_rep_wilcoxon_r function
+bool LefseCommand::testOTUWilcoxon(map<string, set<string> >& class2SubClasses, vector<float> abunds, map<string, vector<int> >& subClass2GroupIndex, map<string, string> subclass2Class) {
+ try {
+ int totalOk = 0;
+ double alphaMtc = wilcoxonAlpha;
+ vector< set<string> > allDiffs;
+ LinearAlgebra linear;
+
+ //for each subclass comparision
+ map<string, set<string> >::iterator itB;
+ for(map<string, set<string> >::iterator it=class2SubClasses.begin();it!=class2SubClasses.end();it++){
+ itB = it;itB++;
+ for(itB;itB!=class2SubClasses.end();itB++){
+ if (m->control_pressed) { return false; }
+ bool first = true;
+ int dirCmp = 0; // not set?? dir_cmp = "not_set" # 0=notset or none, 1=true, 2=false.
+ int curv_sign = 0;
+ int ok = 0;
+ int count = 0;
+ for (set<string>::iterator itClass1 = (it->second).begin(); itClass1 != (it->second).end(); itClass1++) {
+ bool br = false;
+ for (set<string>::iterator itClass2 = (itB->second).begin(); itClass2 != (itB->second).end(); itClass2++) {
+ string subclass1 = *itClass1;
+ string subclass2 = *itClass2;
+ count++;
+
+ if (m->debug) { m->mothurOut( "[DEBUG comparing " + it->first + "-" + *itClass1 + " to " + itB->first + "-" + *itClass2 + "\n"); }
+
+ string treatment1 = subclass2Class[subclass1];
+ string treatment2 = subclass2Class[subclass2];
+ int numSubs1 = class2SubClasses[treatment1].size();
+ int numSubs2 = class2SubClasses[treatment2].size();
+
+ //if mul_cor != 0: alpha_mtc = th*l_subcl1*l_subcl2 if mul_cor == 2 else 1.0-math.pow(1.0-th,l_subcl1*l_subcl2)
+ if (strict != 0) { alphaMtc = wilcoxonAlpha * numSubs1 * numSubs2 ; }
+ if (strict == 2) {}else{ alphaMtc = 1.0-pow((1.0-wilcoxonAlpha),(double)(numSubs1 * numSubs2)); }
+
+ //fill x and y with this comparisons data
+ vector<double> x; vector<double> y;
+
+ //fill x and y
+ vector<int> xIndexes = subClass2GroupIndex[subclass1]; //indexes in lookup for this subclass
+ vector<int> yIndexes = subClass2GroupIndex[subclass2]; //indexes in lookup for this subclass
+ for (int k = 0; k < yIndexes.size(); k++) { y.push_back(abunds[yIndexes[k]]); }
+ for (int k = 0; k < xIndexes.size(); k++) { x.push_back(abunds[xIndexes[k]]); }
+
+ // med_comp = False
+ //if len(cl1) < min_c or len(cl2) < min_c:
+ //med_comp = True
+ bool medComp = false; // are there enough samples per subclass
+ if ((xIndexes.size() < minC) || (yIndexes.size() < minC)) { medComp = true; }
+
+ double sx = m->median(x);
+ double sy = m->median(y);
+
+ //if cl1[0] == cl2[0] and len(set(cl1)) == 1 and len(set(cl2)) == 1:
+ //tres, first = False, False
+ double pValue = 0.0;
+ double H = 0.0;
+ bool tres = true; //don't think this is set in the python source. Not sure how that is handled, but setting it here.
+ if ((x[0] == y[0]) && (x.size() == 1) && (y.size() == 1)) { tres = false; first = false; }
+ else if (!medComp) {
+ H = linear.calcWilcoxon(x, y, pValue);
+ if (pValue < (alphaMtc*2.0)) { tres = true; }
+ else { tres = false; }
+ }
+ /*if first:
+ first = False
+ if not curv and ( med_comp or tres ):
+ dir_cmp = sx < sy
+ if sx == sy: br = True
+ elif curv:
+ dir_cmp = None
+ if med_comp or tres:
+ curv_sign += 1
+ dir_cmp = sx < sy
+ else: br = True
+ elif not curv and med_comp:
+ if ((sx < sy) != dir_cmp or sx == sy): br = True
+ elif curv:
+ if tres and dir_cmp == None:
+ curv_sign += 1
+ dir_cmp = sx < sy
+ if tres and dir_cmp != (sx < sy):
+ br = True
+ curv_sign = -1
+ elif not tres or (sx < sy) != dir_cmp or sx == sy: br = True
+ */
+ int sxSy = 2; //false
+ if (sx<sy) { sxSy = 1; } //true
+
+ if (first) {
+ first = false;
+ if ((!curv) && (medComp || tres)) {
+ dirCmp = 2; if (sx<sy) { dirCmp = 1; } //dir_cmp = sx < sy
+ if (sx == sy) { br = true; }
+ }else if (curv) {
+ dirCmp = 0;
+ if (medComp || tres) {
+ curv_sign++;
+ dirCmp = 2; if (sx<sy) { dirCmp = 1; } //dir_cmp = sx < sy
+ }
+ }else { br = true; }
+ }else if (!curv && medComp) {
+ if (sxSy != dirCmp || sx == sy) { br = true; }
+ }else if (curv) {
+ if (tres && dirCmp == 0) { curv_sign++; }
+ dirCmp = 2; if (sx<sy) { dirCmp = 1; } //dir_cmp = sx < sy
+ if (tres && dirCmp != sxSy) { //if tres and dir_cmp != (sx < sy):
+ br = true;
+ curv_sign = -1;
+ }
+ }else if (!tres || sxSy != dirCmp || sx == sy) { br = true; } //elif not tres or (sx < sy) != dir_cmp or sx == sy: br = True
+ if (br) { break; }
+ ok++;
+ }//for class2 subclasses
+ if (br) { break; }
+ }//for class1 subclasses
+ bool diff = false;
+ if (curv) { diff = false; if (curv_sign > 0) { diff = true; } } //if curv: diff = curv_sign > 0
+ else { //else: diff = (ok == len(cl_hie[pair[1]])*len(cl_hie[pair[0]]))
+ diff = false;
+ if (ok == count) { diff = true; }
+ }
+ if (diff) { totalOk++; }
+ if (!diff && (multiClassStrat == "onevone")) { return false; }
+ if (diff && (multiClassStrat == "onevall")) { //all_diff.append(pair)
+ set<string> pair; pair.insert(it->first); pair.insert(itB->first);
+ allDiffs.push_back(pair);
+ }
+ }//classes
+ }//classes
+
+ if (multiClassStrat == "onevall") {
+ int tot_k = class2SubClasses.size();
+ for(map<string, set<string> >::iterator it=class2SubClasses.begin();it!=class2SubClasses.end();it++){
+ if (m->control_pressed) { return false; }
+ int nk = 0;
+ //is this class okay in all comparisons
+ for (int h = 0; h < allDiffs.size(); h++) {
+ if (allDiffs[h].count(it->first) != 0) { nk++; }
+ }
+ if (nk == (tot_k-1)) { return true; }//if nk == tot_k-1: return True
}
- else {
- subclassCounts[thisSub]++;
- subClass2GroupIndex[thisSub].push_back(j);
- if (it->second != treatment) {
- error = true;
- m->mothurOut("[ERROR]: subclass " + thisSub + " has members in " + it->second + " and " + treatment + ". Subclass members must be from the same class. Ignoring wilcoxon.\n");
+ return false;
+ }
+
+ return true;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "LefseCommand", "testOTUWilcoxon");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+//modelled after lefse.py test_lda_r function
+map<int, double> LefseCommand::testLDA(vector<SharedRAbundFloatVector*>& lookup, map<int, double> bins, map<string, vector<int> >& class2GroupIndex, map<string, vector<int> >& subClass2GroupIndex) {
+ try {
+ map<int, double> sigOTUS;
+ map<int, double>::iterator it;
+ LinearAlgebra linear;
+
+ int numBins = lookup[0]->getNumBins();
+ vector< vector<double> > adjustedLookup;
+
+ for (int i = 0; i < numBins; i++) {
+ if (m->control_pressed) { break; }
+
+ it = bins.find(i);
+ if (it != bins.end()) { //flagged in Kruskal Wallis and Wilcoxon(if we ran it)
+
+ //fill x with this OTUs abundances
+ vector<double> x;
+ for (int j = 0; j < lookup.size(); j++) { x.push_back(lookup[j]->getAbundance(i)); }
+
+ //go through classes
+ for (map<string, vector<int> >::iterator it = class2GroupIndex.begin(); it != class2GroupIndex.end(); it++) {
+ //max(float(feats['class'].count(c))*0.5,4)
+ //max(numGroups in this class*0.5, 4.0)
+ double necessaryNum = ((double)((it->second).size())*0.5);
+ if (4.0 > necessaryNum) { necessaryNum = 4.0; }
+
+ set<double> uniques;
+ for (int j = 0; j < (it->second).size(); j++) { uniques.insert(x[(it->second)[j]]); }
+
+ //if len(set([float(v[1]) for v in ff if v[0] == c])) > max(float(feats['class'].count(c))*0.5,4): continue
+ if ((double)(uniques.size()) > necessaryNum) { }
+ else {
+ //feats[k][i] = math.fabs(feats[k][i] + lrand.normalvariate(0.0,max(feats[k][i]*0.05,0.01)))
+ for (int j = 0; j < (it->second).size(); j++) { //(it->second) contains indexes of abundance for this class
+ double sigma = max((x[(it->second)[j]]*0.05), 0.01);
+ x[(it->second)[j]] = abs(x[(it->second)[j]] + linear.normalvariate(0.0, sigma));
+ }
+ }
}
+ adjustedLookup.push_back(x);
+ }
+ }
+
+ //go through classes
+ int minCl = 1e6;
+ map<int, string> indexToClass;
+ vector<string> classes;
+ for (map<string, vector<int> >::iterator it = class2GroupIndex.begin(); it != class2GroupIndex.end(); it++) {
+ //class with minimum number of groups
+ if ((it->second).size() < minCl) { minCl = (it->second).size(); }
+ for (int i = 0; i < (it->second).size(); i++) { indexToClass[(it->second)[i]] = it->first; }
+ classes.push_back(it->first);
+ }
+
+ int numGroups = lookup.size(); //lfk
+ int fractionNumGroups = numGroups * fBoots; //rfk
+ minCl = (int)((float)(minCl*fBoots*fBoots*0.05));
+ minCl = max(minCl, 1);
+
+ vector< vector< vector<double> > > results;//[iters][numComparison][numOTUs]
+ for (int j = 0; j < iters; j++) {
+ if (m->control_pressed) { return sigOTUS; }
+
+ //find "good" random vector
+ vector<int> rand_s;
+ for (int h = 0; h < 1000; h++) { //generate a vector of length fractionNumGroups with range 0 to numGroups-1
+ rand_s.clear();
+ for (int k = 0; k < fractionNumGroups; k++) { rand_s.push_back(m->getRandomIndex(numGroups-1)); }
+ if (!contastWithinClassesOrFewPerClass(adjustedLookup, rand_s, minCl, class2GroupIndex, indexToClass)) { h+=1000; } //break out of loop
}
+ if (m->control_pressed) { return sigOTUS; }
+
+ //print data in R input format for testing
+ if (false) {
+ vector<string> groups; for (int h = 0; h < rand_s.size(); h++) { groups.push_back(lookup[rand_s[h]]->getGroup()); }
+ printToCoutForRTesting(adjustedLookup, rand_s, class2GroupIndex, bins, subClass2GroupIndex, groups);
+ }
+
+ //for each pair of classes
+ vector< vector<double> > temp = lda(adjustedLookup, rand_s, indexToClass, classes); //[numComparison][numOTUs]
+ if (temp.size() != 0) { results.push_back(temp); }
}
- if (error) { return significantOtuLabels; }
+ if (m->control_pressed) { return sigOTUS; }
+ //m = max([numpy.mean([means[k][kk][p] for kk in range(boots)]) for p in range(len(pairs))])
+ int k = 0;
+ for (it = bins.begin(); it != bins.end(); it++) { //[numOTUs] - need to go through bins so we can tie adjustedLookup back to the binNumber. adjustedLookup[0] ->bins entry[0].
+ vector<double> averageForEachComparison; averageForEachComparison.resize(results[0].size(), 0.0);
+ double maxM = 0.0; //max of averages for each comparison
+ for (int j = 0; j < results[0].size(); j++) { //numComparisons
+ for (int i = 0; i < results.size(); i++) { //iters
+ averageForEachComparison[j]+= results[i][j][k];
+ }
+ averageForEachComparison[j] /= (double) results.size();
+ if (averageForEachComparison[j] > maxM) { maxM = averageForEachComparison[j]; }
+ }
+ //res[k] = math.copysign(1.0,m)*math.log(1.0+math.fabs(m),10)
+ double multiple = 1.0; if (maxM < 0.0) { multiple = -1.0; }
+ double resK = multiple * log10(1.0+abs(maxM));
+ if (resK > ldaThreshold) { sigOTUS[it->first] = resK; }
+ k++;
+ }
+
+ return sigOTUS;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "LefseCommand", "testLDA");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+vector< vector<double> > LefseCommand::getMeans(vector<SharedRAbundFloatVector*>& lookup, map<string, vector<int> >& class2GroupIndex) {
+ try {
int numBins = lookup[0]->getNumBins();
- vector<compGroup> comp;
- //find comparisons and fill comp
- map<string, int>::iterator itB;
- for(map<string, int>::iterator it=subclassCounts.begin();it!=subclassCounts.end();it++){
- itB = it;itB++;
- for(itB;itB!=subclassCounts.end();itB++){
- compGroup temp(it->first,itB->first);
- comp.push_back(temp);
- }
+ int numClasses = class2GroupIndex.size();
+ vector< vector<double> > means; //[numOTUS][classes]
+ means.resize(numBins);
+ for (int i = 0; i < means.size(); i++) { means[i].resize(numClasses, 0.0); }
+
+ map<int, string> indexToClass;
+ int count = 0;
+ //shortcut for vectors below
+ map<string, int> quickIndex;
+ vector<int> classCounts;
+ for (map<string, vector<int> >::iterator it = class2GroupIndex.begin(); it != class2GroupIndex.end(); it++) {
+ for (int i = 0; i < (it->second).size(); i++) { indexToClass[(it->second)[i]] = it->first; }
+ quickIndex[it->first] = count; count++;
+ classCounts.push_back((it->second).size());
+ }
+
+ for (int i = 0; i < numBins; i++) {
+ for (int j = 0; j < lookup.size(); j++) {
+ if (m->control_pressed) { return means; }
+ means[i][quickIndex[indexToClass[j]]] += lookup[j]->getAbundance(i);
+ }
+ }
+
+ for (int i = 0; i < numBins; i++) {
+ for (int j = 0; j < numClasses; j++) { means[i][j] /= (double) classCounts[j]; }
+ }
+
+ return means;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "LefseCommand", "getMeans");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+vector< vector<double> > LefseCommand::lda(vector< vector<double> >& adjustedLookup, vector<int> rand_s, map<int, string>& indexToClass, vector<string> classes) {
+ try {
+ //shortcut for vectors below
+ map<string, int> quickIndex;
+ for (int i = 0; i < classes.size(); i++) { quickIndex[classes[i]] = i; }
+
+ vector<string> randClass; //classes for rand sample
+ vector<int> counts; counts.resize(classes.size(), 0);
+ for (int i = 0; i < rand_s.size(); i++) {
+ string thisClass = indexToClass[rand_s[i]];
+ randClass.push_back(thisClass);
+ counts[quickIndex[thisClass]]++;
}
- int numComp = comp.size();
- if (numComp < 2) { m->mothurOut("[ERROR]: Need at least 2 subclasses, Ignoring Wilcoxon.\n");
- return significantOtuLabels; }
+ vector< vector<double> > a; //[numOTUs][numSampled]
+ for (int i = 0; i < adjustedLookup.size(); i++) {
+ vector<double> temp;
+ for (int j = 0; j < rand_s.size(); j++) {
+ temp.push_back(adjustedLookup[i][rand_s[j]]);
+ }
+ a.push_back(temp);
+ }
+
+ LinearAlgebra linear;
+ vector< vector<double> > means; bool ignore;
+ vector< vector<double> > scaling = linear.lda(a, randClass, means, ignore); //means are returned sorted, quickIndex sorts as well since it uses a map. means[class][otu] =
+ if (ignore) { scaling.clear(); return scaling; }
+ if (m->control_pressed) { return scaling; }
+
+ vector< vector<double> > w; w.resize(a.size()); //w.unit <- w/sqrt(sum(w^2))
+ double denom = 0.0;
+ for (int i = 0; i < scaling.size(); i++) { w[i].push_back(scaling[i][0]); denom += (w[i][0]*w[i][0]); }
+ denom = sqrt(denom);
+ for (int i = 0; i < w.size(); i++) { w[i][0] /= denom; } //[numOTUs][1] - w.unit
+
+ //robjects.r('LD <- xy.matrix%*%w.unit') [numSampled][numOtus] * [numOTUs][1]
+ vector< vector<double> > LD = linear.matrix_mult(linear.transpose(a), w);
+ //find means for each groups LDs
+ vector<double> LDMeans; LDMeans.resize(classes.size(), 0.0); //means[0] -> average for [group0].
+ for (int i = 0; i < LD.size(); i++) { LDMeans[quickIndex[randClass[i]]] += LD[i][0]; }
+ for (int i = 0; i < LDMeans.size(); i++) { LDMeans[i] /= (double) counts[i]; }
+
+ //calculate for each comparisons i.e. with groups A,B,C = AB, AC, BC = 3;
+ vector< vector<double> > results;// [numComparison][numOTUs]
+ for (int i = 0; i < LDMeans.size(); i++) {
+ for (int l = 0; l < i; l++) {
+ if (m->control_pressed) { return scaling; }
+ //robjects.r('effect.size <- abs(mean(LD[sub_d[,"class"]=="'+p[0]+'"]) - mean(LD[sub_d[,"class"]=="'+p[1]+'"]))')
+ double effectSize = abs(LDMeans[i] - LDMeans[l]);
+ //scal = robjects.r('wfinal <- w.unit * effect.size')
+ vector<double> compResults;
+ for (int j = 0; j < w.size(); j++) { //[numOTUs][1]
+ //coeff = [abs(float(v)) if not math.isnan(float(v)) else 0.0 for v in scal]
+ double coeff = abs(w[j][0]*effectSize); if (isnan(coeff) || isinf(coeff)) { coeff = 0.0; }
+ //gm = abs(res[p[0]][j] - res[p[1]][j]) - res is the means for each group for each otu
+ double gm = abs(means[i][j] - means[l][j]);
+ //means[k][i].append((gm+coeff[j])*0.5)
+ compResults.push_back((gm+coeff)*0.5);
+ }
+ results.push_back(compResults);
+ }
+ }
+
+ return results;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "LefseCommand", "lda");
+ exit(1);
+ }
+}
+
+//**********************************************************************************************************************
+//modelled after lefse.py contast_within_classes_or_few_per_class function
+bool LefseCommand::contastWithinClassesOrFewPerClass(vector< vector<double> >& lookup, vector<int> rands, int minCl, map<string, vector<int> > class2GroupIndex, map<int, string> indexToClass) {
+ try {
+ set<string> cls;
+ int countFound = 0;
+
+ for (int i = 0; i < rands.size(); i++) { //fill cls with the classes represented in the random selection
+ for (map<string, vector<int> >::iterator it = class2GroupIndex.begin(); it != class2GroupIndex.end(); it++) {
+ if (m->inUsersGroups(rands[i], (it->second))) {
+ cls.insert(it->first);
+ countFound++;
+ }
+ }
+ }
+
+ //sanity check
+ if (rands.size() != countFound) { m->mothurOut("oops, should never get here, missing something.\n"); }
+
+ if (cls.size() < class2GroupIndex.size()) { return true; } //some classes are not present in sampling
+
+ for (set<string>::iterator it = cls.begin(); it != cls.end(); it++) {
+ if (cls.count(*it) < minCl) { return true; } //this sampling has class count below minimum
+ }
+
+ //for this otu
+ int numBins = lookup.size();
+ for (int i = 0; i < numBins; i++) {
+ if (m->control_pressed) { break; }
+
+ //break up random sampling by class
+ map<string, set<double> > class2Values; //maps class name -> set of abunds present in random sampling. F003Early -> 0.001, 0.003...
+ for (int j = 0; j < rands.size(); j++) {
+ class2Values[indexToClass[rands[j]]].insert(lookup[i][rands[j]]);
+ //rands[j] = index of randomly selected group in lookup, randIndex2Class[rands[j]] = class this group belongs to. lookup[rands[j]]->getAbundance(i) = abundance of this group for this OTU.
+ }
+ //are the unique values less than we want
+ //if (len(set(col)) <= min_cl and min_cl > 1) or (min_cl == 1 and len(set(col)) <= 1):
+ for (map<string, set<double> >::iterator it = class2Values.begin(); it != class2Values.end(); it++) {
+ if (((it->second).size() <= minCl && minCl > 1) || (minCl == 1 && (it->second).size() <= 1)) { return true; }
+ }
+ }
+
+ return false;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "LefseCommand", "contastWithinClassesOrFewPerClass");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+int LefseCommand::printResults(vector< vector<double> > means, map<int, double> sigKW, map<int, double> sigLDA, string label, vector<string> classes) {
+ try {
map<string, string> variables;
variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile));
- variables["[distance]"] = lookup[0]->getLabel();
- string outputFileName = getOutputFileName("wilcoxon",variables);
+ variables["[distance]"] = label;
+ string outputFileName = getOutputFileName("summary",variables);
+ ofstream out;
+ m->openOutputFile(outputFileName, out);
+ outputNames.push_back(outputFileName); outputTypes["summary"].push_back(outputFileName);
- ofstream out;
- m->openOutputFile(outputFileName, out);
- outputNames.push_back(outputFileName); outputTypes["wilcoxon"].push_back(outputFileName);
- out << "OTULabel\tComparision\tWilcoxon\tPvalue\n";
+ //output headers
+ out << "OTU\tLogMaxMean\tClass\tLDA\tpValue\n";
- for (int i = 0; i < numBins; i++) {
+ string temp = "";
+ for (int i = 0; i < means.size(); i++) { //[numOTUs][classes]
+ //find max mean of classes
+ double maxMean = -1.0; string maxClass = "none";
+ for (int j = 0; j < means[i].size(); j++) { if (means[i][j] > maxMean) { maxMean = means[i][j]; maxClass = classes[j]; } }
+
+ //str(math.log(max(max(v),1.0),10.0))
+ double logMaxMean = 1.0;
+ if (maxMean > logMaxMean) { logMaxMean = maxMean; }
+ logMaxMean = log10(logMaxMean);
+
+ out << m->currentBinLabels[i] << '\t' << logMaxMean << '\t';
+ if (m->debug) { temp = m->currentBinLabels[i] + '\t' + toString(logMaxMean) + '\t'; }
+
+ map<int, double>::iterator it = sigLDA.find(i);
+ if (it != sigLDA.end()) {
+ out << maxClass << '\t' << it->second << '\t' << sigKW[i] << endl; //sigLDA is a subset of sigKW so no need to look
+ if (m->debug) { temp += maxClass + '\t' + toString(it->second) + '\t' + toString(sigKW[i]) + '\n'; m->mothurOut(temp); temp = ""; }
+ }else { out << '-' << endl; }
+ }
+
+ out.close();
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "LefseCommand", "printResults");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+//printToCoutForRTesting(adjustedLookup, rand_s, class2GroupIndex, numBins);
+bool LefseCommand::printToCoutForRTesting(vector< vector<double> >& adjustedLookup, vector<int> rand_s, map<string, vector<int> >& class2GroupIndex, map<int, double> bins, map<string, vector<int> >& subClass2GroupIndex, vector<string> groups) {
+ try {
+ cout << "rand_s = ";
+ for (int h = 0; h < rand_s.size(); h++) { cout << rand_s[h] << '\t'; } cout << endl;
+
+ //print otu data
+ int count = 0;
+ for (map<int, double>::iterator it = bins.begin(); it != bins.end(); it++) {
+ if (m->control_pressed) { break; }
+
+ cout << m->currentBinLabels[it->first] << " <- c(";
+ for (int h = 0; h < rand_s.size()-1; h++) { cout << (adjustedLookup[count][rand_s[h]]) << ", "; }
+ cout << (adjustedLookup[count][rand_s[rand_s.size()-1]]) << ")\n";
+ count++;
+ }
+ /*
+ string tempOutput = "";
+ for (int h = 0; h < rand_s.size(); h++) {
+ //find class this index is in
+ for (map<string, vector<int> >::iterator it = class2GroupIndex.begin(); it!= class2GroupIndex.end(); it++) {
+ if (m->inUsersGroups(rand_s[h], (it->second)) ) { cout << (h+1) << " <- c(\"" +it->first + "\")\n" ; }
+ }
+ }*/
+
+
+ string tempOutput = "treatments <- c(";
+ for (int h = 0; h < rand_s.size(); h++) {
+ //find class this index is in
+ for (map<string, vector<int> >::iterator it = class2GroupIndex.begin(); it!= class2GroupIndex.end(); it++) {
+ if (m->inUsersGroups(rand_s[h], (it->second)) ) { tempOutput += "\"" +it->first + "\"" + ","; } //"\"" +it->first + "\""
+ }
+ }
+ tempOutput = tempOutput.substr(0, tempOutput.length()-1);
+ tempOutput += ")\n";
+ cout << tempOutput;
+
+ /*
+ if (subclass != "") {
+ string tempOutput = "sub <- c(";
+ for (int h = 0; h < rand_s.size(); h++) {
+ //find class this index is in
+ for (map<string, vector<int> >::iterator it = subClass2GroupIndex.begin(); it!= subClass2GroupIndex.end(); it++) {
+ if (m->inUsersGroups(rand_s[h], (it->second)) ) { tempOutput += "\"" +it->first + "\"" + ','; }
+ }
+ }
+ tempOutput = tempOutput.substr(0, tempOutput.length()-1);
+ tempOutput += ")\n";
+ cout << tempOutput;
+ }
+
+ if (subject) {
+ string tempOutput = "group <- c(";
+ for (int h = 0; h < groups.size(); h++) {
+ tempOutput += "\"" +groups[h] + "\"" + ',';
+ }
+ tempOutput = tempOutput.substr(0, tempOutput.length()-1);
+ tempOutput += ")\n";
+ cout << tempOutput;
+ }*/
+
+
+ //print data frame
+ tempOutput = "dat <- data.frame(";
+ for (map<int, double>::iterator it = bins.begin(); it != bins.end(); it++) {
if (m->control_pressed) { break; }
- if (m->inUsersGroups(i, bins)) { //flagged in Kruskal Wallis
+ tempOutput += "\"" + m->currentBinLabels[it->first] + "\"=" + m->currentBinLabels[it->first] + ",";
+ }
+ //tempOutput = tempOutput.substr(0, tempOutput.length()-1);
+ tempOutput += " class=treatments";
+ //if (subclass != "") { tempOutput += ", subclass=sub"; }
+ //if (subject) { tempOutput += ", subject=group"; }
+ tempOutput += ")\n";
+ cout << tempOutput;
- bool sig = false;
- //for each subclass comparision
- for (int j = 0; j < numComp; j++) {
- //fill x and y with this comparisons data
- vector<double> x; vector<double> y;
-
- cout << m->currentBinLabels[i] << '\t' << comp[j].getCombo() << " x <- (";
- //fill x and y
- vector<int> xIndexes = subClass2GroupIndex[comp[j].group1]; //indexes in lookup for this subclass
- for (int k = 0; k < xIndexes.size(); k++) { x.push_back(lookup[xIndexes[k]]->getAbundance(i)); cout << lookup[xIndexes[k]]->getAbundance(i) << ", "; }
- cout << ")\n";
-
- cout << m->currentBinLabels[i] << '\t' << comp[j].getCombo() << " y <- (";
-
- vector<int> yIndexes = subClass2GroupIndex[comp[j].group2]; //indexes in lookup for this subclass
- for (int k = 0; k < yIndexes.size(); k++) { y.push_back(lookup[yIndexes[k]]->getAbundance(i)); cout << lookup[yIndexes[k]]->getAbundance(i) << ", ";}
- cout << ")\n";
-
- double pValue = 0.0;
- double H = linear.calcWilcoxon(x, y, pValue);
+ tempOutput = "z <- suppressWarnings(mylda(as.formula(class ~ ";
+ for (map<int, double>::iterator it = bins.begin(); it != bins.end(); it++) {
+ if (m->control_pressed) { break; }
- //output H and signifigance
- if (!isnan(pValue)) { out << m->currentBinLabels[i] << '\t' << comp[j].getCombo() << '\t' << H << '\t' << pValue << endl; }
- else { out << m->currentBinLabels[i] << '\t' << comp[j].getCombo() << '\t' << H << '\t' << "NA" << endl; }
-
- //set sig - not sure how yet
- }
- if (sig) { significantOtuLabels.push_back(i); }
+ tempOutput += m->currentBinLabels[it->first] + "+";
+ }
+ tempOutput = tempOutput.substr(0, tempOutput.length()-1); //rip off extra plus sign
+ tempOutput += "), data = dat, tol = 1e-10))";
+ cout << tempOutput + "\nz\n";
+ cout << "w <- z$scaling[,1]\n"; //robjects.r('w <- z$scaling[,1]')
+ cout << "w.unit <- w/sqrt(sum(w^2))\n"; //robjects.r('w.unit <- w/sqrt(sum(w^2))')
+ cout << "ss <- dat[,-match(\"class\",colnames(dat))]\n"; //robjects.r('ss <- sub_d[,-match("class",colnames(sub_d))]')
+ //if (subclass != "") { cout << "ss <- ss[,-match(\"subclass\",colnames(ss))]\n"; }//robjects.r('ss <- ss[,-match("subclass",colnames(ss))]')
+ //if (subject) { cout << "ss <- ss[,-match(\"subject\",colnames(ss))]\n"; }//robjects.r('ss <- ss[,-match("subject",colnames(ss))]')
+ cout << "xy.matrix <- as.matrix(ss)\n"; //robjects.r('xy.matrix <- as.matrix(ss)')
+ cout << "LD <- xy.matrix%*%w.unit\n"; //robjects.r('LD <- xy.matrix%*%w.unit')
+ cout << "effect.size <- abs(mean(LD[dat[,\"class\"]==\"'+p[0]+'\"]) - mean(LD[dat[,\"class\"]==\"'+p[1]+'\"]))\n"; //robjects.r('effect.size <- abs(mean(LD[sub_d[,"class"]=="'+p[0]+'"]) - mean(LD[sub_d[,"class"]=="'+p[1]+'"]))')
+ cout << "wfinal <- w.unit * effect.size\n"; //scal = robjects.r('wfinal <- w.unit * effect.size')
+ cout << "mm <- z$means\n"; //rres = robjects.r('mm <- z$means')
+
+ return true;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "LefseCommand", "printToCoutForRTesting");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+int LefseCommand::makeShared(int numDesignLines) {
+ try {
+ ifstream in;
+ m->openInputFile(sharedfile, in);
+ vector< vector<string> > lines;
+ for(int i = 0; i < numDesignLines; i++) {
+ if (m->control_pressed) { return 0; }
+
+ string line = m->getline(in);
+ cout << line << endl;
+ vector<string> pieces = m->splitWhiteSpace(line);
+ lines.push_back(pieces);
+ }
+
+ ofstream out;
+ m->openOutputFile(sharedfile+".design", out); out << "group" << '\t';
+ for (int j = 0; j < lines.size(); j++) { out << lines[j][0] << '\t'; } out << endl;
+ for (int j = 1; j < lines[0].size(); j++) {
+ out <<(j-1) << '\t';
+ for (int i = 0; i < lines.size(); i++) {
+ out << lines[i][j] << '\t';
}
+ out << endl;
}
out.close();
+ DesignMap design(sharedfile+".design");
- return significantOtuLabels;
+ vector<SharedRAbundFloatVector*> lookup;
+ for (int k = 0; k < lines[0].size()-1; k++) {
+ SharedRAbundFloatVector* temp = new SharedRAbundFloatVector();
+ temp->setLabel("0.03");
+ temp->setGroup(toString(k));
+ lookup.push_back(temp);
+ }
+
+ m->currentBinLabels.clear();
+ int count = 0;
+ while (!in.eof()) {
+ if (m->control_pressed) { return 0; }
+
+ string line = m->getline(in);
+ vector<string> pieces = m->splitWhiteSpace(line);
+
+ float sum = 0.0;
+ for (int i = 1; i < pieces.size(); i++) {
+ float value; m->mothurConvert(pieces[i], value);
+ sum += value;
+ }
+
+ if (sum != 0.0) {
+ //cout << count << '\t';
+ for (int i = 1; i < pieces.size(); i++) {
+ float value; m->mothurConvert(pieces[i], value);
+ lookup[i-1]->push_back(value, toString(i-1));
+ //cout << pieces[i] << '\t';
+ }
+ m->currentBinLabels.push_back(toString(count));
+ //m->currentBinLabels.push_back(pieces[0]);
+ //cout << line<< endl;
+ //cout << endl;
+ }
+ count++;
+ }
+ in.close();
+
+ for (int k = 0; k < lookup.size(); k++) {
+ //cout << "0.03" << '\t' << toString(k) << endl; lookup[k]->print(cout);
+ }
+
+ process(lookup, design);
+
+ return 0;
}
catch(exception& e) {
- m->errorOut(e, "LefseCommand", "runWilcoxon");
+ m->errorOut(e, "LefseCommand", "printToCoutForRTesting");
exit(1);
}
}
void help() { m->mothurOut(getHelpString()); }
private:
- bool abort, allLines;
- string outputDir, sharedfile, designfile, mclass, subclass, classes;
+ bool abort, allLines, wilc, wilcsamename, curv, subject, normMillion;
+ string outputDir, sharedfile, designfile, mclass, subclass, classes, rankTec, multiClassStrat;
vector<string> outputNames;
set<string> labels;
- float anovaAlpha, wilcoxonAlpha;
+ double anovaAlpha, wilcoxonAlpha, fBoots, ldaThreshold;
+ int nlogs, iters, strict, minC;
- int process(vector<SharedRAbundVector*>&, DesignMap&);
- vector<int> runKruskalWallis(vector<SharedRAbundVector*>&, DesignMap&);
- vector<int> runWilcoxon(vector<SharedRAbundVector*>&, DesignMap&, vector<int>);
-
+ int process(vector<SharedRAbundFloatVector*>&, DesignMap&);
+ int normalize(vector<SharedRAbundFloatVector*>&);
+ map<int, double> runKruskalWallis(vector<SharedRAbundFloatVector*>&, DesignMap&);
+ map<int, double> runWilcoxon(vector<SharedRAbundFloatVector*>&, DesignMap&, map<int, double>, map<string, set<string> >& class2SubClasses, map<string, vector<int> >& subClass2GroupIndex, map<string, string>);
+ bool testOTUWilcoxon(map<string, set<string> >& class2SubClasses, vector<float> abunds, map<string, vector<int> >& subClass2GroupIndex, map<string, string>);
+ map<int, double> testLDA(vector<SharedRAbundFloatVector*>&, map<int, double>, map<string, vector<int> >& class2GroupIndex, map<string, vector<int> >&);
+ bool contastWithinClassesOrFewPerClass(vector< vector<double> >&, vector<int> rands, int minCl, map<string, vector<int> > class2GroupIndex, map<int, string> indexToClass);
+ vector< vector<double> > lda(vector< vector<double> >& adjustedLookup, vector<int> rand_s, map<int, string>& indexToClass, vector<string>);
+ vector< vector<double> > getMeans(vector<SharedRAbundFloatVector*>& lookup, map<string, vector<int> >& class2GroupIndex);
+ int printResults(vector< vector<double> >, map<int, double>, map<int, double>, string, vector<string>);
+
+ //for testing
+ bool printToCoutForRTesting(vector< vector<double> >& adjustedLookup, vector<int> rand_s, map<string, vector<int> >& class2GroupIndex, map<int, double> bins, map<string, vector<int> >&, vector<string>);
+ int makeShared(int);
};
/**************************************************************************************************/
}
}
/*********************************************************************************************************************************/
-
+//[3][4] * [4][5] - columns in first must match rows in second, returns matrix[3][5]
vector<vector<double> > LinearAlgebra::matrix_mult(vector<vector<double> > first, vector<vector<double> > second){
try {
vector<vector<double> > product;
}
}
+/*********************************************************************************************************************************/
+vector<vector<double> > LinearAlgebra::transpose(vector<vector<double> >matrix){
+ try {
+ vector<vector<double> > trans; trans.resize(matrix[0].size());
+ for (int i = 0; i < trans.size(); i++) {
+ for (int j = 0; j < matrix.size(); j++) { trans[i].push_back(matrix[j][i]); }
+ }
+
+ return trans;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "LinearAlgebra", "transpose");
+ exit(1);
+ }
+
+}
/*********************************************************************************************************************************/
void LinearAlgebra::recenter(double offset, vector<vector<double> > D, vector<vector<double> >& G){
}
}
/*********************************************************************************************************************************/
+//python random.normalvariate - thanks http://madssj.com/blog/2008/05/07/porting-normalvariate-from-python-to-c/
+double LinearAlgebra::normalvariate(double mu, double sigma) {
+ try {
+ double NV_MAGICCONST = 1.7155277699214135; /* (4 * exp(-0.5) / sqrt(2.0)); */
+ unsigned long int MAX_RANDOM = 2147483647; /* (2 ** 31) - 1; */
+
+ double u1, u2, z, zz;
+ for (;;) {
+ if (m->control_pressed) { break; }
+ u1 = ((float)random()) / MAX_RANDOM;
+ u2 = 1.0 - (((float)random()) / MAX_RANDOM);
+ z = NV_MAGICCONST * (u1 - 0.5) / u2;
+ zz = z * z / 4.0;
+ if (zz <= -(log(u2))) {
+ break;
+ }
+ }
+ return mu + z * sigma;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "LinearAlgebra", "normalvariate");
+ exit(1);
+ }
+}
+/*********************************************************************************************************************************/
//thanks http://www.johndcook.com/cpp_phi.html
double LinearAlgebra::pnorm(double x){
try {
exit(1);
}
}
+/*********************************************************************************************************************************/
+//modelled R lda function - MASS:::lda.default
+vector< vector<double> > LinearAlgebra::lda(vector< vector<double> >& a, vector<string> groups, vector< vector<double> >& means, bool& ignore) {
+ try {
+
+ set<string> uniqueGroups;
+ for (int i = 0; i < groups.size(); i++) { uniqueGroups.insert(groups[i]); }
+ int numGroups = uniqueGroups.size();
+
+ map<string, int> quickIndex; //className to index. hoping to save counts, proportions and means in vectors to save time. This map will allow us to know index 0 in counts refers to group1.
+ int count = 0;
+ for (set<string>::iterator it = uniqueGroups.begin(); it != uniqueGroups.end(); it++) { quickIndex[*it] = count; count++; }
+
+ int numSampled = groups.size(); //number of sampled groups
+ int numOtus = a.size(); //number of flagged bins
+
+ //counts <- as.vector(table(g)) //number of samples from each class in random sampling
+ vector<int> counts; counts.resize(numGroups, 0);
+ for (int i = 0; i < groups.size(); i++) {
+ counts[quickIndex[groups[i]]]++;
+ }
+
+ vector<double> proportions; proportions.resize(numGroups, 0.0);
+ for (int i = 0; i < numGroups; i++) { proportions[i] = counts[i] / (double) numSampled; }
+
+ means.clear(); //means[0] -> means[0][0] average for [group0][OTU0].
+ means.resize(numGroups); for (int i = 0; i < means.size(); i++) { means[i].resize(numOtus, 0.0); }
+ for (int j = 0; j < numSampled; j++) { //total for each class for each OTU
+ for (int i = 0; i < numOtus; i++) { means[quickIndex[groups[j]]][i] += a[i][j]; }
+ }
+ //average for each class for each OTU
+ for (int j = 0; j < numGroups; j++) { for (int i = 0; i < numOtus; i++) { means[j][i] /= counts[j]; } }
+
+ //randCov <- x - group.means[g, ]
+ vector< vector<double> > randCov; //randCov[0][0] -> (random sample value0 for OTU0 - average for samples group in OTU0). example OTU0, random sample 0.01 from class early. average of class early for OTU0 is 0.005. randCov[0][0] = (0.01-0.005)
+ for (int i = 0; i < numOtus; i++) { //for each flagged OTU
+ vector<double> tempRand;
+ for (int j = 0; j < numSampled; j++) { tempRand.push_back(a[i][j] - means[quickIndex[groups[j]]][i]); }
+ randCov.push_back(tempRand);
+ }
+
+ //find variance and std for each OTU
+ //f1 <- sqrt(diag(var(x - group.means[g, ])))
+ vector<double> stdF1;
+ vector<double> ave;
+ for (int i = 0; i < numOtus; i++) {
+ stdF1.push_back(0.0);
+ ave.push_back(m->getAverage(randCov[i]));
+ }
+
+ for (int i = 0; i < numOtus; i++) {
+ for (int j = 0; j < numSampled; j++) { stdF1[i] += ((randCov[i][j] - ave[i]) * (randCov[i][j] - ave[i])); }
+ }
+
+ //fac <- 1/(n - ng)
+ double fac = 1 / (double) (numSampled-numGroups);
+
+ for (int i = 0; i < stdF1.size(); i++) {
+ stdF1[i] /= (double) (numSampled-1);
+ stdF1[i] = sqrt(stdF1[i]);
+ }
+
+ vector< vector<double> > scaling; //[numOTUS][numOTUS]
+ for (int i = 0; i < numOtus; i++) {
+ vector<double> temp;
+ for (int j = 0; j < numOtus; j++) {
+ if (i == j) { temp.push_back(1.0/stdF1[i]); }
+ else { temp.push_back(0.0); }
+
+ }
+ scaling.push_back(temp);
+ }
+ /*
+ cout << "scaling = " << endl;
+ for (int i = 0; i < scaling.size(); i++) {
+ for (int j = 0; j < scaling[i].size(); j++) { cout << scaling[i][j] << '\t'; }
+ cout << endl;
+ }*/
+
+ //X <- sqrt(fac) * ((x - group.means[g, ]) %*% scaling)
+ vector< vector<double> > X = randCov; //[numOTUS][numSampled]
+ //((x - group.means[g, ]) %*% scaling)
+ //matrix multiplication of randCov and scaling
+ LinearAlgebra linear;
+ X = linear.matrix_mult(scaling, randCov); //[numOTUS][numOTUS] * [numOTUS][numSampled] = [numOTUS][numSampled]
+ fac = sqrt(fac);
+
+ for (int i = 0; i < X.size(); i++) {
+ for (int j = 0; j < X[i].size(); j++) { X[i][j] *= fac; }
+ }
+
+ vector<double> d;
+ vector< vector<double> > v;
+ vector< vector<double> > Xcopy; //X = [numOTUS][numSampled]
+ bool transpose = false; //svd requires rows < columns, so if they are not then I need to transpose and look for the results in v.
+ if (X.size() < X[0].size()) { Xcopy = linear.transpose(X); transpose=true; }
+ else { Xcopy = X; }
+ linear.svd(Xcopy, d, v); //Xcopy gets the results we want for v below, because R's version is [numSampled][numOTUS]
+
+ /*cout << "Xcopy = " << endl;
+ for (int i = 0; i < Xcopy.size(); i++) {
+ for (int j = 0; j < Xcopy[i].size(); j++) { cout << Xcopy[i][j] << '\t'; }
+ cout << endl;
+ }
+ cout << "v = " << endl;
+ for (int i = 0; i < v.size(); i++) {
+ for (int j = 0; j < v[i].size(); j++) { cout << v[i][j] << '\t'; }
+ cout << endl;
+ }
+ */
+
+ int rank = 0;
+ set<int> goodColumns;
+ //cout << "d = " << endl;
+ for (int i = 0; i < d.size(); i++) { if (d[i] > 0.0000000001) { rank++; goodColumns.insert(i); } } //cout << d[i] << endl;
+
+ if (rank == 0) {
+ ignore=true; //m->mothurOut("[ERROR]: rank = 0: variables are numerically const\n"); m->control_pressed = true;
+ return scaling; }
+
+ //scaling <- scaling %*% X.s$v[, 1L:rank] %*% diag(1/X.s$d[1L:rank], , rank)
+ //X.s$v[, 1L:rank] = columns in Xcopy that correspond to "good" d values
+ //diag(1/X.s$d[1L:rank], , rank) = matrix size rank * rank where the diagonal is 1/"good" dvalues
+ /*example:
+ d
+ [1] 3.721545e+00 3.034607e+00 2.296649e+00 7.986927e-16 6.922408e-16
+ [6] 5.471102e-16
+
+ $v
+ [,1] [,2] [,3] [,4] [,5] [,6]
+ [1,] 0.31122175 0.10944725 0.20183340 -0.30136820 0.60786235 -0.13537095
+ [2,] -0.29563726 -0.20568893 0.11233366 -0.05073289 0.48234270 0.21965978
+ ...
+
+ [1] "X.s$v[, 1L:rank]"
+ [,1] [,2] [,3]
+ [1,] 0.31122175 0.10944725 0.20183340
+ [2,] -0.29563726 -0.20568893 0.11233366
+ ...
+ [1] "1/X.s$d[1L:rank]"
+ [1] 0.2687056 0.3295320 0.4354170
+
+ [1] "diag(1/X.s$d[1L:rank], , rank)"
+ [,1] [,2] [,3]
+ [1,] 0.2687056 0.000000 0.000000
+ [2,] 0.0000000 0.329532 0.000000
+ [3,] 0.0000000 0.000000 0.435417
+ */
+ if (transpose) {
+ Xcopy = linear.transpose(v);
+ /*
+ cout << "Xcopy = " << endl;
+ for (int i = 0; i < Xcopy.size(); i++) {
+ for (int j = 0; j < Xcopy[i].size(); j++) { cout << Xcopy[i][j] << '\t'; }
+ cout << endl;
+ }*/
+ }
+ v.clear(); //store "good" columns - X.s$v[, 1L:rank]
+ v.resize(Xcopy.size()); //[numOTUS]["good" columns]
+ for (set<int>::iterator it = goodColumns.begin(); it != goodColumns.end(); it++) {
+ for (int i = 0; i < Xcopy.size(); i++) {
+ v[i].push_back(Xcopy[i][*it]);
+ }
+ }
+
+ vector< vector<double> > diagRanks; diagRanks.resize(rank);
+ for (int i = 0; i < rank; i++) { diagRanks[i].resize(rank, 0.0); }
+ count = 0;
+ for (set<int>::iterator it = goodColumns.begin(); it != goodColumns.end(); it++) { diagRanks[count][count] = 1.0 / d[*it]; count++; }
+
+ scaling = linear.matrix_mult(linear.matrix_mult(scaling, v), diagRanks); //([numOTUS][numOTUS]*[numOTUS]["good" columns]) = [numOTUS]["good" columns] then ([numOTUS]["good" columns] * ["good" columns]["good" columns] = scaling = [numOTUS]["good" columns]
+
+ /*cout << "scaling = " << endl;
+ for (int i = 0; i < scaling.size(); i++) {
+ for (int j = 0; j < scaling[i].size(); j++) { cout << scaling[i][j] << '\t'; }
+ cout << endl;
+ }*/
+
+ //Note: linear.matrix_mult [1][numGroups] * [numGroups][numOTUs] - columns in first must match rows in second, returns matrix[1][numOTUs]
+ vector< vector<double> > prior; prior.push_back(proportions);
+ vector< vector<double> > xbar = linear.matrix_mult(prior, means);
+ vector<double> xBar = xbar[0]; //length numOTUs
+
+ /*cout << "xbar" << endl;
+ for (int j = 0; j < numOtus; j++) { cout << xBar[j] <<'\t'; }cout << endl;*/
+ //fac <- 1/(ng - 1)
+ fac = 1 / (double) (numGroups-1);
+ //scale(group.means, center = xbar, scale = FALSE) %*% scaling
+ vector< vector<double> > scaledMeans = means; //[numGroups][numOTUs]
+ for (int i = 0; i < numGroups; i++) {
+ for (int j = 0; j < numOtus; j++) { scaledMeans[i][j] -= xBar[j]; }
+ }
+ scaledMeans = linear.matrix_mult(scaledMeans, scaling); //[numGroups][numOTUS]*[numOTUS]["good"columns] = [numGroups]["good"columns]
+
+
+ //sqrt((n * prior) * fac)
+ vector<double> temp = proportions; //[numGroups]
+ for (int i = 0; i < temp.size(); i++) { temp[i] *= numSampled * fac; temp[i] = sqrt(temp[i]); }
+
+ //X <- sqrt((n * prior) * fac) * (scale(group.means, center = xbar, scale = FALSE) %*% scaling)
+ //X <- temp * scaledMeans
+ X.clear(); X = scaledMeans; //[numGroups]["good"columns]
+ for (int i = 0; i < X.size(); i++) {
+ for (int j = 0; j < X[i].size(); j++) { X[i][j] *= temp[j]; }
+ }
+ /*
+ cout << "X = " << endl;
+ for (int i = 0; i < X.size(); i++) {
+ for (int j = 0; j < X[i].size(); j++) { cout << X[i][j] << '\t'; }
+ cout << endl;
+ }
+ */
+
+ d.clear(); v.clear();
+ //we want to transpose so results are in Xcopy, but if that makes rows > columns then we don't since svd requires rows < cols.
+ transpose=false;
+ if (X.size() > X[0].size()) { Xcopy = X; transpose=true; }
+ else { Xcopy = linear.transpose(X); }
+ linear.svd(Xcopy, d, v); //Xcopy gets the results we want for v below
+ /*cout << "Xcopy = " << endl;
+ for (int i = 0; i < Xcopy.size(); i++) {
+ for (int j = 0; j < Xcopy[i].size(); j++) { cout << Xcopy[i][j] << '\t'; }
+ cout << endl;
+ }
+
+ cout << "v = " << endl;
+ for (int i = 0; i < v.size(); i++) {
+ for (int j = 0; j < v[i].size(); j++) { cout << v[i][j] << '\t'; }
+ cout << endl;
+ }
+
+ cout << "d = " << endl;
+ for (int i = 0; i < d.size(); i++) { cout << d[i] << endl; }*/
+
+ //rank <- sum(X.s$d > tol * X.s$d[1L])
+ //X.s$d[1L] = larger value in d vector
+ double largeD = m->max(d);
+ rank = 0; goodColumns.clear();
+ for (int i = 0; i < d.size(); i++) { if (d[i] > (0.0000000001*largeD)) { rank++; goodColumns.insert(i); } }
+
+ if (rank == 0) {
+ ignore=true;//m->mothurOut("[ERROR]: rank = 0: class means are numerically identical.\n"); m->control_pressed = true;
+ return scaling; }
+
+ if (transpose) { Xcopy = linear.transpose(v); }
+ //scaling <- scaling %*% X.s$v[, 1L:rank] - scaling * "good" columns
+ v.clear(); //store "good" columns - X.s$v[, 1L:rank]
+ v.resize(Xcopy.size()); //Xcopy = ["good"columns][numGroups]
+ for (set<int>::iterator it = goodColumns.begin(); it != goodColumns.end(); it++) {
+ for (int i = 0; i < Xcopy.size(); i++) {
+ v[i].push_back(Xcopy[i][*it]);
+ }
+ }
+
+ scaling = linear.matrix_mult(scaling, v); //[numOTUS]["good" columns] * ["good"columns][new "good" columns]
+
+ /*cout << "scaling = " << endl;
+ for (int i = 0; i < scaling.size(); i++) {
+ for (int j = 0; j < scaling[i].size(); j++) { cout << scaling[i][j] << '\t'; }
+ cout << endl;
+ }*/
+ ignore=false;
+ return scaling;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "LinearAlgebra", "lda");
+ exit(1);
+ }
+}
+/*********************************************************************************************************************************/
+//Singular value decomposition (SVD) - adapted from http://svn.lirec.eu/libs/magicsquares/src/SVD.cpp
+/*
+ * svdcomp - SVD decomposition routine.
+ * Takes an mxn matrix a and decomposes it into udv, where u,v are
+ * left and right orthogonal transformation matrices, and d is a
+ * diagonal matrix of singular values.
+ *
+ * This routine is adapted from svdecomp.c in XLISP-STAT 2.1 which is
+ * code from Numerical Recipes adapted by Luke Tierney and David Betz.
+ *
+ * Input to dsvd is as follows:
+ * a = mxn matrix to be decomposed, gets overwritten with u
+ * m = row dimension of a
+ * n = column dimension of a
+ * w = returns the vector of singular values of a
+ * v = returns the right orthogonal transformation matrix
+ */
+
+int LinearAlgebra::svd(vector< vector<double> >& a, vector<double>& w, vector< vector<double> >& v) {
+ try {
+ int flag, i, its, j, jj, k, l, nm;
+ double c, f, h, s, x, y, z;
+ double anorm = 0.0, g = 0.0, scale = 0.0;
+
+ int numRows = a.size(); if (numRows == 0) { return 0; }
+ int numCols = a[0].size();
+ w.resize(numCols, 0.0);
+ v.resize(numCols); for (int i = 0; i < numCols; i++) { v[i].resize(numRows, 0.0); }
+
+ vector<double> rv1; rv1.resize(numCols, 0.0);
+ if (numRows < numCols){ m->mothurOut("[ERROR]: numRows < numCols\n"); m->control_pressed = true; return 0; }
+
+ /* Householder reduction to bidiagonal form */
+ for (i = 0; i < numCols; i++)
+ {
+ /* left-hand reduction */
+ l = i + 1;
+ rv1[i] = scale * g;
+ g = s = scale = 0.0;
+ if (i < numRows)
+ {
+ for (k = i; k < numRows; k++)
+ scale += fabs((double)a[k][i]);
+ if (scale)
+ {
+ for (k = i; k < numRows; k++)
+ {
+ a[k][i] = (double)((double)a[k][i]/scale);
+ s += ((double)a[k][i] * (double)a[k][i]);
+ }
+ f = (double)a[i][i];
+ g = -SIGN(sqrt(s), f);
+ h = f * g - s;
+ a[i][i] = (double)(f - g);
+ if (i != numCols - 1)
+ {
+ for (j = l; j < numCols; j++)
+ {
+ for (s = 0.0, k = i; k < numRows; k++)
+ s += ((double)a[k][i] * (double)a[k][j]);
+ f = s / h;
+ for (k = i; k < numRows; k++)
+ a[k][j] += (double)(f * (double)a[k][i]);
+ }
+ }
+ for (k = i; k < numRows; k++)
+ a[k][i] = (double)((double)a[k][i]*scale);
+ }
+ }
+ w[i] = (double)(scale * g);
+
+ /* right-hand reduction */
+ g = s = scale = 0.0;
+ if (i < numRows && i != numCols - 1)
+ {
+ for (k = l; k < numCols; k++)
+ scale += fabs((double)a[i][k]);
+ if (scale)
+ {
+ for (k = l; k < numCols; k++)
+ {
+ a[i][k] = (double)((double)a[i][k]/scale);
+ s += ((double)a[i][k] * (double)a[i][k]);
+ }
+ f = (double)a[i][l];
+ g = -SIGN(sqrt(s), f);
+ h = f * g - s;
+ a[i][l] = (double)(f - g);
+ for (k = l; k < numCols; k++)
+ rv1[k] = (double)a[i][k] / h;
+ if (i != numRows - 1)
+ {
+ for (j = l; j < numRows; j++)
+ {
+ for (s = 0.0, k = l; k < numCols; k++)
+ s += ((double)a[j][k] * (double)a[i][k]);
+ for (k = l; k < numCols; k++)
+ a[j][k] += (double)(s * rv1[k]);
+ }
+ }
+ for (k = l; k < numCols; k++)
+ a[i][k] = (double)((double)a[i][k]*scale);
+ }
+ }
+ anorm = max(anorm, (fabs((double)w[i]) + fabs(rv1[i])));
+ }
+
+ /* accumulate the right-hand transformation */
+ for (i = numCols - 1; i >= 0; i--)
+ {
+ if (i < numCols - 1)
+ {
+ if (g)
+ {
+ for (j = l; j < numCols; j++)
+ v[j][i] = (double)(((double)a[i][j] / (double)a[i][l]) / g);
+ /* double division to avoid underflow */
+ for (j = l; j < numCols; j++)
+ {
+ for (s = 0.0, k = l; k < numCols; k++)
+ s += ((double)a[i][k] * (double)v[k][j]);
+ for (k = l; k < numCols; k++)
+ v[k][j] += (double)(s * (double)v[k][i]);
+ }
+ }
+ for (j = l; j < numCols; j++)
+ v[i][j] = v[j][i] = 0.0;
+ }
+ v[i][i] = 1.0;
+ g = rv1[i];
+ l = i;
+ }
+
+ /* accumulate the left-hand transformation */
+ for (i = numCols - 1; i >= 0; i--)
+ {
+ l = i + 1;
+ g = (double)w[i];
+ if (i < numCols - 1)
+ for (j = l; j < numCols; j++)
+ a[i][j] = 0.0;
+ if (g)
+ {
+ g = 1.0 / g;
+ if (i != numCols - 1)
+ {
+ for (j = l; j < numCols; j++)
+ {
+ for (s = 0.0, k = l; k < numRows; k++)
+ s += ((double)a[k][i] * (double)a[k][j]);
+ f = (s / (double)a[i][i]) * g;
+ for (k = i; k < numRows; k++)
+ a[k][j] += (double)(f * (double)a[k][i]);
+ }
+ }
+ for (j = i; j < numRows; j++)
+ a[j][i] = (double)((double)a[j][i]*g);
+ }
+ else
+ {
+ for (j = i; j < numRows; j++)
+ a[j][i] = 0.0;
+ }
+ ++a[i][i];
+ }
+
+ /* diagonalize the bidiagonal form */
+ for (k = numCols - 1; k >= 0; k--)
+ { /* loop over singular values */
+ for (its = 0; its < 30; its++)
+ { /* loop over allowed iterations */
+ flag = 1;
+ for (l = k; l >= 0; l--)
+ { /* test for splitting */
+ nm = l - 1;
+ if (fabs(rv1[l]) + anorm == anorm)
+ {
+ flag = 0;
+ break;
+ }
+ if (fabs((double)w[nm]) + anorm == anorm)
+ break;
+ }
+ if (flag)
+ {
+ c = 0.0;
+ s = 1.0;
+ for (i = l; i <= k; i++)
+ {
+ f = s * rv1[i];
+ if (fabs(f) + anorm != anorm)
+ {
+ g = (double)w[i];
+ h = pythag(f, g);
+ w[i] = (double)h;
+ h = 1.0 / h;
+ c = g * h;
+ s = (- f * h);
+ for (j = 0; j < numRows; j++)
+ {
+ y = (double)a[j][nm];
+ z = (double)a[j][i];
+ a[j][nm] = (double)(y * c + z * s);
+ a[j][i] = (double)(z * c - y * s);
+ }
+ }
+ }
+ }
+ z = (double)w[k];
+ if (l == k)
+ { /* convergence */
+ if (z < 0.0)
+ { /* make singular value nonnegative */
+ w[k] = (double)(-z);
+ for (j = 0; j < numCols; j++)
+ v[j][k] = (-v[j][k]);
+ }
+ break;
+ }
+ if (its >= 30) {
+ m->mothurOut("No convergence after 30,000! iterations \n"); m->control_pressed = true;
+ return(0);
+ }
+
+ /* shift from bottom 2 x 2 minor */
+ x = (double)w[l];
+ nm = k - 1;
+ y = (double)w[nm];
+ g = rv1[nm];
+ h = rv1[k];
+ f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
+ g = pythag(f, 1.0);
+ f = ((x - z) * (x + z) + h * ((y / (f + SIGN(g, f))) - h)) / x;
+
+ /* next QR transformation */
+ c = s = 1.0;
+ for (j = l; j <= nm; j++)
+ {
+ i = j + 1;
+ g = rv1[i];
+ y = (double)w[i];
+ h = s * g;
+ g = c * g;
+ z = pythag(f, h);
+ rv1[j] = z;
+ c = f / z;
+ s = h / z;
+ f = x * c + g * s;
+ g = g * c - x * s;
+ h = y * s;
+ y = y * c;
+ for (jj = 0; jj < numCols; jj++)
+ {
+ x = (double)v[jj][j];
+ z = (double)v[jj][i];
+ v[jj][j] = (float)(x * c + z * s);
+ v[jj][i] = (float)(z * c - x * s);
+ }
+ z = pythag(f, h);
+ w[j] = (float)z;
+ if (z)
+ {
+ z = 1.0 / z;
+ c = f * z;
+ s = h * z;
+ }
+ f = (c * g) + (s * y);
+ x = (c * y) - (s * g);
+ for (jj = 0; jj < numRows; jj++)
+ {
+ y = (double)a[jj][j];
+ z = (double)a[jj][i];
+ a[jj][j] = (double)(y * c + z * s);
+ a[jj][i] = (double)(z * c - y * s);
+ }
+ }
+ rv1[l] = 0.0;
+ rv1[k] = f;
+ w[k] = (double)x;
+ }
+ }
+
+ return(0);
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "LinearAlgebra", "svd");
+ exit(1);
+ }
+}
+
+
~LinearAlgebra() {}
vector<vector<double> > matrix_mult(vector<vector<double> >, vector<vector<double> >);
+ vector<vector<double> >transpose(vector<vector<double> >);
void recenter(double, vector<vector<double> >, vector<vector<double> >&);
- int tred2(vector<vector<double> >&, vector<double>&, vector<double>&);
+ //eigenvectors
+ int tred2(vector<vector<double> >&, vector<double>&, vector<double>&);
int qtli(vector<double>&, vector<double>&, vector<vector<double> >&);
+
vector< vector<double> > calculateEuclidianDistance(vector<vector<double> >&, int); //pass in axes and number of dimensions
vector< vector<double> > calculateEuclidianDistance(vector<vector<double> >&); //pass in axes
vector<vector<double> > getObservedEuclideanDistance(vector<vector<double> >&);
vector<float> solveEquations(vector<vector<float> >, vector<float>);
vector<vector<double> > getInverse(vector<vector<double> >);
double choose(double, double);
-
+ double normalvariate(double mu, double sigma);
+ vector< vector<double> > lda(vector< vector<double> >& a, vector<string> groups, vector< vector<double> >& means, bool&); //Linear discriminant analysis - a is [features][valuesFromGroups] groups indicates which group each sampling comes from. For example if groups = early, late, mid, early, early. a[0][0] = value for feature0 from groupEarly.
+ int svd(vector< vector<double> >& a, vector<double>& w, vector< vector<double> >& v); //Singular value decomposition
private:
MothurOut* m;
void ludcmp(vector<vector<float> >&, vector<int>&, float&);
void lubksb(vector<vector<float> >&, vector<int>&, vector<float>&);
-
-
};
#endif
out << "{\n" + spaces + "\"id\":\"" + sharedfile + "-" + lookup[0]->getLabel() + "\",\n" + spaces + "\"format\": \"Biological Observation Matrix 0.9.1\",\n" + spaces + "\"format_url\": \"http://biom-format.org\",\n";
out << spaces + "\"type\": \"OTU table\",\n" + spaces + "\"generated_by\": \"" << mothurString << "\",\n" + spaces + "\"date\": \"" << dateString << "\",\n";
+ int numBins = lookup[0]->getNumBins();
vector<string> metadata = getMetaData(lookup);
if (m->control_pressed) { out.close(); return 0; }
out << spaces + "\"rows\":[\n";
string rowFront = spaces + spaces + "{\"id\":\"";
string rowBack = "\", \"metadata\":";
- for (int i = 0; i < m->currentBinLabels.size()-1; i++) {
+ for (int i = 0; i < numBins-1; i++) {
if (m->control_pressed) { out.close(); return 0; }
out << rowFront << m->currentBinLabels[i] << rowBack << metadata[i] << "},\n";
}
- out << rowFront << m->currentBinLabels[(m->currentBinLabels.size()-1)] << rowBack << metadata[(m->currentBinLabels.size()-1)] << "}\n" + spaces + "],\n";
+ out << rowFront << m->currentBinLabels[(numBins-1)] << rowBack << metadata[(numBins-1)] << "}\n" + spaces + "],\n";
//get column info
/*"columns": [
//**********************************************************************************************************************
int MakeBiomCommand::getSampleMetaData(vector<SharedRAbundVector*>& lookup){
try {
-
- if (metadatafile == "") { for (int i = 0; i < lookup.size(); i++) { sampleMetadata.push_back("null"); } }
+ sampleMetadata.clear();
+ if (metadatafile == "") { for (int i = 0; i < lookup.size(); i++) { sampleMetadata.push_back("null"); } }
else {
ifstream in;
m->openInputFile(metadatafile, in);
CommandParameter prqual("rqfile", "InputTypes", "", "", "none", "none", "qfileGroup","",false,false,true); parameters.push_back(prqual);
CommandParameter pfile("file", "InputTypes", "", "", "FastaFastqFile", "FastaFastqFile", "none","fasta-qfile",false,false,true); parameters.push_back(pfile);
CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none","group",false,false,true); parameters.push_back(poligos);
+ CommandParameter pfindex("findex", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(pfindex);
+ CommandParameter prindex("rindex", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(prindex);
CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(ppdiffs);
CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(pbdiffs);
CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(ptdiffs);
string MakeContigsCommand::getHelpString(){
try {
string helpString = "";
- helpString += "The make.contigs command reads a file, forward fastq file and a reverse fastq file or forward fasta and reverse fasta files and outputs new fasta. It will also provide new quality files if the fastq or file parameter is used.\n";
+ helpString += "The make.contigs command reads a file, forward fastq file and a reverse fastq file or forward fasta and reverse fasta files and outputs new fasta. \n";
helpString += "If an oligos file is provided barcodes and primers will be trimmed, and a group file will be created.\n";
- helpString += "The make.contigs command parameters are file, ffastq, rfastq, ffasta, rfasta, fqfile, rqfile, oligos, format, tdiffs, bdiffs, pdiffs, align, match, mismatch, gapopen, gapextend, insert, deltaq, allfiles and processors.\n";
+ helpString += "If a forward index or reverse index file is provided barcodes be trimmed, and a group file will be created. The oligos parameter is required if an index file is given.\n";
+ helpString += "The make.contigs command parameters are file, ffastq, rfastq, ffasta, rfasta, fqfile, rqfile, oligos, findex, rindex, format, tdiffs, bdiffs, pdiffs, align, match, mismatch, gapopen, gapextend, insert, deltaq, allfiles and processors.\n";
helpString += "The ffastq and rfastq, file, or ffasta and rfasta parameters are required.\n";
- helpString += "The file parameter is 2 or 3 column file containing the forward fastq files in the first column and their matching reverse fastq files in the second column, or a groupName then forward fastq file and reverse fastq file. Mothur will process each pair and create a combined fasta and report file with all the sequences.\n";
+ helpString += "The file parameter is 2, 3 or 4 column file containing the forward fastq files in the first column and their matching reverse fastq files in the second column, or a groupName then forward fastq file and reverse fastq file, or forward fastq file then reverse fastq then forward index and reverse index file. If you only have one index file add 'none' for the other one. Mothur will process each pair and create a combined fasta and report file with all the sequences.\n";
helpString += "The ffastq and rfastq parameters are used to provide a forward fastq and reverse fastq file to process. If you provide one, you must provide the other.\n";
helpString += "The ffasta and rfasta parameters are used to provide a forward fasta and reverse fasta file to process. If you provide one, you must provide the other.\n";
helpString += "The fqfile and rqfile parameters are used to provide a forward quality and reverse quality files to process with the ffasta and rfasta parameters. If you provide one, you must provide the other.\n";
helpString += "The format parameter is used to indicate whether your sequences are sanger, solexa, illumina1.8+ or illumina, default=illumina1.8+.\n";
+ helpString += "The findex and rindex parameters are used to provide a forward index and reverse index files to process. \n";
helpString += "The align parameter allows you to specify the alignment method to use. Your options are: gotoh and needleman. The default is needleman.\n";
helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n";
helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
//if the user has not given a path then, add inputdir. else leave path alone.
if (path == "") { parameters["oligos"] = inputDir + it->second; }
}
+
+ it = parameters.find("findex");
+ //user has given a template file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["findex"] = inputDir + it->second; }
+ }
+
+ it = parameters.find("rindex");
+ //user has given a template file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["rindex"] = inputDir + it->second; }
+ }
}
ffastqfile = validParameter.validFile(parameters, "ffastq", true);
else if(oligosfile == "not open") { abort = true; }
else { m->setOligosFile(oligosfile); }
+ findexfile = validParameter.validFile(parameters, "findex", true);
+ if (findexfile == "not found") { findexfile = ""; }
+ else if(findexfile == "not open") { abort = true; }
+
+ rindexfile = validParameter.validFile(parameters, "rindex", true);
+ if (rindexfile == "not found") { rindexfile = ""; }
+ else if(rindexfile == "not open") { abort = true; }
+
+ if ((rindexfile != "") || (findexfile != "")) {
+ if (oligosfile == ""){
+ oligosfile = m->getOligosFile();
+ if (oligosfile != "") { m->mothurOut("Using " + oligosfile + " as input file for the oligos parameter.\n"); }
+ else {
+ m->mothurOut("You need to provide an oligos file if you are going to use an index file.\n"); abort = true;
+ }
+ }
+
+ //can only use an index file with the fastq parameters not fasta and qual
+ if ((ffastafile != "") || (rfastafile != "")) {
+ m->mothurOut("[ERROR]: You can only use an index file with the fastq parameters or the file option.\n"); abort = true;
+ }
+ }
+
//if the user changes the output directory command factory will send this info to us in the output parameter
outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
outputDir = "";
m->mothurOut("Making contigs...\n");
createProcesses(filesToProcess[l], outFastaFile, outScrapFastaFile, outMisMatchFile, fastaFileNames, l);
- m->mothurOut("Here...\n");
//remove temp fasta and qual files
for (int i = 0; i < processors; i++) { for(int j = 0; j < filesToProcess[l][i].size(); j++) { m->mothurRemove(filesToProcess[l][i][j]); } }
vector< vector< vector<string> > > filesToProcess;
if (ffastqfile != "") { //reading one file
- vector< vector<string> > files = readFastqFiles(numReads, ffastqfile, rfastqfile);
+ vector< vector<string> > files = readFastqFiles(numReads, ffastqfile, rfastqfile, findexfile, rindexfile);
//adjust for really large processors or really small files
if (numReads == 0) { m->mothurOut("[ERROR]: no good reads.\n"); m->control_pressed = true; }
if (numReads < processors) {
if (m->control_pressed) { for (int l = 0; l < filesToProcess.size(); l++) { for (int k = 0; k < filesToProcess[l].size(); k++) { for(int j = 0; j < filesToProcess[l][k].size(); j++) { m->mothurRemove(filesToProcess[l][k][j]); } filesToProcess[l][k].clear(); } return filesToProcess; } }
unsigned long int thisFilesReads;
- vector< vector<string> > files = readFastqFiles(thisFilesReads, filePairsToProcess[i][0], filePairsToProcess[i][1]);
+ vector< vector<string> > files = readFastqFiles(thisFilesReads, filePairsToProcess[i][0], filePairsToProcess[i][1], filePairsToProcess[i][2], filePairsToProcess[i][3]);
//adjust for really large processors or really small files
if (thisFilesReads < processors) {
m->mothurOut("[ERROR]: " + filePairsToProcess[i][0] + " has less than " + toString(processors) + " good reads, skipping\n");
for (int k = 0; k < files.size(); k++) { for(int j = 0; j < files[k].size(); j++) { m->mothurRemove(files[k][j]); } files[k].clear(); }
+ //remove from file2Group if necassary
+ map<int, string> cFile2Group;
+ for (map<int, string>::iterator it = file2Group.begin(); it != file2Group.end(); it++) {
+ if ((it->first) < i) { cFile2Group[it->first] = it->second; }
+ else if ((it->first) == i) { } //do nothing, we removed files for i
+ else { cFile2Group[(it->first-1)] = it->second; } //adjust files because i was removed
+ }
+ file2Group = cFile2Group;
}else {
filesToProcess.push_back(files);
numReads += thisFilesReads;
string thisfqualfile = files[1];
string thisrfastafile = files[2];
string thisrqualfile = files[3];
+ string thisfindexfile = files[4];
+ string thisrindexfile = files[5];
- if (m->debug) { m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n"); }
+ if (m->debug) { m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n[DEBUG]: findex = " + thisfindexfile + ".\n[DEBUG]: rindex = " + thisrindexfile + ".\n"); }
- ifstream inFFasta, inRFasta, inFQual, inRQual;
+ ifstream inFFasta, inRFasta, inFQual, inRQual, inFIndex, inRIndex;
ofstream outFasta, outMisMatch, outScrapFasta;
m->openInputFile(thisffastafile, inFFasta);
m->openInputFile(thisrfastafile, inRFasta);
m->openInputFile(thisfqualfile, inFQual);
m->openInputFile(thisrqualfile, inRQual);
}
+
+ if (thisfindexfile != "") { m->openInputFile(thisfindexfile, inFIndex); }
+ if (thisrindexfile != "") { m->openInputFile(thisrindexfile, inRIndex); }
+
m->openOutputFile(outputFasta, outFasta);
m->openOutputFile(outputScrapFasta, outScrapFasta);
m->openOutputFile(outputMisMatches, outMisMatch);
fQual = new QualityScores(inFQual); m->gobble(inFQual);
rQual = new QualityScores(inRQual); m->gobble(inRQual);
}
+ Sequence findexBarcode("findex", "NONE"); Sequence rindexBarcode("rindex", "NONE");
+ if (thisfindexfile != "") {
+ Sequence temp(inFIndex); m->gobble(inFIndex);
+ findexBarcode.setAligned(temp.getAligned());
+ }
+
+ if (thisrindexfile != "") {
+ Sequence temp(inRIndex); m->gobble(inRIndex);
+ rindexBarcode.setAligned(temp.getAligned());
+ }
int barcodeIndex = 0;
int primerIndex = 0;
-
+
if(barcodes.size() != 0){
if (thisfqualfile != "") {
- success = trimOligos.stripBarcode(fSeq, rSeq, *fQual, *rQual, barcodeIndex);
+ if ((thisfindexfile != "") || (thisrindexfile != "")) {
+ success = trimOligos.stripBarcode(findexBarcode, rindexBarcode, *fQual, *rQual, barcodeIndex);
+ }else {
+ success = trimOligos.stripBarcode(fSeq, rSeq, *fQual, *rQual, barcodeIndex);
+ }
}else {
success = trimOligos.stripBarcode(fSeq, rSeq, barcodeIndex);
}
}
}
//**********************************************************************************************************************
-vector< vector<string> > MakeContigsCommand::readFastqFiles(unsigned long int& count, string ffastq, string rfastq){
+vector< vector<string> > MakeContigsCommand::readFastqFiles(unsigned long int& count, string ffastq, string rfastq, string findex, string rindex){
try {
vector< vector<string> > files;
//maps processors number to file pointer
- map<int, vector<ofstream*> > tempfiles; //tempfiles[0] = forwardFasta, [1] = forwardQual, [2] = reverseFasta, [3] = reverseQual
+ map<int, vector<ofstream*> > tempfiles; //tempfiles[0] = forwardFasta, [1] = forwardQual, [2] = reverseFasta, [3] = reverseQual, tempfiles[4] = forwardIndex, [4] = forwardReverse
map<int, vector<ofstream*> >::iterator it;
//create files to write to
ofstream* outFQ = new ofstream; temp.push_back(outFQ);
ofstream* outRF = new ofstream; temp.push_back(outRF);
ofstream* outRQ = new ofstream; temp.push_back(outRQ);
+ ofstream* outFI = new ofstream; temp.push_back(outFI);
+ ofstream* outRI = new ofstream; temp.push_back(outRI);
tempfiles[i] = temp;
vector<string> names;
string rfastafilename = thisOutputDir + m->getRootName(m->getSimpleName(rfastq)) + toString(i) + "rfastatemp";
string fqualfilename = thisOutputDir + m->getRootName(m->getSimpleName(ffastq)) + toString(i) + "fqualtemp";
string rqualfilename = thisOutputDir + m->getRootName(m->getSimpleName(rfastq)) + toString(i) + "rqualtemp";
+ string findexfilename = ""; string rindexfilename = "";
+ noneOk = false; //flag to oligos file read that its okay to allow for non paired barcodes
+ if (findex != "") { findexfilename = thisOutputDir + m->getRootName(m->getSimpleName(findex)) + toString(i) + "findextemp"; m->openOutputFile(findexfilename, *outFI); noneOk = true; }
+ if (rindex != "") { rindexfilename = thisOutputDir + m->getRootName(m->getSimpleName(rindex)) + toString(i) + "rindextemp"; m->openOutputFile(rindexfilename, *outRI); noneOk = true; }
names.push_back(ffastafilename); names.push_back(fqualfilename);
names.push_back(rfastafilename); names.push_back(rqualfilename);
+ names.push_back(findexfilename); names.push_back(rindexfilename);
files.push_back(names);
m->openOutputFile(ffastafilename, *outFF);
for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } }
//remove files
for (int i = 0; i < files.size(); i++) {
- for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); }
+ for(int j = 0; j < files[i].size(); j++) { if (files[i][j] != "") { m->mothurRemove(files[i][j]); } }
}
}
ifstream inReverse;
m->openInputFile(rfastq, inReverse);
+ ifstream infIndex, inrIndex;
+ bool findexIsGood = false;
+ bool rindexIsGood = false;
+ if (findex != "") { m->openInputFile(findex, infIndex); findexIsGood = true; }
+ if (rindex != "") { m->openInputFile(rindex, inrIndex); rindexIsGood = true; }
+
count = 0;
map<string, fastqRead> uniques;
+ map<string, fastqRead> iUniques;
+ map<string, pairFastqRead> pairUniques;
map<string, fastqRead>::iterator itUniques;
- while ((!inForward.eof()) || (!inReverse.eof())) {
+ while ((!inForward.eof()) || (!inReverse.eof()) || (findexIsGood) || (rindexIsGood)) {
- if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } } inForward.close(); inReverse.close(); return files; }
+ if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { if (files[i][j] != "") { m->mothurRemove(files[i][j]); } } } inForward.close(); inReverse.close(); if (findex != "") { infIndex.close(); } if (findex != "") { inrIndex.close(); } return files; }
//get a read from forward and reverse fastq files
- bool ignoref, ignorer;
- fastqRead thisFread, thisRread;
+ bool ignoref, ignorer, ignorefi, ignoreri;
+ fastqRead thisFread, thisRread, thisFIread, thisRIread;
if (!inForward.eof()) { thisFread = readFastq(inForward, ignoref); }
else { ignoref = true; }
if (!inReverse.eof()) { thisRread = readFastq(inReverse, ignorer); }
else { ignorer = true; }
+ if (findexIsGood) { thisFIread = readFastq(infIndex, ignorefi); if (infIndex.eof()) { findexIsGood = false; } }
+ else { ignorefi = true; }
+ if (rindexIsGood) { thisRIread = readFastq(inrIndex, ignoreri); if (inrIndex.eof()) { rindexIsGood = false; } }
+ else { ignoreri = true; }
+
+ bool allowOne = false;
+ if ((findex == "") || (rindex == "")) { allowOne = true; }
+ vector<pairFastqRead> frReads = getReads(ignoref, ignorer, thisFread, thisRread, uniques, false);
+ vector<pairFastqRead> friReads = getReads(ignorefi, ignoreri, thisFIread, thisRIread, iUniques, allowOne);
+
+ //add in index info if provided
+ vector<pairFastqRead> reads = frReads;
+ if ((findex != "") || (rindex != "")) { reads = mergeReads(frReads, friReads, pairUniques); }
- vector<pairFastqRead> reads = getReads(ignoref, ignorer, thisFread, thisRread, uniques);
-
for (int i = 0; i < reads.size(); i++) {
fastqRead fread = reads[i].forward;
fastqRead rread = reads[i].reverse;
+ fastqRead firead = reads[i].findex;
+ fastqRead riread = reads[i].rindex;
- if (m->debug) { m->mothurOut(toString(count) + '\t' + fread.name + '\t' + rread.name + '\n'); }
+ if (m->debug) { m->mothurOut(toString(count) + '\t' + fread.name + '\t' + rread.name + '\n'); if (findex != "") { m->mothurOut(toString(count) + '\t' + firead.name + '\n'); } if (rindex != "") { m->mothurOut(toString(count) + '\t' + riread.name + '\n'); } }
//if (checkReads(fread, rread, ffastq, rfastq)) {
- if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { m->mothurRemove(files[i][j]); } } inForward.close(); inReverse.close(); return files; }
+ if (m->control_pressed) { for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } } for (int i = 0; i < files.size(); i++) { for(int j = 0; j < files[i].size(); j++) { if (files[i][j] != "") { m->mothurRemove(files[i][j]); } } } inForward.close(); inReverse.close(); if (findex != "") { infIndex.close(); } if (findex != "") { inrIndex.close(); } return files; }
//if the reads are okay write to output files
int process = count % processors;
*(tempfiles[process][3]) << ">" << rread.name << endl;
for (int i = 0; i < rread.scores.size(); i++) { *(tempfiles[process][3]) << rread.scores[i] << " "; }
*(tempfiles[process][3]) << endl;
-
+ if (findex != "") { *(tempfiles[process][4]) << ">" << firead.name << endl << firead.sequence << endl; }
+ if (rindex != "") { *(tempfiles[process][5]) << ">" << riread.name << endl << riread.sequence << endl; }
+
count++;
//report progress
for (itUniques = uniques.begin(); itUniques != uniques.end(); itUniques++) {
m->mothurOut("[WARNING]: did not find paired read for " + itUniques->first + ", ignoring.\n");
}
+ for (map<string, pairFastqRead>:: iterator it = pairUniques.begin(); it != pairUniques.end(); it++) {
+ m->mothurOut("[WARNING]: did not find paired read for " + (it->first).substr(1) + ", ignoring.\n");
+ }
m->mothurOutEndLine();
}
for (it = tempfiles.begin(); it!=tempfiles.end(); it++) { for (int i = 0; i < (it->second).size(); i++) { (*(it->second)[i]).close(); delete (it->second)[i]; } }
inForward.close();
inReverse.close();
-
+ if (findex != "") { infIndex.close(); }
+ if (rindex != "") { inrIndex.close(); }
+
return files;
}
catch(exception& e) {
if (fqualfile != "") { fqualfilename = thisOutputDir + m->getRootName(m->getSimpleName(fqualfile)) + toString(i) + "fqual.temp"; m->openOutputFile(fqualfilename, *outFQ); }
string rqualfilename = "";
if (rqualfile != "") { rqualfilename = thisOutputDir + m->getRootName(m->getSimpleName(rqualfile)) + toString(i) + "rqual.temp"; m->openOutputFile(rqualfilename, *outRQ); }
+ string findexfilename = ""; string rindexfilename = "";
names.push_back(ffastafilename); names.push_back(fqualfilename);
names.push_back(rfastafilename); names.push_back(rqualfilename);
+ names.push_back(findexfilename); names.push_back(rindexfilename);
files.push_back(names);
m->openOutputFile(ffastafilename, *outFF);
}else { ignorer = true; }
}
- vector<pairFastqRead> reads = getReads(ignoref, ignorer, thisFread, thisRread, uniques);
+ vector<pairFastqRead> reads = getReads(ignoref, ignorer, thisFread, thisRread, uniques, false);
for (int i = 0; i < reads.size(); i++) {
fastqRead fread = reads[i].forward;
}
}
//**********************************************************************************************************************
-vector<pairFastqRead> MakeContigsCommand::getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map<string, fastqRead>& uniques){
+vector<pairFastqRead> MakeContigsCommand::getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map<string, fastqRead>& uniques, bool allowOne){
try {
vector<pairFastqRead> reads;
map<string, fastqRead>::iterator itUniques;
}
}else if (!ignoref && ignorer) { //ignore reverse keep forward
- //look for forward pair
- itUniques = uniques.find(forward.name);
- if (itUniques != uniques.end()) { //we have the pair for this read
- pairFastqRead temp(forward, itUniques->second);
+ if (allowOne) {
+ fastqRead dummy;
+ pairFastqRead temp(forward, dummy);
reads.push_back(temp);
- uniques.erase(itUniques);
- }else { //save this read for later
- uniques[forward.name] = forward;
+ }else {
+ //look for forward pair
+ itUniques = uniques.find(forward.name);
+ if (itUniques != uniques.end()) { //we have the pair for this read
+ pairFastqRead temp(forward, itUniques->second);
+ reads.push_back(temp);
+ uniques.erase(itUniques);
+ }else { //save this read for later
+ uniques[forward.name] = forward;
+ }
}
-
}else if (ignoref && !ignorer) { //ignore forward keep reverse
- //look for reverse pair
- itUniques = uniques.find(reverse.name);
- if (itUniques != uniques.end()) { //we have the pair for this read
- pairFastqRead temp(itUniques->second, reverse);
+ if (allowOne) {
+ fastqRead dummy;
+ pairFastqRead temp(dummy, reverse);
reads.push_back(temp);
- uniques.erase(itUniques);
- }else { //save this read for later
- uniques[reverse.name] = reverse;
+ }else {
+ //look for reverse pair
+ itUniques = uniques.find(reverse.name);
+ if (itUniques != uniques.end()) { //we have the pair for this read
+ pairFastqRead temp(itUniques->second, reverse);
+ reads.push_back(temp);
+ uniques.erase(itUniques);
+ }else { //save this read for later
+ uniques[reverse.name] = reverse;
+ }
}
}//else ignore both and do nothing
}
}
//**********************************************************************************************************************
+//look through the reads from the forward and reverse files and try to find matching reads from index files.
+vector<pairFastqRead> MakeContigsCommand::mergeReads(vector<pairFastqRead> thisReads, vector<pairFastqRead> indexes, map<string, pairFastqRead>& uniques){
+ try {
+ vector<pairFastqRead> reads;
+ map<string, pairFastqRead>::iterator itUniques;
+
+ set<int> foundIndexes;
+ for (int i = 0; i < thisReads.size(); i++) {
+ bool found = false;
+ for (int j = 0; j < indexes.size(); j++) {
+
+ //incase only one index
+ string indexName = indexes[j].forward.name;
+ if (indexName == "") { indexName = indexes[j].reverse.name; }
+
+ if (thisReads[i].forward.name == indexName){
+ thisReads[i].findex = indexes[j].forward;
+ thisReads[i].rindex = indexes[j].reverse;
+ reads.push_back(thisReads[i]);
+ found = true;
+ foundIndexes.insert(j);
+ }
+ }
+
+ if (!found) {
+ //look for forward pair
+ itUniques = uniques.find('i'+thisReads[i].forward.name);
+ if (itUniques != uniques.end()) { //we have the pair for this read
+ thisReads[i].findex = itUniques->second.forward;
+ thisReads[i].rindex = itUniques->second.reverse;
+ reads.push_back(thisReads[i]);
+ uniques.erase(itUniques);
+ }else { //save this read for later
+ uniques['r'+thisReads[i].forward.name] = thisReads[i];
+ }
+ }
+ }
+
+ if (foundIndexes.size() != indexes.size()) { //if we didnt match all the indexes look for them in uniques
+ for (int j = 0; j < indexes.size(); j++) {
+ if (foundIndexes.count(j) == 0) { //we didnt find this one
+ //incase only one index
+ string indexName = indexes[j].forward.name;
+ if (indexName == "") { indexName = indexes[j].reverse.name; }
+
+ //look for forward pair
+ itUniques = uniques.find('r'+indexName);
+ if (itUniques != uniques.end()) { //we have the pair for this read
+ pairFastqRead temp(itUniques->second.forward, itUniques->second.reverse, indexes[j].forward, indexes[j].reverse);
+ reads.push_back(temp);
+ uniques.erase(itUniques);
+ }else { //save this read for later
+ uniques['i'+indexName] = indexes[j];
+ }
+ }
+ }
+ }
+
+
+ return reads;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "MakeContigsCommand", "mergeReads");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
fastqRead MakeContigsCommand::readFastq(ifstream& in, bool& ignore){
try {
fastqRead read;
read.name = name;
read.sequence = sequence;
read.scores = qualScores;
+
+ if (m->debug) { m->mothurOut("[DEBUG]: " + read.name + " " + read.sequence + " " + quality + "\n"); }
return read;
}
}
}*/
//***************************************************************************************************************
+//lines can be 2, 3, or 4 columns
+// forward.fastq reverse.fastq -> 2 column
+// groupName forward.fastq reverse.fastq -> 3 column
+// forward.fastq reverse.fastq forward.index.fastq reverse.index.fastq -> 4 column
+// forward.fastq reverse.fastq none reverse.index.fastq -> 4 column
+// forward.fastq reverse.fastq forward.index.fastq none -> 4 column
vector< vector<string> > MakeContigsCommand::readFileNames(string filename){
try {
vector< vector<string> > files;
- string forward, reverse;
+ string forward, reverse, findex, rindex;
ifstream in;
m->openInputFile(filename, in);
if (m->control_pressed) { return files; }
- in >> forward; m->gobble(in);
- in >> reverse;
+ string line = m->getline(in); m->gobble(in);
+ vector<string> pieces = m->splitWhiteSpace(line);
string group = "";
- while (!in.eof()) { //do we have a group assigned to this pair
- char c = in.get();
- if (c == 10 || c == 13 || c == -1){ break; }
- else if (c == 32 || c == 9){;} //space or tab
- else { group += c; }
- }
-
- if (group != "") {
- //line in file look like: group forward reverse
- string temp = forward;
- forward = reverse;
- reverse = group;
- group = temp;
+ if (pieces.size() == 2) {
+ forward = pieces[0];
+ reverse = pieces[1];
+ group = "";
+ findex = "";
+ rindex = "";
+ }else if (pieces.size() == 3) {
+ group = pieces[0];
+ forward = pieces[1];
+ reverse = pieces[2];
+ findex = "";
+ rindex = "";
createFileGroup = true;
+ }else if (pieces.size() == 4) {
+ forward = pieces[0];
+ reverse = pieces[1];
+ findex = pieces[2];
+ rindex = pieces[3];
+ if ((findex == "none") || (findex == "NONE")){ findex = ""; }
+ if ((rindex == "none") || (rindex == "NONE")){ rindex = ""; }
+ }else {
+ m->mothurOut("[ERROR]: file lines can be 2, 3, or 4 columns. The forward fastq files in the first column and their matching reverse fastq files in the second column, or a groupName then forward fastq file and reverse fastq file, or forward fastq file then reverse fastq then forward index and reverse index file. If you only have one index file add 'none' for the other one. \n"); m->control_pressed = true;
}
- m->gobble(in);
- if (m->debug) { m->mothurOut("[DEBUG]: group = " + group + ", forward = " + forward + ", reverse = " + reverse + ".\n"); }
+ if (m->debug) { m->mothurOut("[DEBUG]: group = " + group + ", forward = " + forward + ", reverse = " + reverse + ", forwardIndex = " + findex + ", reverseIndex = " + rindex + ".\n"); }
//check to make sure both are able to be opened
ifstream in2;
m->mothurOut("[WARNING]: can't find " + reverse + ", ignoring pair.\n");
}else{ in3.close(); }
- if ((openForward != 1) && (openReverse != 1)) { //good pair
+ int openFindex = 0;
+ if (findex != "") {
+ ifstream in4;
+ openFindex = m->openInputFile(findex, in4, "noerror"); in4.close();
+
+ //if you can't open it, try default location
+ if (openFindex == 1) {
+ if (m->getDefaultPath() != "") { //default path is set
+ string tryPath = m->getDefaultPath() + m->getSimpleName(findex);
+ m->mothurOut("Unable to open " + findex + ". Trying default " + tryPath); m->mothurOutEndLine();
+ ifstream in5;
+ openFindex = m->openInputFile(tryPath, in5, "noerror");
+ in5.close();
+ findex = tryPath;
+ }
+ }
+
+ //if you can't open it, try output location
+ if (openFindex == 1) {
+ if (m->getOutputDir() != "") { //default path is set
+ string tryPath = m->getOutputDir() + m->getSimpleName(findex);
+ m->mothurOut("Unable to open " + findex + ". Trying output directory " + tryPath); m->mothurOutEndLine();
+ ifstream in6;
+ openFindex = m->openInputFile(tryPath, in6, "noerror");
+ findex = tryPath;
+ in6.close();
+ }
+ }
+
+ if (openFindex == 1) { //can't find it
+ m->mothurOut("[WARNING]: can't find " + findex + ", ignoring pair.\n");
+ }
+ }
+
+ int openRindex = 0;
+ if (rindex != "") {
+ ifstream in7;
+ openRindex = m->openInputFile(rindex, in7, "noerror"); in7.close();
+
+ //if you can't open it, try default location
+ if (openRindex == 1) {
+ if (m->getDefaultPath() != "") { //default path is set
+ string tryPath = m->getDefaultPath() + m->getSimpleName(rindex);
+ m->mothurOut("Unable to open " + rindex + ". Trying default " + tryPath); m->mothurOutEndLine();
+ ifstream in8;
+ openRindex = m->openInputFile(tryPath, in8, "noerror");
+ in8.close();
+ rindex = tryPath;
+ }
+ }
+
+ //if you can't open it, try output location
+ if (openRindex == 1) {
+ if (m->getOutputDir() != "") { //default path is set
+ string tryPath = m->getOutputDir() + m->getSimpleName(rindex);
+ m->mothurOut("Unable to open " + rindex + ". Trying output directory " + tryPath); m->mothurOutEndLine();
+ ifstream in9;
+ openRindex = m->openInputFile(tryPath, in9, "noerror");
+ rindex = tryPath;
+ in9.close();
+ }
+ }
+
+ if (openRindex == 1) { //can't find it
+ m->mothurOut("[WARNING]: can't find " + rindex + ", ignoring pair.\n");
+ }
+ }
+
+
+ if ((openForward != 1) && (openReverse != 1) && (openFindex != 1) && (openRindex != 1)) { //good pair
file2Group[files.size()] = group;
vector<string> pair;
pair.push_back(forward);
pair.push_back(reverse);
+ pair.push_back(findex);
+ pair.push_back(rindex);
+ if (((findex != "") || (rindex != "")) && (oligosfile == "")) { m->mothurOut("[ERROR]: You need to provide an oligos file if you are going to use an index file.\n"); m->control_pressed = true; }
files.push_back(pair);
}
}
return files;
}
catch(exception& e) {
- m->errorOut(e, "MakeContigsCommand", "checkReads");
+ m->errorOut(e, "MakeContigsCommand", "readFileNames");
exit(1);
}
}
oligosPair newPrimer(foligo, roligo);
if (m->debug) { m->mothurOut("[DEBUG]: primer pair " + newPrimer.forward + " " + newPrimer.reverse + ", and group = " + group + ".\n"); }
-
+
//check for repeat barcodes
string tempPair = foligo+roligo;
if (uniquePrimers.count(tempPair) != 0) { m->mothurOut("primer pair " + newPrimer.forward + " " + newPrimer.reverse + " is in your oligos file already."); m->mothurOutEndLine(); }
oligosPair newPair(foligo, roligo);
+ if ((foligo == "NONE") || (roligo == "NONE")) { if (!noneOk) { m->mothurOut("[ERROR]: barcodes must be paired unless you are using an index file.\n"); m->control_pressed = true; } }
+
group = "";
while (!in.eof()) {
char c = in.get();
struct pairFastqRead {
fastqRead forward;
fastqRead reverse;
+ fastqRead findex;
+ fastqRead rindex;
pairFastqRead() {};
pairFastqRead(fastqRead f, fastqRead r) : forward(f), reverse(r){};
+ pairFastqRead(fastqRead f, fastqRead r, fastqRead fi, fastqRead ri) : forward(f), reverse(r), findex(fi), rindex(ri) {};
~pairFastqRead() {};
};
/**************************************************************************************************/
void help() { m->mothurOut(getHelpString()); }
private:
- bool abort, allFiles, trimOverlap, createFileGroup, createOligosGroup, makeCount;
- string outputDir, ffastqfile, rfastqfile, align, oligosfile, rfastafile, ffastafile, rqualfile, fqualfile, file, format;
+ bool abort, allFiles, trimOverlap, createFileGroup, createOligosGroup, makeCount, noneOk;
+ string outputDir, ffastqfile, rfastqfile, align, oligosfile, rfastafile, ffastafile, rqualfile, fqualfile, findexfile, rindexfile, file, format;
float match, misMatch, gapOpen, gapExtend;
int processors, longestBase, insert, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, deltaq;
vector<string> outputNames;
fastqRead readFastq(ifstream&, bool&);
vector< vector< vector<string> > > preProcessData(unsigned long int&);
vector< vector<string> > readFileNames(string);
- vector< vector<string> > readFastqFiles(unsigned long int&, string, string);
+ vector< vector<string> > readFastqFiles(unsigned long int&, string, string, string, string);
vector< vector<string> > readFastaFiles(unsigned long int&, string, string);
//bool checkReads(fastqRead&, fastqRead&, string, string);
int createProcesses(vector< vector<string> >, string, string, string, vector<vector<string> >, int);
int driver(vector<string>, string, string, string, vector<vector<string> >, int, string);
bool getOligos(vector<vector<string> >&, string);
string reverseOligo(string);
- vector<pairFastqRead> getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map<string, fastqRead>& uniques);
+ vector<pairFastqRead> getReads(bool ignoref, bool ignorer, fastqRead forward, fastqRead reverse, map<string, fastqRead>& uniques, bool);
+ vector<pairFastqRead> mergeReads(vector<pairFastqRead> frReads, vector<pairFastqRead> friReads, map<string, pairFastqRead>& pairUniques);
};
/**************************************************************************************************/
string thisfqualfile = pDataArray->files[1];
string thisrfastafile = pDataArray->files[2];
string thisrqualfile = pDataArray->files[3];
+ string thisfindexfile = pDataArray->files[4];
+ string thisrindexfile = pDataArray->files[5];
- if (pDataArray->m->debug) { pDataArray->m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n"); }
+ if (pDataArray->m->debug) { pDataArray->m->mothurOut("[DEBUG]: ffasta = " + thisffastafile + ".\n[DEBUG]: fqual = " + thisfqualfile + ".\n[DEBUG]: rfasta = " + thisrfastafile + ".\n[DEBUG]: rqual = " + thisrqualfile + ".\n[DEBUG]: findex = " + thisfindexfile + ".\n[DEBUG]: rindex = " + thisrindexfile + ".\n"); }
if(pDataArray->allFiles){
for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
}
}
- ifstream inFFasta, inRFasta, inFQual, inRQual;
+ ifstream inFFasta, inRFasta, inFQual, inRQual, inFIndex, inRIndex;
ofstream outFasta, outMisMatch, outScrapFasta;
pDataArray->m->openInputFile(thisffastafile, inFFasta);
pDataArray->m->openInputFile(thisrfastafile, inRFasta);
pDataArray->m->openInputFile(thisfqualfile, inFQual);
pDataArray->m->openInputFile(thisrqualfile, inRQual);
}
+
+ if (thisfindexfile != "") { pDataArray->m->openInputFile(thisfindexfile, inFIndex); }
+ if (thisrindexfile != "") { pDataArray->m->openInputFile(thisrindexfile, inRIndex); }
+
pDataArray->m->openOutputFile(pDataArray->outputFasta, outFasta);
pDataArray->m->openOutputFile(pDataArray->outputMisMatches, outMisMatch);
pDataArray->m->openOutputFile(pDataArray->outputScrapFasta, outScrapFasta);
rQual = new QualityScores(inRQual); pDataArray->m->gobble(inRQual);
}
+ Sequence findexBarcode("findex", "NONE"); Sequence rindexBarcode("rindex", "NONE");
+ if (thisfindexfile != "") {
+ Sequence temp(inFIndex); pDataArray->m->gobble(inFIndex);
+ findexBarcode.setAligned(temp.getAligned());
+ }
+
+ if (thisrindexfile != "") {
+ Sequence temp(inRIndex); pDataArray->m->gobble(inRIndex);
+ rindexBarcode.setAligned(temp.getAligned());
+ }
+
int barcodeIndex = 0;
int primerIndex = 0;
if(pDataArray->barcodes.size() != 0){
if (thisfqualfile != "") {
- success = trimOligos.stripBarcode(fSeq, rSeq, *fQual, *rQual, barcodeIndex);
+ if ((thisfindexfile != "") || (thisrindexfile != "")) {
+ success = trimOligos.stripBarcode(findexBarcode, rindexBarcode, *fQual, *rQual, barcodeIndex);
+ }else {
+ success = trimOligos.stripBarcode(fSeq, rSeq, *fQual, *rQual, barcodeIndex);
+ }
}else {
success = trimOligos.stripBarcode(fSeq, rSeq, barcodeIndex);
}
CYGWIN_BUILD ?= no
USECOMPRESSION ?= no
MOTHUR_FILES="\"Enter_your_default_path_here\""
-RELEASE_DATE = "\"5/29/2013\""
-VERSION = "\"1.31.1\""
+RELEASE_DATE = "\"8/19/2013\""
+VERSION = "\"1.31.2\""
FORTAN_COMPILER = gfortran
FORTRAN_FLAGS =
#TARGET_ARCH += -m64 -static
#if you are a linux user use the following line
- #CXXFLAGS += -mtune=native -march=native -m64
+ #CXXFLAGS += -mtune=native -march=native
CXXFLAGS += -DBIT_VERSION
- FORTRAN_FLAGS = -m64
endif
designMap = new DesignMap(designfile);
vector<string> categories = designMap->getNamesOfCategories();
+ if (categories.size() > 3) { m->mothurOut("\n[NOTE]: LEfSe input files allow for a class, subclass and subject. More than 3 categories can cause formatting errors.\n\n"); }
+
for (int j = 0; j < categories.size(); j++) {
out << categories[j] << "\t";
- for (int i = 0; i < lookup.size(); i++) {
+ for (int i = 0; i < lookup.size()-1; i++) {
if (m->control_pressed) { out.close(); delete designMap; return 0; }
string value = designMap->get(lookup[i]->getGroup(), categories[j]);
if (value == "not found") {
m->mothurOut("[ERROR]: " + lookup[i]->getGroup() + " is not in your design file, please correct.\n"); m->control_pressed = true;
}else { out << value << '\t'; }
}
+ string value = designMap->get(lookup[lookup.size()-1]->getGroup(), categories[j]);
+ if (value == "not found") {
+ m->mothurOut("[ERROR]: " + lookup[lookup.size()-1]->getGroup() + " is not in your design file, please correct.\n"); m->control_pressed = true;
+ }else { out << value; }
out << endl;
}
}
out << "group\t";
- for (int i = 0; i < lookup.size(); i++) { out << lookup[i]->getGroup() << '\t'; }
- out << endl;
+ for (int i = 0; i < lookup.size()-1; i++) { out << lookup[i]->getGroup() << '\t'; }
+ out << lookup[lookup.size()-1]->getGroup() << endl;
for (int i = 0; i < lookup[0]->getNumBins(); i++) { //process each otu
if (m->control_pressed) { break; }
out << nameOfOtu << '\t';
//print out relabunds for each otu
- for (int j = 0; j < lookup.size(); j++) { out << lookup[j]->getAbundance(i) << '\t'; }
- out << endl;
+ for (int j = 0; j < lookup.size()-1; j++) { out << lookup[j]->getAbundance(i) << '\t'; }
+
+ out << lookup[lookup.size()-1]->getAbundance(i)<< endl;
}
out.close();
}
}
/***********************************************************************/
-
+string MothurOut::findEdianness() {
+ try {
+ // find real endian type
+ unsigned char EndianTest[2] = {1,0};
+ short x = *(short *)EndianTest;
+
+ string endianType = "unknown";
+ if(x == 1) { endianType = "BIG_ENDIAN"; }
+ else { endianType = "LITTLE_ENDIAN"; }
+
+ return endianType;
+ }
+ catch(exception& e) {
+ errorOut(e, "MothurOut", "findEdianness");
+ exit(1);
+ }
+}
+/***********************************************************************/
+double MothurOut::median(vector<double> x) {
+ try {
+ double value = 0.0;
+
+ if (x.size() == 0) { } //error
+ else {
+ //For example, if a < b < c, then the median of the list {a, b, c} is b, and, if a < b < c < d, then the median of the list {a, b, c, d} is the mean of b and c; i.e., it is (b + c)/2.
+ sort(x.begin(), x.end());
+ //is x.size even?
+ if ((x.size()%2) == 0) { //size() is even. median = average of 2 midpoints
+ int midIndex1 = (x.size()/2)-1;
+ int midIndex2 = (x.size()/2);
+ value = (x[midIndex1]+ x[midIndex2]) / 2.0;
+ }else {
+ int midIndex = (x.size()/2);
+ value = x[midIndex];
+ }
+ }
+ return value;
+ }
+ catch(exception& e) {
+ errorOut(e, "MothurOut", "median");
+ exit(1);
+ }
+}
+/***********************************************************************/
int MothurOut::factorial(int num){
try {
int total = 1;
return stdDev;
}
catch(exception& e) {
- errorOut(e, "MothurOut", "getAverages");
+ errorOut(e, "MothurOut", "getStandardDeviation");
exit(1);
}
}
}
}
/**************************************************************************************************/
+// returns largest value in vector
+double MothurOut::max(vector<double>& featureVector){
+ try {
+ if (featureVector.size() == 0) { mothurOut("[ERROR]: vector size = 0!\n"); control_pressed=true; return 0.0; }
+
+ //finds largest
+ double largest = featureVector[0];
+ for (int i = 1; i < featureVector.size(); i++) {
+ if (featureVector[i] > largest) { largest = featureVector[i]; }
+ }
+
+ return largest;
+ }
+ catch(exception& e) {
+ errorOut(e, "MothurOut", "max");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+// returns smallest value in vector
+double MothurOut::min(vector<double>& featureVector){
+ try {
+ if (featureVector.size() == 0) { mothurOut("[ERROR]: vector size = 0!\n"); control_pressed=true; return 0.0; }
+
+ //finds smallest
+ double smallest = featureVector[0];
+ for (int i = 1; i < featureVector.size(); i++) {
+ if (featureVector[i] < smallest) { smallest = featureVector[i]; }
+ }
+
+ return smallest;
+ }
+ catch(exception& e) {
+ errorOut(e, "MothurOut", "min");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
bool isTrue(string);
bool isContainingOnlyDigits(string);
bool isNumeric1(string);
+ string findEdianness();
//string manipulation
map<string, vector<string> > parseClasses(string);
//math operation
+ double max(vector<double>&); //returns largest value in vector
+ double min(vector<double>&); //returns smallest value in vector
int factorial(int num);
vector<vector<double> > binomial(int);
float ceilDist(float, int);
float roundDist(float, int);
unsigned int fromBase36(string);
+ double median(vector<double>);
int getRandomIndex(int); //highest
double getStandardDeviation(vector<int>&);
vector<double> getStandardDeviation(vector< vector<double> >&);
alignment->align(seqI.getUnaligned(), seqJ.getUnaligned());
seqI.setAligned(alignment->getSeqAAln());
seqJ.setAligned(alignment->getSeqBAln());
-
- //cout << seqI.getName() << '\t' << seqJ.getName() << endl;
- //cout << alignment->getSeqAAln() << endl << alignment->getSeqBAln() << endl;
distCalculator->calcDist(seqI, seqJ);
double dist = distCalculator->getDist();
- //cout << "dist = " << dist << endl;
+ if (m->debug) { m->mothurOut("[DEBUG]: " + seqI.getName() + '\t' + alignment->getSeqAAln() + '\n' + seqJ.getName() + alignment->getSeqBAln() + '\n' + "distance = " + toString(dist) + "\n"); }
if(dist <= cutoff){
if (output == "column") { outFile << alignDB.get(i).getName() << ' ' << alignDB.get(j).getName() << ' ' << dist << endl; }
distCalculator->calcDist(seqI, seqJ);
double dist = distCalculator->getDist();
- outFile << dist << '\t';
+ outFile << dist << '\t';
+
+ if (m->debug) { m->mothurOut("[DEBUG]: " + seqI.getName() + '\t' + alignment->getSeqAAln() + '\n' + seqJ.getName() + alignment->getSeqBAln() + '\n' + "distance = " + toString(dist) + "\n"); }
}
outFile << endl;
distCalculator->calcDist(seqI, seqJ);
double dist = distCalculator->getDist();
+
+ if (m->debug) { cout << ("[DEBUG]: " + seqI.getName() + '\t' + alignment->getSeqAAln() + '\n' + seqJ.getName() + alignment->getSeqBAln() + '\n' + "distance = " + toString(dist) + "\n"); }
if(dist <= cutoff){
outputString += (alignDB.get(i).getName() + ' ' + alignDB.get(j).getName() + ' ' + toString(dist) + '\n');
distCalculator->calcDist(seqI, seqJ);
double dist = distCalculator->getDist();
+
+ if (m->debug) { cout << ("[DEBUG]: " + seqI.getName() + '\t' + alignment->getSeqAAln() + '\n' + seqJ.getName() + alignment->getSeqBAln() + '\n' + "distance = " + toString(dist) + "\n"); }
outputString += toString(dist) + "\t";
}
distCalculator->calcDist(seqI, seqJ);
double dist = distCalculator->getDist();
- outputString += toString(dist) + "\t";
+ outputString += toString(dist) + "\t";
+
+ if (m->debug) { cout << ("[DEBUG]: " + seqI.getName() + '\t' + alignment->getSeqAAln() + '\n' + seqJ.getName() + alignment->getSeqBAln() + '\n' + "distance = " + toString(dist) + "\n"); }
}
outputString += "\n";
distCalculator->calcDist(seqI, seqJ);
double dist = distCalculator->getDist();
+ if (pDataArray->m->debug) { pDataArray->m->mothurOut("[DEBUG]: " + seqI.getName() + '\t' + alignment->getSeqAAln() + '\n' + seqJ.getName() + alignment->getSeqBAln() + '\n' + "distance = " + toString(dist) + "\n"); }
+
outFile << dist << '\t';
}
distCalculator->calcDist(seqI, seqJ);
double dist = distCalculator->getDist();
+ if (pDataArray->m->debug) { pDataArray->m->mothurOut("[DEBUG]: " + seqI.getName() + '\t' + alignment->getSeqAAln() + '\n' + seqJ.getName() + alignment->getSeqBAln() + '\n' + "distance = " + toString(dist) + "\n"); }
+
if(dist <= pDataArray->cutoff){
if (pDataArray->output == "column") { outFile << pDataArray->alignDB.get(i).getName() << ' ' << pDataArray->alignDB.get(j).getName() << ' ' << dist << endl; }
}
while(qFile.peek() != '>' && qFile.peek() != EOF){
if (m->control_pressed) { break; }
string temp = m->getline(qFile); m->gobble(qFile);
- if (m->debug) { m->mothurOut("[DEBUG]: scores = '" + temp + "'\n."); }
+ //if (m->debug) { m->mothurOut("[DEBUG]: scores = '" + temp + "'\n."); }
qScoreString += ' ' + temp;
}
//cout << "done reading " << endl;
string temp;
qScoreStringStream >> temp; m->gobble(qScoreStringStream);
- if (m->debug) { m->mothurOut("[DEBUG]: score " + toString(qScores.size()) + " = '" + temp + "'\n."); }
+ //if (m->debug) { m->mothurOut("[DEBUG]: score " + toString(qScores.size()) + " = '" + temp + "'\n."); }
//check temp to make sure its a number
if (!m->isContainingOnlyDigits(temp)) { m->mothurOut("[ERROR]: In sequence " + seqName + "'s quality scores, expected a number and got " + temp + ", setting score to 0."); m->mothurOutEndLine(); temp = "0"; }
double meanstartPosition, meanendPosition, meanseqLength, meanambigBases, meanlongHomoPolymer;
- meanstartPosition /= (double) size; meanendPosition /= (double) size; meanlongHomoPolymer /= (double) size; meanseqLength /= (double) size; meanambigBases /= (double) size;
+ meanstartPosition = meanStartPosition / (double) size; meanendPosition = meanEndPosition /(double) size; meanlongHomoPolymer = meanLongHomoPolymer / (double) size; meanseqLength = meanSeqLength / (double) size; meanambigBases = meanAmbigBases /(double) size;
int ptile0_25 = int(size * 0.025);
int ptile25 = int(size * 0.250);
if (m->control_pressed) { count = 0; break; }
if (count >= header.numReads) { break; }
+ //if (count >= 100) { break; }
}
//report progress
char buffer4 [4];
in.read(buffer4, 4);
header.numReads = be_int4(*(unsigned int *)(&buffer4));
+
+ if (m->debug) { m->mothurOut("[DEBUG]: numReads = " + toString(header.numReads) + "\n"); }
//read header length
char buffer5 [2];
for (int i = 0; i < filehandlesHeaders.size(); i++) {
for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
ofstream out;
- m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+ m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out);
out.write(mybuffer, in.gcount());
out.close();
}
for (int i = 0; i < filehandlesHeaders.size(); i++) {
for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
ofstream out;
- m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+ m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out);
out.write(mybuffer, in.gcount());
out.close();
}
thisbuffer[6] = (offset >> 8) & 0xFF;
thisbuffer[7] = offset & 0xFF;
ofstream out;
- m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+ m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out);
out.write(thisbuffer, 8);
out.close();
}
for (int i = 0; i < filehandlesHeaders.size(); i++) {
for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
ofstream out;
- m->openOutputFileAppend(filehandlesHeaders[i][j], out);
- int offset = 0;
+ m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out);
+ unsigned int offset = 0;
char* thisbuffer = new char[4];
thisbuffer[0] = (offset >> 24) & 0xFF;
thisbuffer[1] = (offset >> 16) & 0xFF;
for (int i = 0; i < filehandlesHeaders.size(); i++) {
for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
ofstream out;
- m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+ m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out);
//convert number of reads to 4 byte char*
char* thisbuffer = new char[4];
- thisbuffer[0] = (numSplitReads[i][j] >> 24) & 0xFF;
- thisbuffer[1] = (numSplitReads[i][j] >> 16) & 0xFF;
- thisbuffer[2] = (numSplitReads[i][j] >> 8) & 0xFF;
- thisbuffer[3] = numSplitReads[i][j] & 0xFF;
+ if ((m->findEdianness()) == "BIG_ENDIAN") {
+ thisbuffer[0] = (numSplitReads[i][j] >> 24) & 0xFF;
+ thisbuffer[1] = (numSplitReads[i][j] >> 16) & 0xFF;
+ thisbuffer[2] = (numSplitReads[i][j] >> 8) & 0xFF;
+ thisbuffer[3] = numSplitReads[i][j] & 0xFF;
+ }else {
+ thisbuffer[0] = numSplitReads[i][j] & 0xFF;
+ thisbuffer[1] = (numSplitReads[i][j] >> 8) & 0xFF;
+ thisbuffer[2] = (numSplitReads[i][j] >> 16) & 0xFF;
+ thisbuffer[3] = (numSplitReads[i][j] >> 24) & 0xFF;
+ }
out.write(thisbuffer, 4);
out.close();
delete[] thisbuffer;
for (int i = 0; i < filehandlesHeaders.size(); i++) {
for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
ofstream out;
- m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+ m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out);
out.write(mybuffer, in.gcount());
out.close();
}
for (int i = 0; i < filehandlesHeaders.size(); i++) {
for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
ofstream out;
- m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+ m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out);
out.write(mybuffer, in.gcount());
out.close();
}
for (int i = 0; i < filehandlesHeaders.size(); i++) {
for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
ofstream out;
- m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+ m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out);
out.write(mybuffer, in.gcount());
out.close();
}
for (int i = 0; i < filehandlesHeaders.size(); i++) {
for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
ofstream out;
- m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+ m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out);
out.write(mybuffer, in.gcount());
out.close();
}
for (int i = 0; i < filehandlesHeaders.size(); i++) {
for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
ofstream out;
- m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+ m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out);
out.write(mybuffer, in.gcount());
out.close();
}
for (int i = 0; i < filehandlesHeaders.size(); i++) {
for (int j = 0; j < filehandlesHeaders[i].size(); j++) {
ofstream out;
- m->openOutputFileAppend(filehandlesHeaders[i][j], out);
+ m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out);
out.write(mybuffer, in.gcount());
out.close();
}
//read header length
char buffer [2];
- in.read(buffer, 2);
+ in.read(buffer, 2);
header.headerLength = be_int2(*(unsigned short *)(&buffer));
//read name length
char buffer3 [4];
in.read(buffer3, 4);
header.numBases = be_int4(*(unsigned int *)(&buffer3));
+
//read clip qual left
char buffer4 [2];
in.read(buffer4, 2);
header.clipQualLeft = be_int2(*(unsigned short *)(&buffer4));
- header.clipQualLeft = 5;
+ header.clipQualLeft = 5;
+
//read clip qual right
char buffer5 [2];
in.read(buffer5, 2);
header.clipQualRight = be_int2(*(unsigned short *)(&buffer5));
+
//read clipAdapterLeft
char buffer6 [2];
in.read(buffer6, 2);
header.clipAdapterLeft = be_int2(*(unsigned short *)(&buffer6));
+
//read clipAdapterRight
char buffer7 [2];
in.read(buffer7, 2);
header.clipAdapterRight = be_int2(*(unsigned short *)(&buffer7));
+
//read name
char* tempBuffer = new char[header.nameLength];
in.read(&(*tempBuffer), header.nameLength);
header.name = tempBuffer;
if (header.name.length() > header.nameLength) { header.name = header.name.substr(0, header.nameLength); }
+
delete[] tempBuffer;
//extract info from name
int trashCodeLength = findGroup(header, read, barcodeIndex, primerIndex);
if(trashCodeLength == 0){
+ //cout << header.name << " length = " << spot << '\t' << startSpotInFile << '\t' << in2.gcount() << endl;
+
ofstream out;
m->openOutputFileBinaryAppend(filehandles[barcodeIndex][primerIndex], out);
out.write(mybuffer, in2.gcount());
out.close();
- delete[] mybuffer;
numSplitReads[barcodeIndex][primerIndex]++;
}
else{
m->openOutputFileBinaryAppend(noMatchFile, out);
out.write(mybuffer, in2.gcount());
out.close();
- delete[] mybuffer;
}
-
+ delete[] mybuffer;
}
}else{
m->mothurOut("Error reading."); m->mothurOutEndLine();
}
filehandles[itBar->second][itPrimer->second] = thisFilename;
- m->openOutputFile(thisFilename, temp); temp.close();
+ temp.open(thisFilename.c_str(), ios::binary); temp.close();
}
}
}
if (m->control_pressed) { break; }
int index = 0;
- int base = 0;
if(nSeqsPerOTU[i] > 0){
- qualities[i].assign(1024, -1);
while(index < numFlowCells){
double maxPrValue = 1e8;
value = (int)floor(temp);
if(value > 100){ value = 100; }
- qualities[i][base] = (int)value;
- base++;
+ qualities[i].push_back((int)value);
}
}
if(otuCounts[i] > 0){
qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
-
- int j=4; //need to get past the first four bases
- while(qualities[i][j] != -1){
- qualityFile << qualities[i][j] << ' ';
- j++;
- }
+ for (int j = 4; j < qualities[i].size(); j++) { qualityFile << qualities[i][j] << ' '; }
qualityFile << endl;
}
}
m->mothurOut("\nFinalizing...\n");
fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
+ if (m->debug) { m->mothurOut("[DEBUG]: done fill().\n"); }
+
if (m->control_pressed) { break; }
setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
+ if (m->debug) { m->mothurOut("[DEBUG]: done setOTUs().\n"); }
+
if (m->control_pressed) { break; }
vector<int> otuCounts(numOTUs, 0);
for(int j=0;j<numSeqs;j++) { otuCounts[otuData[j]]++; }
- calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
+ calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
+
+ if (m->debug) { m->mothurOut("[DEBUG]: done calcCentroidsDriver().\n"); }
if (m->control_pressed) { break; }
if (m->control_pressed) { break; }
int index = 0;
- int base = 0;
-
+
if(nSeqsPerOTU[i] > 0){
- qualities[i].assign(1024, -1);
while(index < numFlowCells){
+
double maxPrValue = 1e8;
short maxPrIndex = -1;
double count = 0.0000;
pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
}
}
-
+
maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
maxPrValue = pr[maxPrIndex];
for(int s=0;s<HOMOPS;s++){
norm += exp(-(pr[s] - maxPrValue));
}
-
+
for(int s=1;s<=maxPrIndex;s++){
int value = 0;
double temp = 0.0000;
value = (int)floor(temp);
if(value > 100){ value = 100; }
- qualities[i][base] = (int)value;
- base++;
+ qualities[i].push_back((int)value);
}
- }
+ }//end if
index++;
- }
- }
-
+
+ }//end while
+
+ }//end if
+
if(otuCounts[i] > 0){
qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
-
- int j=4; //need to get past the first four bases
- while(qualities[i][j] != -1){
- qualityFile << qualities[i][j] << ' ';
- if (j > qualities[i].size()) { break; }
- j++;
- }
+ //need to get past the first four bases
+ for (int j = 4; j < qualities[i].size(); j++) { qualityFile << qualities[i][j] << ' '; }
qualityFile << endl;
}
- }
+ }//end for
qualityFile.close();
outputNames.push_back(qualityFileName); outputTypes["qfile"].push_back(qualityFileName);
for (int i = 0; i < thisSeqsGroups.size(); i++) {
if (m->inUsersGroups(thisSeqsGroups[i], Groups)) { //if this sequence belongs to a group we want them print
seq.printSequence(*(ffiles[thisSeqsGroups[i]]));
- int numSeqs = ct.getGroupCount(seq.getName(), Groups[i]);
+ int numSeqs = ct.getGroupCount(seq.getName(), thisSeqsGroups[i]);
(*(cfiles[thisSeqsGroups[i]])) << seq.getName() << '\t' << numSeqs << '\t' << numSeqs << endl;
}
}
else{
itGroupCounts = groupCounts.find(group);
if (itGroupCounts != groupCounts.end()) {
- if (groupCounts[group] < size) { subset.insert(names[j]); groupCounts[group]++; }
+ if (itGroupCounts->second < size) { subset.insert(names[j]); (itGroupCounts->second)++; }
}
}
}
primers[""] = 0;
primerNameVector.push_back("");
}
-
-
+
outFlowFileNames.resize(barcodeNameVector.size());
for(int i=0;i<outFlowFileNames.size();i++){
outFlowFileNames[i].assign(primerNameVector.size(), "");
temp = temp.substr(0,alnLength);
int numDiff = countDiffs(oligo, temp);
+ if (m->debug) { m->mothurOut("[DEBUG]: " + seq.getName() + " aligned fragment =" + temp + ", barcode =" + oligo + ", numDiffs = " + toString(numDiff) + "\n"); }
+
if(numDiff < minDiff){
minDiff = numDiff;
minCount = 1;
temp = temp.substr(0,alnLength);
int numDiff = countDiffs(oligo, temp);
+ if (m->debug) { m->mothurOut("[DEBUG]: forward " + forwardSeq.getName() + " aligned fragment=" + temp + ", barcode=" + oligo + ", numDiffs=" + toString(numDiff) + ".\n"); }
+
if (alnLength == 0) { numDiff = bdiffs + 100; }
//cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
oligo = oligo.substr(0,alnLength);
temp = temp.substr(0,alnLength);
int numDiff = countDiffs(oligo, temp);
+
+ if (m->debug) { m->mothurOut("[DEBUG]: reverse " + forwardSeq.getName() + " aligned fragment=" + temp + ", barcode=" + oligo + ", numDiffs=" + toString(numDiff) + ".\n"); }
+
if (alnLength == 0) { numDiff = bdiffs + 100; }
//cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
int numDiff = countDiffs(oligo, temp);
if (alnLength == 0) { numDiff = bdiffs + 100; }
- //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
+ if (m->debug) { m->mothurOut("[DEBUG]: forward " + forwardSeq.getName() + " aligned fragment=" + temp + ", barcode=" + oligo + ", numDiffs=" + toString(numDiff) + ".\n"); }
if(numDiff < minDiff){
minDiff = numDiff;
int numDiff = countDiffs(oligo, temp);
if (alnLength == 0) { numDiff = bdiffs + 100; }
+ if (m->debug) { m->mothurOut("[DEBUG]: reverse " + reverseSeq.getName() + " aligned fragment=" + temp + ", barcode=" + oligo + ", numDiffs=" + toString(numDiff) + ".\n"); }
+
//cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
if(numDiff < minDiff){
minDiff = numDiff;
if (alnLength == 0) { numDiff = pdiffs + 100; }
//cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
+ if (m->debug) { m->mothurOut("[DEBUG]: forward " + forwardSeq.getName() + " aligned fragment=" + temp + ", primer=" + oligo + ", numDiffs=" + toString(numDiff) + ".\n"); }
+
if(numDiff < minDiff){
minDiff = numDiff;
minCount = 1;
int numDiff = countDiffs(oligo, temp);
if (alnLength == 0) { numDiff = pdiffs + 100; }
+ if (m->debug) { m->mothurOut("[DEBUG]: reverse " + forwardSeq.getName() + " aligned fragment=" + temp + ", primer=" + oligo + ", numDiffs=" + toString(numDiff) + ".\n"); }
+
//cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
if(numDiff < minDiff){
minDiff = numDiff;
temp = temp.substr(0,alnLength);
int numDiff = countDiffs(oligo, temp);
+ if (m->debug) { m->mothurOut("[DEBUG]: forward " + forwardSeq.getName() + " aligned fragment=" + temp + ", primer=" + oligo + ", numDiffs=" + toString(numDiff) + ".\n"); }
+
if (alnLength == 0) { numDiff = pdiffs + 100; }
//cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
oligo = oligo.substr(0,alnLength);
temp = temp.substr(0,alnLength);
int numDiff = countDiffs(oligo, temp);
+
+ if (m->debug) { m->mothurOut("[DEBUG]: reverse " + forwardSeq.getName() + " aligned fragment=" + temp + ", primer=" + oligo + ", numDiffs=" + toString(numDiff) + ".\n"); }
+
if (alnLength == 0) { numDiff = pdiffs + 100; }
//cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
int numDiff = countDiffs(oligo, temp);
+ if (m->debug) { m->mothurOut("[DEBUG]: " + seq.getName() + " aligned fragment =" + temp + ", barcode =" + oligo + ", numDiffs = " + toString(numDiff) + "\n"); }
+
if(numDiff < minDiff){
minDiff = numDiff;
minCount = 1;
float rcumul = 1.0000;
//this loop fills the cumulative maps and put 0.0000 in the score freq map to make it easier to print.
- for (map<float,float>::iterator it = validScores.begin(); it != validScores.end(); it++) {
+ for (map<float,float>::iterator it = validScores.begin(); it != validScores.end(); it++) {
//make rscoreFreq map and rCumul
map<float,float>::iterator it2 = rscoreFreq[a].find(it->first);
rCumul[a][it->first] = rcumul;
for(int a = 0; a < numComp; a++) {
output->initFile(groupComb[a], tags);
//print each line
- for (map<float,float>::iterator it = validScores.begin(); it != validScores.end(); it++) {
+ for (map<double,double>::iterator it = validScores.begin(); it != validScores.end(); it++) {
data.push_back(it->first); data.push_back(rScoreFreq[a][it->first]); data.push_back(rCumul[a][it->first]);
output->output(data);
data.clear();
for (int f = 0; f < numComp; f++) {
for (int i = 0; i < rScores[f].size(); i++) { //looks like 0,0,1,1,1,2,4,7... you want to make a map that say rScoreFreq[0] = 2, rScoreFreq[1] = 3...
validScores[rScores[f][i]] = rScores[f][i];
- map<float,float>::iterator it = rScoreFreq[f].find(rScores[f][i]);
+ map<double,double>::iterator it = rScoreFreq[f].find(rScores[f][i]);
if (it != rScoreFreq[f].end()) {
rScoreFreq[f][rScores[f][i]]++;
}else{
for(int a = 0; a < numComp; a++) {
float rcumul = 1.0000;
//this loop fills the cumulative maps and put 0.0000 in the score freq map to make it easier to print.
- for (map<float,float>::iterator it = validScores.begin(); it != validScores.end(); it++) {
+ for (map<double,double>::iterator it = validScores.begin(); it != validScores.end(); it++) {
//make rscoreFreq map and rCumul
- map<float,float>::iterator it2 = rScoreFreq[a].find(it->first);
+ map<double,double>::iterator it2 = rScoreFreq[a].find(it->first);
rCumul[a][it->first] = rcumul;
//get percentage of random trees with that info
if (it2 != rScoreFreq[a].end()) { rScoreFreq[a][it->first] /= iters; rcumul-= it2->second; }
int iters, numGroups, numComp, counter;
vector< vector<double> > rScores; //vector<weighted scores for random trees.> each group comb has an entry
vector< vector<double> > uScores; //vector<weighted scores for user trees.> each group comb has an entry
- vector< map<float, float> > rScoreFreq; //map <weighted score, number of random trees with that score.> -vector entry for each combination.
- vector< map<float, float> > rCumul; //map <weighted score, cumulative percentage of number of random trees with that score or higher.> -vector entry for each c
- map<float, float> validScores; //map contains scores from random
+ vector< map<double, double> > rScoreFreq; //map <weighted score, number of random trees with that score.> -vector entry for each combination.
+ vector< map<double, double> > rCumul; //map <weighted score, cumulative percentage of number of random trees with that score or higher.> -vector entry for each c
+ map<double, double> validScores; //map contains scores from random
bool abort, phylip, random, includeRoot, subsample, consensus;
string groups, itersString, outputForm, treefile, groupfile, namefile, countfile;