From 605ab6fa594317a38f0df7bb6797740c735b2348 Mon Sep 17 00:00:00 2001 From: westcott Date: Fri, 7 Jan 2011 18:18:59 +0000 Subject: [PATCH] finished chimera.slayer adding trim parameter, added persample parameter to sub.sample command and create linearalgebra class --- Mothur.xcodeproj/project.pbxproj | 6 + chimeraslayer.cpp | 6 +- chimeraslayer.h | 2 - chimeraslayercommand.cpp | 1 + linearalgebra.cpp | 238 +++++++++++++++++++++++++++ linearalgebra.h | 33 ++++ pcoacommand.cpp | 229 +------------------------- pcoacommand.h | 6 +- subsamplecommand.cpp | 271 +++++++++++++++++++++---------- subsamplecommand.h | 2 +- 10 files changed, 473 insertions(+), 321 deletions(-) create mode 100644 linearalgebra.cpp create mode 100644 linearalgebra.h diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index e7e8183..20b0e48 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -280,6 +280,7 @@ A7E9B98D12D37EC400DA6239 /* weighted.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B87C12D37EC400DA6239 /* weighted.cpp */; }; A7E9B98E12D37EC400DA6239 /* weightedlinkage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B87E12D37EC400DA6239 /* weightedlinkage.cpp */; }; A7E9B98F12D37EC400DA6239 /* whittaker.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B87F12D37EC400DA6239 /* whittaker.cpp */; }; + A7FC480E12D788F20055BC5C /* linearalgebra.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7FC480D12D788F20055BC5C /* linearalgebra.cpp */; }; /* End PBXBuildFile section */ /* Begin PBXCopyFilesBuildPhase section */ @@ -860,6 +861,8 @@ A7E9B87E12D37EC400DA6239 /* weightedlinkage.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = weightedlinkage.cpp; sourceTree = SOURCE_ROOT; }; A7E9B87F12D37EC400DA6239 /* whittaker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = whittaker.cpp; sourceTree = ""; }; A7E9B88012D37EC400DA6239 /* whittaker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = whittaker.h; sourceTree = ""; }; + A7FC480C12D788F20055BC5C /* linearalgebra.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = linearalgebra.h; sourceTree = ""; }; + A7FC480D12D788F20055BC5C /* linearalgebra.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = linearalgebra.cpp; sourceTree = ""; }; C6A0FF2C0290799A04C91782 /* mothur.1 */ = {isa = PBXFileReference; lastKnownFileType = text.man; path = mothur.1; sourceTree = ""; }; /* End PBXFileReference section */ @@ -930,6 +933,8 @@ A7E9B72D12D37EC400DA6239 /* inputdata.cpp */, A7E9B73912D37EC400DA6239 /* libshuff.cpp */, A7E9B73A12D37EC400DA6239 /* libshuff.h */, + A7FC480C12D788F20055BC5C /* linearalgebra.h */, + A7FC480D12D788F20055BC5C /* linearalgebra.cpp */, A7E9BA5612D39BD800DA6239 /* metastats */, A7E9B75B12D37EC400DA6239 /* mothur.cpp */, A7E9B75C12D37EC400DA6239 /* mothur.h */, @@ -1856,6 +1861,7 @@ A7E9B98E12D37EC400DA6239 /* weightedlinkage.cpp in Sources */, A7E9B98F12D37EC400DA6239 /* whittaker.cpp in Sources */, A70332B712D3A13400761E33 /* makefile in Sources */, + A7FC480E12D788F20055BC5C /* linearalgebra.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/chimeraslayer.cpp b/chimeraslayer.cpp index 020b8a6..38f7abe 100644 --- a/chimeraslayer.cpp +++ b/chimeraslayer.cpp @@ -419,7 +419,7 @@ void ChimeraSlayer::printHeader(ostream& out) { Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc) { try { Sequence* trim = NULL; - if (trimChimera) { trim = new Sequence(trimQuery->getName(), trimQuery->getAligned()); } + if (trimChimera) { trim = trimQuery; } if (chimeraFlags == "yes") { string chimeraFlag = "no"; @@ -472,7 +472,7 @@ Sequence* ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) { string outputString = ""; Sequence* trim = NULL; - if (trimChimera) { trim = new Sequence(trimQuery->getName(), trimQuery->getAligned()); } + if (trimChimera) { trim = trimQuery; } if (chimeraFlags == "yes") { string chimeraFlag = "no"; @@ -546,7 +546,7 @@ Sequence* ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) { //*************************************************************************************************************** int ChimeraSlayer::getChimeras(Sequence* query) { try { - trimQuery = query; + if (trimChimera) { trimQuery = new Sequence(query->getName(), query->getAligned()); } chimeraFlags = "no"; diff --git a/chimeraslayer.h b/chimeraslayer.h index 1e03d92..87439a3 100644 --- a/chimeraslayer.h +++ b/chimeraslayer.h @@ -15,8 +15,6 @@ #include "maligner.h" #include "slayer.h" - - //***********************************************************************/ //This class was modeled after the chimeraSlayer written by the Broad Institute /***********************************************************************/ diff --git a/chimeraslayercommand.cpp b/chimeraslayercommand.cpp index 39483b1..bd9ff4d 100644 --- a/chimeraslayercommand.cpp +++ b/chimeraslayercommand.cpp @@ -690,6 +690,7 @@ int ChimeraSlayerCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_Fil if (trim) { string outputString = ">" + trimmed->getName() + "\n" + trimmed->getAligned() + "\n"; + delete trimmed; //write to accnos file int length = outputString.length(); diff --git a/linearalgebra.cpp b/linearalgebra.cpp new file mode 100644 index 0000000..e015b5f --- /dev/null +++ b/linearalgebra.cpp @@ -0,0 +1,238 @@ +/* + * linearalgebra.cpp + * mothur + * + * Created by westcott on 1/7/11. + * Copyright 2011 Schloss Lab. All rights reserved. + * + */ + +#include "linearalgebra.h" + +/*********************************************************************************************************************************/ + +inline double SIGN(const double a, const double b) +{ + return b>=0 ? (a>=0 ? a:-a) : (a>=0 ? -a:a); +} +/*********************************************************************************************************************************/ + +vector > LinearAlgebra::matrix_mult(vector > first, vector > second){ + try { + vector > product; + + int first_rows = first.size(); + int first_cols = first[0].size(); + int second_cols = second[0].size(); + + product.resize(first_rows); + for(int i=0;icontrol_pressed) { return product; } + + product[i][j] = 0.0; + for(int k=0;kerrorOut(e, "LinearAlgebra", "matrix_mult"); + exit(1); + } + +} + +/*********************************************************************************************************************************/ + +// This function is taken from Numerical Recipes in C++ by Press et al., 2nd edition, pg. 479 + +int LinearAlgebra::tred2(vector >& a, vector& d, vector& e){ + try { + double scale, hh, h, g, f; + + int n = a.size(); + + d.resize(n); + e.resize(n); + + for(int i=n-1;i>0;i--){ + int l=i-1; + h = scale = 0.0000; + if(l>0){ + for(int k=0;k= 0.0 ? -sqrt(h) : sqrt(h)); + e[i] = scale * g; + h -= f * g; + a[i][l] = f - g; + f = 0.0; + for(int j=0;jerrorOut(e, "LinearAlgebra", "tred2"); + exit(1); + } + +} +/*********************************************************************************************************************************/ + +double LinearAlgebra::pythag(double a, double b) { return(pow(a*a+b*b,0.5)); } + +/*********************************************************************************************************************************/ + +// This function is taken from Numerical Recipes in C++ by Press et al., 2nd edition, pg. 479 + +int LinearAlgebra::qtli(vector& d, vector& e, vector >& z) { + try { + int myM, i, iter; + double s, r, p, g, f, dd, c, b; + + int n = d.size(); + for(int i=1;i<=n;i++){ + e[i-1] = e[i]; + } + e[n-1] = 0.0000; + + for(int l=0;l=l;i--){ + f = s * e[i]; + b = c * e[i]; + e[i+1] = (r=pythag(f,g)); + if(r==0.0){ + d[i+1] -= p; + e[myM] = 0.0000; + break; + } + s = f / r; + c = g / r; + g = d[i+1] - p; + r = (d[i] - g) * s + 2.0 * c * b; + d[i+1] = g + ( p = s * r); + g = c * r - b; + for(int k=0;k= l) continue; + d[l] -= p; + e[l] = g; + e[myM] = 0.0; + } + } while (myM != l); + } + + int k; + for(int i=0;i= p){ + p=d[k=j]; + } + } + if(k!=i){ + d[k]=d[i]; + d[i]=p; + for(int j=0;jerrorOut(e, "LinearAlgebra", "qtli"); + exit(1); + } +} +/*********************************************************************************************************************************/ + + diff --git a/linearalgebra.h b/linearalgebra.h new file mode 100644 index 0000000..2acf5cb --- /dev/null +++ b/linearalgebra.h @@ -0,0 +1,33 @@ +#ifndef LINEARALGEBRA +#define LINEARALGEBRA + +/* + * linearalgebra.h + * mothur + * + * Created by westcott on 1/7/11. + * Copyright 2011 Schloss Lab. All rights reserved. + * + */ + +#include "mothurout.h" + +class LinearAlgebra { + +public: + LinearAlgebra() { m = MothurOut::getInstance(); } + ~LinearAlgebra() {} + + vector > matrix_mult(vector >, vector >); + int tred2(vector >&, vector&, vector&); + int qtli(vector&, vector&, vector >&); + + +private: + MothurOut* m; + + double pythag(double, double); +}; + +#endif + diff --git a/pcoacommand.cpp b/pcoacommand.cpp index cb2e758..9375144 100644 --- a/pcoacommand.cpp +++ b/pcoacommand.cpp @@ -171,14 +171,13 @@ int PCOACommand::execute(){ vector e; vector > G = D; vector > copy_G; - //int rank = D.size(); - + m->mothurOut("\nProcessing...\n\n"); for(int count=0;count<2;count++){ - recenter(offset, D, G); if (m->control_pressed) { return 0; } - tred2(G, d, e); if (m->control_pressed) { return 0; } - qtli(d, e, G); if (m->control_pressed) { return 0; } + recenter(offset, D, G); if (m->control_pressed) { return 0; } + linearCalc.tred2(G, d, e); if (m->control_pressed) { return 0; } + linearCalc.qtli(d, e, G); if (m->control_pressed) { return 0; } offset = d[d.size()-1]; if(offset > 0.0) break; } @@ -327,13 +326,6 @@ double PCOACommand::calcPearson(vector< vector >& euclidDists, vector< v } /*********************************************************************************************************************************/ -inline double SIGN(const double a, const double b) -{ - return b>=0 ? (a>=0 ? a:-a) : (a>=0 ? -a:a); -} - -/*********************************************************************************************************************************/ - void PCOACommand::get_comment(istream& f, char begin, char end){ try { char d=f.get(); @@ -444,39 +436,6 @@ void PCOACommand::read(string fname, vector& names, vector > first, vector > second, vector >& product){ - try { - int first_rows = first.size(); - int first_cols = first[0].size(); - int second_cols = second[0].size(); - - product.resize(first_rows); - for(int i=0;ierrorOut(e, "PCOACommand", "matrix_mult"); - exit(1); - } - -} - -/*********************************************************************************************************************************/ - void PCOACommand::recenter(double offset, vector > D, vector >& G){ try { int rank = D.size(); @@ -499,8 +458,8 @@ void PCOACommand::recenter(double offset, vector > D, vectorerrorOut(e, "PCOACommand", "recenter"); @@ -511,182 +470,6 @@ void PCOACommand::recenter(double offset, vector > D, vector >& a, vector& d, vector& e){ - try { - double scale, hh, h, g, f; - - int n = a.size(); - - d.resize(n); - e.resize(n); - - for(int i=n-1;i>0;i--){ - int l=i-1; - h = scale = 0.0000; - if(l>0){ - for(int k=0;k= 0.0 ? -sqrt(h) : sqrt(h)); - e[i] = scale * g; - h -= f * g; - a[i][l] = f - g; - f = 0.0; - for(int j=0;jerrorOut(e, "PCOACommand", "tred2"); - exit(1); - } - -} - -/*********************************************************************************************************************************/ - -// This function is taken from Numerical Recipes in C++ by Press et al., 2nd edition, pg. 479 - -void PCOACommand::qtli(vector& d, vector& e, vector >& z) { - try { - int m, i, iter; - double s, r, p, g, f, dd, c, b; - - int n = d.size(); - for(int i=1;i<=n;i++){ - e[i-1] = e[i]; - } - e[n-1] = 0.0000; - - for(int l=0;l=l;i--){ - f = s * e[i]; - b = c * e[i]; - e[i+1] = (r=pythag(f,g)); - if(r==0.0){ - d[i+1] -= p; - e[m] = 0.0000; - break; - } - s = f / r; - c = g / r; - g = d[i+1] - p; - r = (d[i] - g) * s + 2.0 * c * b; - d[i+1] = g + ( p = s * r); - g = c * r - b; - for(int k=0;k= l) continue; - d[l] -= p; - e[l] = g; - e[m] = 0.0; - } - } while (m != l); - } - - int k; - for(int i=0;i= p){ - p=d[k=j]; - } - } - if(k!=i){ - d[k]=d[i]; - d[i]=p; - for(int j=0;jerrorOut(e, "PCOACommand", "qtli"); - exit(1); - } -} - -/*********************************************************************************************************************************/ - void PCOACommand::output(string fnameRoot, vector name_list, vector >& G, vector d) { try { int rank = name_list.size(); diff --git a/pcoacommand.h b/pcoacommand.h index ea1738b..8bb7c1b 100644 --- a/pcoacommand.h +++ b/pcoacommand.h @@ -11,6 +11,7 @@ */ #include "command.hpp" +#include "linearalgebra.h" /*****************************************************************/ @@ -34,15 +35,12 @@ private: float cutoff, precision; vector outputNames; map > outputTypes; + LinearAlgebra linearCalc; void get_comment(istream&, char, char); int read_phylip(istream&, int, vector&, vector >&); void read(string, vector&, vector >&); - double pythag(double, double); - void matrix_mult(vector >, vector >, vector >&); void recenter(double, vector >, vector >&); - void tred2(vector >&, vector&, vector&); - void qtli(vector&, vector&, vector >&); void output(string, vector, vector >&, vector); vector< vector > calculateEuclidianDistance(vector >&, int); double calcPearson(vector >&, vector >&); diff --git a/subsamplecommand.cpp b/subsamplecommand.cpp index b8bb7ad..2a3d138 100644 --- a/subsamplecommand.cpp +++ b/subsamplecommand.cpp @@ -13,7 +13,7 @@ //********************************************************************************************************************** vector SubSampleCommand::getValidParameters(){ try { - string Array[] = {"fasta", "group", "list","shared","rabund", "name","sabund","size","groups","label","outputdir","inputdir"}; + string Array[] = {"fasta", "group", "list","shared","rabund","persample", "name","sabund","size","groups","label","outputdir","inputdir"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); return myArray; } @@ -77,7 +77,7 @@ SubSampleCommand::SubSampleCommand(string option) { else { //valid paramters for this command - string Array[] = {"fasta", "group", "list","shared","rabund", "sabund","name","size","groups","label","outputdir","inputdir"}; + string Array[] = {"fasta", "group", "list","shared","rabund","persample", "sabund","name","size","groups","label","outputdir","inputdir"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -216,6 +216,11 @@ SubSampleCommand::SubSampleCommand(string option) { string temp = validParameter.validFile(parameters, "size", false); if (temp == "not found"){ temp = "0"; } convert(temp, size); + temp = validParameter.validFile(parameters, "persample", false); if (temp == "not found"){ temp = "f"; } + persample = m->isTrue(temp); + + if (groupfile == "") { persample = false; } + if ((namefile != "") && (fastafile == "")) { m->mothurOut("You may only use a namefile with a fastafile."); m->mothurOutEndLine(); abort = true; } if ((fastafile == "") && (listfile == "") && (sabundfile == "") && (rabundfile == "") && (sharedfile == "")) { @@ -244,12 +249,14 @@ SubSampleCommand::SubSampleCommand(string option) { void SubSampleCommand::help(){ try { m->mothurOut("The sub.sample command is designed to be used as a way to normalize your data, or create a smaller set from your original set.\n"); - m->mothurOut("The sub.sample command parameters are fasta, name, list, group, rabund, sabund, shared, groups, size and label. You must provide a fasta, list, sabund, rabund or shared file as an input file.\n"); + m->mothurOut("The sub.sample command parameters are fasta, name, list, group, rabund, sabund, shared, groups, size, persample and label. You must provide a fasta, list, sabund, rabund or shared file as an input file.\n"); m->mothurOut("The namefile is only used with the fasta file, not with the listfile, because the list file should contain all sequences.\n"); m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like included. The group names are separated by dashes.\n"); m->mothurOut("The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n"); m->mothurOut("The size parameter allows you indicate the size of your subsample.\n"); - m->mothurOut("The size parameter is not set: with shared file size=number of seqs in smallest sample, with all other files, 10% of number of seqs.\n"); + m->mothurOut("The persample parameter allows you indicate you want to select subsample of the same size from each of your groups, default=false. It is only used with the list and fasta files if a groupfile is given.\n"); + m->mothurOut("persample=false will select a random set of sequences of the size you select, but the number of seqs from each group may differ.\n"); + m->mothurOut("The size parameter is not set: with shared file size=number of seqs in smallest sample, with all other files if a groupfile is given and persample=true, then size=number of seqs in smallest sample, otherwise size=10% of number of seqs.\n"); m->mothurOut("The sub.sample command should be in the following format: sub.sample(list=yourListFile, group=yourGroupFile, groups=yourGroups, label=yourLabels).\n"); m->mothurOut("Example sub.sample(list=abrecovery.fn.list, group=abrecovery.groups, groups=B-C, size=20).\n"); m->mothurOut("The default value for groups is all the groups in your groupfile, and all labels in your inputfile will be used.\n"); @@ -340,72 +347,117 @@ int SubSampleCommand::getSubSampleFasta() { outputTypes["fasta"].push_back(outputFileName); outputNames.push_back(outputFileName); //make sure that if your picked groups size is not too big - - if (pickedGroups) { - int total = 0; - for(int i = 0; i < Groups.size(); i++) { - total += groupMap->getNumSeqs(Groups[i]); + int thisSize = names.size(); + if (persample) { + if (size == 0) { //user has not set size, set size = smallest samples size + size = groupMap->getNumSeqs(Groups[0]); + for (int i = 1; i < Groups.size(); i++) { + int thisSize = groupMap->getNumSeqs(Groups[i]); + + if (thisSize < size) { size = thisSize; } + } + }else { //make sure size is not too large + int smallestSize = groupMap->getNumSeqs(Groups[0]); + for (int i = 1; i < Groups.size(); i++) { + int thisSize = groupMap->getNumSeqs(Groups[i]); + + if (thisSize < smallestSize) { smallestSize = thisSize; } + } + if (smallestSize < size) { size = smallestSize; m->mothurOut("You have selected a size that is larger than your smallest sample, using your samllest sample size, " + toString(smallestSize) + "."); m->mothurOutEndLine(); } + } + + m->mothurOut("Sampling " + toString(size) + " from each group."); m->mothurOutEndLine(); + }else { + if (pickedGroups) { + int total = 0; + for(int i = 0; i < Groups.size(); i++) { + total += groupMap->getNumSeqs(Groups[i]); + } + + if (size == 0) { //user has not set size, set size = 10% samples size + size = int (total * 0.10); + } + + if (total < size) { + if (size != 0) { + m->mothurOut("Your size is too large for the number of groups you selected. Adjusting to " + toString(int (total * 0.10)) + "."); m->mothurOutEndLine(); + } + size = int (total * 0.10); + } + + m->mothurOut("Sampling " + toString(size) + " from " + toString(total) + "."); m->mothurOutEndLine(); } if (size == 0) { //user has not set size, set size = 10% samples size - size = int (total * 0.10); + size = int (names.size() * 0.10); } - if (total < size) { - if (size != 0) { - m->mothurOut("Your size is too large for the number of groups you selected. Adjusting to " + toString(int (total * 0.10)) + "."); m->mothurOutEndLine(); - } - size = int (total * 0.10); + if (size > thisSize) { m->mothurOut("Your fasta file only contains " + toString(thisSize) + " sequences. Setting size to " + toString(thisSize) + "."); m->mothurOutEndLine(); + size = thisSize; } - m->mothurOut("Sampling " + toString(size) + " from " + toString(total) + "."); m->mothurOutEndLine(); - } - - if (size == 0) { //user has not set size, set size = 10% samples size - size = int (names.size() * 0.10); - } - - int thisSize = names.size(); - if (size > thisSize) { m->mothurOut("Your fasta file only contains " + toString(thisSize) + " sequences. Setting size to " + toString(thisSize) + "."); m->mothurOutEndLine(); - size = thisSize; + if (!pickedGroups) { m->mothurOut("Sampling " + toString(size) + " from " + toString(thisSize) + "."); m->mothurOutEndLine(); } + } - random_shuffle(names.begin(), names.end()); - if (!pickedGroups) { m->mothurOut("Sampling " + toString(size) + " from " + toString(thisSize) + "."); m->mothurOutEndLine(); } - - //randomly select a subset of those names to include in the subsample set subset; //dont want repeat sequence names added - for (int j = 0; j < size; j++) { - - if (m->control_pressed) { return 0; } - - //get random sequence to add, making sure we have not already added it - bool done = false; - int myrand; - while (!done) { - myrand = (int)((float)(rand()) / (RAND_MAX / (thisSize-1) + 1)); + if (persample) { + for (int i = 0; i < Groups.size(); i++) { - if (subset.count(names[myrand]) == 0) { + //randomly select a subset of those names from this group to include in the subsample + for (int j = 0; j < size; j++) { - if (groupfile != "") { //if there is a groupfile given fill in group info - string group = groupMap->getGroup(names[myrand]); - if (group == "not found") { m->mothurOut("[ERROR]: " + names[myrand] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } + if (m->control_pressed) { return 0; } + + //get random sequence to add, making sure we have not already added it + bool done = false; + int myrand; + while (!done) { + myrand = (int)((float)(rand()) / (RAND_MAX / (thisSize-1) + 1)); - if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups - if (m->inUsersGroups(group, Groups)) { + if (subset.count(names[myrand]) == 0) { + + string group = groupMap->getGroup(names[myrand]); + if (group == "not found") { m->mothurOut("[ERROR]: " + names[myrand] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } + + if (group == Groups[i]) { subset.insert(names[myrand]); break; } + } + } + } + } + }else { + //randomly select a subset of those names to include in the subsample + for (int j = 0; j < size; j++) { + + if (m->control_pressed) { return 0; } + + //get random sequence to add, making sure we have not already added it + bool done = false; + int myrand; + while (!done) { + myrand = (int)((float)(rand()) / (RAND_MAX / (thisSize-1) + 1)); + + if (subset.count(names[myrand]) == 0) { + + if (groupfile != "") { //if there is a groupfile given fill in group info + string group = groupMap->getGroup(names[myrand]); + if (group == "not found") { m->mothurOut("[ERROR]: " + names[myrand] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } + + if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups + if (m->inUsersGroups(group, Groups)) { + subset.insert(names[myrand]); break; + } + }else{ subset.insert(names[myrand]); break; } - }else{ + }else{ //save everyone, group subset.insert(names[myrand]); break; - } - }else{ //save everyone, group - subset.insert(names[myrand]); break; - } + } + } } - } - } - + } + } //read through fasta file outputting only the names on the subsample list ifstream in; m->openInputFile(fastafile, in); @@ -794,37 +846,59 @@ int SubSampleCommand::getSubSampleList() { } //make sure that if your picked groups size is not too big - if (pickedGroups) { - int total = 0; - for(int i = 0; i < Groups.size(); i++) { - total += groupMap->getNumSeqs(Groups[i]); - } - - if (size == 0) { //user has not set size, set size = 10% samples size - size = int (total * 0.10); - } - - if (total < size) { - m->mothurOut("Your size is too large for the number of groups you selected. Adjusting to " + toString(int (total * 0.10)) + "."); m->mothurOutEndLine(); - size = int (total * 0.10); + if (persample) { + if (size == 0) { //user has not set size, set size = smallest samples size + size = groupMap->getNumSeqs(Groups[0]); + for (int i = 1; i < Groups.size(); i++) { + int thisSize = groupMap->getNumSeqs(Groups[i]); + + if (thisSize < size) { size = thisSize; } + } + }else { //make sure size is not too large + int smallestSize = groupMap->getNumSeqs(Groups[0]); + for (int i = 1; i < Groups.size(); i++) { + int thisSize = groupMap->getNumSeqs(Groups[i]); + + if (thisSize < smallestSize) { smallestSize = thisSize; } + } + if (smallestSize < size) { size = smallestSize; m->mothurOut("You have selected a size that is larger than your smallest sample, using your samllest sample size, " + toString(smallestSize) + "."); m->mothurOutEndLine(); } } - m->mothurOut("Sampling " + toString(size) + " from " + toString(total) + "."); m->mothurOutEndLine(); + m->mothurOut("Sampling " + toString(size) + " from each group."); m->mothurOutEndLine(); }else{ - - if (size == 0) { //user has not set size, set size = 10% samples size - size = int (list->getNumSeqs() * 0.10); - } - - int thisSize = list->getNumSeqs(); - if (size > thisSize) { m->mothurOut("Your list file only contains " + toString(thisSize) + " sequences. Setting size to " + toString(thisSize) + "."); m->mothurOutEndLine(); - size = thisSize; + if (pickedGroups) { + int total = 0; + for(int i = 0; i < Groups.size(); i++) { + total += groupMap->getNumSeqs(Groups[i]); + } + + if (size == 0) { //user has not set size, set size = 10% samples size + size = int (total * 0.10); + } + + if (total < size) { + m->mothurOut("Your size is too large for the number of groups you selected. Adjusting to " + toString(int (total * 0.10)) + "."); m->mothurOutEndLine(); + size = int (total * 0.10); + } + + m->mothurOut("Sampling " + toString(size) + " from " + toString(total) + "."); m->mothurOutEndLine(); + }else{ + + if (size == 0) { //user has not set size, set size = 10% samples size + size = int (list->getNumSeqs() * 0.10); + } + + int thisSize = list->getNumSeqs(); + if (size > thisSize) { m->mothurOut("Your list file only contains " + toString(thisSize) + " sequences. Setting size to " + toString(thisSize) + "."); m->mothurOutEndLine(); + size = thisSize; + } + + m->mothurOut("Sampling " + toString(size) + " from " + toString(list->getNumSeqs()) + "."); m->mothurOutEndLine(); } - - m->mothurOut("Sampling " + toString(size) + " from " + toString(list->getNumSeqs()) + "."); m->mothurOutEndLine(); } + //fill names for (int i = 0; i < list->getNumBins(); i++) { string binnames = list->get(i); @@ -872,22 +946,43 @@ int SubSampleCommand::getSubSampleList() { } random_shuffle(names.begin(), names.end()); - + //randomly select a subset of those names to include in the subsample set subset; //dont want repeat sequence names added - for (int j = 0; j < size; j++) { - - if (m->control_pressed) { break; } - - //get random sequence to add, making sure we have not already added it - bool done = false; - int myrand; - while (!done) { - myrand = (int)((float)(rand()) / (RAND_MAX / (names.size()-1) + 1)); + if (persample) { + for (int i = 0; i < Groups.size(); i++) { - if (subset.count(names[myrand]) == 0) { subset.insert(names[myrand]); break; } + for (int j = 0; j < size; j++) { + + if (m->control_pressed) { break; } + + //get random sequence to add, making sure we have not already added it + bool done = false; + int myrand; + while (!done) { + myrand = (int)((float)(rand()) / (RAND_MAX / (names.size()-1) + 1)); + + if (subset.count(names[myrand]) == 0) { //you are not already added + if (groupMap->getGroup(names[myrand]) == Groups[i]) { subset.insert(names[myrand]); break; } + } + } + } } - } + }else{ + for (int j = 0; j < size; j++) { + + if (m->control_pressed) { break; } + + //get random sequence to add, making sure we have not already added it + bool done = false; + int myrand; + while (!done) { + myrand = (int)((float)(rand()) / (RAND_MAX / (names.size()-1) + 1)); + + if (subset.count(names[myrand]) == 0) { subset.insert(names[myrand]); break; } + } + } + } if (groupfile != "") { //write out new groupfile diff --git a/subsamplecommand.h b/subsamplecommand.h index b700506..b00defd 100644 --- a/subsamplecommand.h +++ b/subsamplecommand.h @@ -34,7 +34,7 @@ public: private: GlobalData* globaldata; - bool abort, pickedGroups, allLines; + bool abort, pickedGroups, allLines, persample; string listfile, groupfile, sharedfile, rabundfile, sabundfile, fastafile, namefile; set labels; //holds labels to be used string groups, label, outputDir; -- 2.39.2