A7E9B98D12D37EC400DA6239 /* weighted.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B87C12D37EC400DA6239 /* weighted.cpp */; };
A7E9B98E12D37EC400DA6239 /* weightedlinkage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B87E12D37EC400DA6239 /* weightedlinkage.cpp */; };
A7E9B98F12D37EC400DA6239 /* whittaker.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B87F12D37EC400DA6239 /* whittaker.cpp */; };
+ A7FC480E12D788F20055BC5C /* linearalgebra.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7FC480D12D788F20055BC5C /* linearalgebra.cpp */; };
/* End PBXBuildFile section */
/* Begin PBXCopyFilesBuildPhase section */
A7E9B87E12D37EC400DA6239 /* weightedlinkage.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = weightedlinkage.cpp; sourceTree = SOURCE_ROOT; };
A7E9B87F12D37EC400DA6239 /* whittaker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = whittaker.cpp; sourceTree = "<group>"; };
A7E9B88012D37EC400DA6239 /* whittaker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = whittaker.h; sourceTree = "<group>"; };
+ A7FC480C12D788F20055BC5C /* linearalgebra.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = linearalgebra.h; sourceTree = "<group>"; };
+ A7FC480D12D788F20055BC5C /* linearalgebra.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = linearalgebra.cpp; sourceTree = "<group>"; };
C6A0FF2C0290799A04C91782 /* mothur.1 */ = {isa = PBXFileReference; lastKnownFileType = text.man; path = mothur.1; sourceTree = "<group>"; };
/* End PBXFileReference section */
A7E9B72D12D37EC400DA6239 /* inputdata.cpp */,
A7E9B73912D37EC400DA6239 /* libshuff.cpp */,
A7E9B73A12D37EC400DA6239 /* libshuff.h */,
+ A7FC480C12D788F20055BC5C /* linearalgebra.h */,
+ A7FC480D12D788F20055BC5C /* linearalgebra.cpp */,
A7E9BA5612D39BD800DA6239 /* metastats */,
A7E9B75B12D37EC400DA6239 /* mothur.cpp */,
A7E9B75C12D37EC400DA6239 /* mothur.h */,
A7E9B98E12D37EC400DA6239 /* weightedlinkage.cpp in Sources */,
A7E9B98F12D37EC400DA6239 /* whittaker.cpp in Sources */,
A70332B712D3A13400761E33 /* makefile in Sources */,
+ A7FC480E12D788F20055BC5C /* linearalgebra.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc) {
try {
Sequence* trim = NULL;
- if (trimChimera) { trim = new Sequence(trimQuery->getName(), trimQuery->getAligned()); }
+ if (trimChimera) { trim = trimQuery; }
if (chimeraFlags == "yes") {
string chimeraFlag = "no";
string outputString = "";
Sequence* trim = NULL;
- if (trimChimera) { trim = new Sequence(trimQuery->getName(), trimQuery->getAligned()); }
+ if (trimChimera) { trim = trimQuery; }
if (chimeraFlags == "yes") {
string chimeraFlag = "no";
//***************************************************************************************************************
int ChimeraSlayer::getChimeras(Sequence* query) {
try {
- trimQuery = query;
+ if (trimChimera) { trimQuery = new Sequence(query->getName(), query->getAligned()); }
chimeraFlags = "no";
#include "maligner.h"
#include "slayer.h"
-
-
//***********************************************************************/
//This class was modeled after the chimeraSlayer written by the Broad Institute
/***********************************************************************/
if (trim) {
string outputString = ">" + trimmed->getName() + "\n" + trimmed->getAligned() + "\n";
+ delete trimmed;
//write to accnos file
int length = outputString.length();
--- /dev/null
+/*
+ * linearalgebra.cpp
+ * mothur
+ *
+ * Created by westcott on 1/7/11.
+ * Copyright 2011 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "linearalgebra.h"
+
+/*********************************************************************************************************************************/
+
+inline double SIGN(const double a, const double b)
+{
+ return b>=0 ? (a>=0 ? a:-a) : (a>=0 ? -a:a);
+}
+/*********************************************************************************************************************************/
+
+vector<vector<double> > LinearAlgebra::matrix_mult(vector<vector<double> > first, vector<vector<double> > second){
+ try {
+ vector<vector<double> > product;
+
+ int first_rows = first.size();
+ int first_cols = first[0].size();
+ int second_cols = second[0].size();
+
+ product.resize(first_rows);
+ for(int i=0;i<first_rows;i++){
+ product[i].resize(second_cols);
+ }
+
+ for(int i=0;i<first_rows;i++){
+ for(int j=0;j<second_cols;j++){
+
+ if (m->control_pressed) { return product; }
+
+ product[i][j] = 0.0;
+ for(int k=0;k<first_cols;k++){
+ product[i][j] += first[i][k] * second[k][j];
+ }
+ }
+ }
+
+ return product;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "LinearAlgebra", "matrix_mult");
+ exit(1);
+ }
+
+}
+
+/*********************************************************************************************************************************/
+
+// This function is taken from Numerical Recipes in C++ by Press et al., 2nd edition, pg. 479
+
+int LinearAlgebra::tred2(vector<vector<double> >& a, vector<double>& d, vector<double>& e){
+ try {
+ double scale, hh, h, g, f;
+
+ int n = a.size();
+
+ d.resize(n);
+ e.resize(n);
+
+ for(int i=n-1;i>0;i--){
+ int l=i-1;
+ h = scale = 0.0000;
+ if(l>0){
+ for(int k=0;k<l+1;k++){
+ scale += fabs(a[i][k]);
+ }
+ if(scale == 0.0){
+ e[i] = a[i][l];
+ }
+ else{
+ for(int k=0;k<l+1;k++){
+ a[i][k] /= scale;
+ h += a[i][k] * a[i][k];
+ }
+ f = a[i][l];
+ g = (f >= 0.0 ? -sqrt(h) : sqrt(h));
+ e[i] = scale * g;
+ h -= f * g;
+ a[i][l] = f - g;
+ f = 0.0;
+ for(int j=0;j<l+1;j++){
+ a[j][i] = a[i][j] / h;
+ g = 0.0;
+ for(int k=0;k<j+1;k++){
+ g += a[j][k] * a[i][k];
+ }
+ for(int k=j+1;k<l+1;k++){
+ g += a[k][j] * a[i][k];
+ }
+ e[j] = g / h;
+ f += e[j] * a[i][j];
+ }
+ hh = f / (h + h);
+ for(int j=0;j<l+1;j++){
+ f = a[i][j];
+ e[j] = g = e[j] - hh * f;
+ for(int k=0;k<j+1;k++){
+ a[j][k] -= (f * e[k] + g * a[i][k]);
+ }
+ }
+ }
+ }
+ else{
+ e[i] = a[i][l];
+ }
+
+ d[i] = h;
+ }
+
+ d[0] = 0.0000;
+ e[0] = 0.0000;
+
+ for(int i=0;i<n;i++){
+ int l = i;
+ if(d[i] != 0.0){
+ for(int j=0;j<l;j++){
+ g = 0.0000;
+ for(int k=0;k<l;k++){
+ g += a[i][k] * a[k][j];
+ }
+ for(int k=0;k<l;k++){
+ a[k][j] -= g * a[k][i];
+ }
+ }
+ }
+ d[i] = a[i][i];
+ a[i][i] = 1.0000;
+ for(int j=0;j<l;j++){
+ a[j][i] = a[i][j] = 0.0;
+ }
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "LinearAlgebra", "tred2");
+ exit(1);
+ }
+
+}
+/*********************************************************************************************************************************/
+
+double LinearAlgebra::pythag(double a, double b) { return(pow(a*a+b*b,0.5)); }
+
+/*********************************************************************************************************************************/
+
+// This function is taken from Numerical Recipes in C++ by Press et al., 2nd edition, pg. 479
+
+int LinearAlgebra::qtli(vector<double>& d, vector<double>& e, vector<vector<double> >& z) {
+ try {
+ int myM, i, iter;
+ double s, r, p, g, f, dd, c, b;
+
+ int n = d.size();
+ for(int i=1;i<=n;i++){
+ e[i-1] = e[i];
+ }
+ e[n-1] = 0.0000;
+
+ for(int l=0;l<n;l++){
+ iter = 0;
+ do {
+ for(myM=l;myM<n-1;myM++){
+ dd = fabs(d[myM]) + fabs(d[myM+1]);
+ if(fabs(e[myM])+dd == dd) break;
+ }
+ if(myM != l){
+ if(iter++ == 30) cerr << "Too many iterations in tqli\n";
+ g = (d[l+1]-d[l]) / (2.0 * e[l]);
+ r = pythag(g, 1.0);
+ g = d[myM] - d[l] + e[l] / (g + SIGN(r,g));
+ s = c = 1.0;
+ p = 0.0000;
+ for(i=myM-1;i>=l;i--){
+ f = s * e[i];
+ b = c * e[i];
+ e[i+1] = (r=pythag(f,g));
+ if(r==0.0){
+ d[i+1] -= p;
+ e[myM] = 0.0000;
+ break;
+ }
+ s = f / r;
+ c = g / r;
+ g = d[i+1] - p;
+ r = (d[i] - g) * s + 2.0 * c * b;
+ d[i+1] = g + ( p = s * r);
+ g = c * r - b;
+ for(int k=0;k<n;k++){
+ f = z[k][i+1];
+ z[k][i+1] = s * z[k][i] + c * f;
+ z[k][i] = c * z[k][i] - s * f;
+ }
+ }
+ if(r == 0.00 && i >= l) continue;
+ d[l] -= p;
+ e[l] = g;
+ e[myM] = 0.0;
+ }
+ } while (myM != l);
+ }
+
+ int k;
+ for(int i=0;i<n;i++){
+ p=d[k=i];
+ for(int j=i;j<n;j++){
+ if(d[j] >= p){
+ p=d[k=j];
+ }
+ }
+ if(k!=i){
+ d[k]=d[i];
+ d[i]=p;
+ for(int j=0;j<n;j++){
+ p=z[j][i];
+ z[j][i] = z[j][k];
+ z[j][k] = p;
+ }
+ }
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "LinearAlgebra", "qtli");
+ exit(1);
+ }
+}
+/*********************************************************************************************************************************/
+
+
--- /dev/null
+#ifndef LINEARALGEBRA
+#define LINEARALGEBRA
+
+/*
+ * linearalgebra.h
+ * mothur
+ *
+ * Created by westcott on 1/7/11.
+ * Copyright 2011 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "mothurout.h"
+
+class LinearAlgebra {
+
+public:
+ LinearAlgebra() { m = MothurOut::getInstance(); }
+ ~LinearAlgebra() {}
+
+ vector<vector<double> > matrix_mult(vector<vector<double> >, vector<vector<double> >);
+ int tred2(vector<vector<double> >&, vector<double>&, vector<double>&);
+ int qtli(vector<double>&, vector<double>&, vector<vector<double> >&);
+
+
+private:
+ MothurOut* m;
+
+ double pythag(double, double);
+};
+
+#endif
+
vector<double> e;
vector<vector<double> > G = D;
vector<vector<double> > copy_G;
- //int rank = D.size();
-
+
m->mothurOut("\nProcessing...\n\n");
for(int count=0;count<2;count++){
- recenter(offset, D, G); if (m->control_pressed) { return 0; }
- tred2(G, d, e); if (m->control_pressed) { return 0; }
- qtli(d, e, G); if (m->control_pressed) { return 0; }
+ recenter(offset, D, G); if (m->control_pressed) { return 0; }
+ linearCalc.tred2(G, d, e); if (m->control_pressed) { return 0; }
+ linearCalc.qtli(d, e, G); if (m->control_pressed) { return 0; }
offset = d[d.size()-1];
if(offset > 0.0) break;
}
}
/*********************************************************************************************************************************/
-inline double SIGN(const double a, const double b)
-{
- return b>=0 ? (a>=0 ? a:-a) : (a>=0 ? -a:a);
-}
-
-/*********************************************************************************************************************************/
-
void PCOACommand::get_comment(istream& f, char begin, char end){
try {
char d=f.get();
/*********************************************************************************************************************************/
-double PCOACommand::pythag(double a, double b) { return(pow(a*a+b*b,0.5)); }
-
-/*********************************************************************************************************************************/
-
-void PCOACommand::matrix_mult(vector<vector<double> > first, vector<vector<double> > second, vector<vector<double> >& product){
- try {
- int first_rows = first.size();
- int first_cols = first[0].size();
- int second_cols = second[0].size();
-
- product.resize(first_rows);
- for(int i=0;i<first_rows;i++){
- product[i].resize(second_cols);
- }
-
- for(int i=0;i<first_rows;i++){
- for(int j=0;j<second_cols;j++){
- product[i][j] = 0.0;
- for(int k=0;k<first_cols;k++){
- product[i][j] += first[i][k] * second[k][j];
- }
- }
- }
- }
- catch(exception& e) {
- m->errorOut(e, "PCOACommand", "matrix_mult");
- exit(1);
- }
-
-}
-
-/*********************************************************************************************************************************/
-
void PCOACommand::recenter(double offset, vector<vector<double> > D, vector<vector<double> >& G){
try {
int rank = D.size();
}
}
- matrix_mult(C,A,A);
- matrix_mult(A,C,G);
+ A = linearCalc.matrix_mult(C,A);
+ G = linearCalc.matrix_mult(A,C);
}
catch(exception& e) {
m->errorOut(e, "PCOACommand", "recenter");
/*********************************************************************************************************************************/
-// This function is taken from Numerical Recipes in C++ by Press et al., 2nd edition, pg. 479
-
-void PCOACommand::tred2(vector<vector<double> >& a, vector<double>& d, vector<double>& e){
- try {
- double scale, hh, h, g, f;
-
- int n = a.size();
-
- d.resize(n);
- e.resize(n);
-
- for(int i=n-1;i>0;i--){
- int l=i-1;
- h = scale = 0.0000;
- if(l>0){
- for(int k=0;k<l+1;k++){
- scale += fabs(a[i][k]);
- }
- if(scale == 0.0){
- e[i] = a[i][l];
- }
- else{
- for(int k=0;k<l+1;k++){
- a[i][k] /= scale;
- h += a[i][k] * a[i][k];
- }
- f = a[i][l];
- g = (f >= 0.0 ? -sqrt(h) : sqrt(h));
- e[i] = scale * g;
- h -= f * g;
- a[i][l] = f - g;
- f = 0.0;
- for(int j=0;j<l+1;j++){
- a[j][i] = a[i][j] / h;
- g = 0.0;
- for(int k=0;k<j+1;k++){
- g += a[j][k] * a[i][k];
- }
- for(int k=j+1;k<l+1;k++){
- g += a[k][j] * a[i][k];
- }
- e[j] = g / h;
- f += e[j] * a[i][j];
- }
- hh = f / (h + h);
- for(int j=0;j<l+1;j++){
- f = a[i][j];
- e[j] = g = e[j] - hh * f;
- for(int k=0;k<j+1;k++){
- a[j][k] -= (f * e[k] + g * a[i][k]);
- }
- }
- }
- }
- else{
- e[i] = a[i][l];
- }
-
- d[i] = h;
- }
-
- d[0] = 0.0000;
- e[0] = 0.0000;
-
- for(int i=0;i<n;i++){
- int l = i;
- if(d[i] != 0.0){
- for(int j=0;j<l;j++){
- g = 0.0000;
- for(int k=0;k<l;k++){
- g += a[i][k] * a[k][j];
- }
- for(int k=0;k<l;k++){
- a[k][j] -= g * a[k][i];
- }
- }
- }
- d[i] = a[i][i];
- a[i][i] = 1.0000;
- for(int j=0;j<l;j++){
- a[j][i] = a[i][j] = 0.0;
- }
- }
- }
- catch(exception& e) {
- m->errorOut(e, "PCOACommand", "tred2");
- exit(1);
- }
-
-}
-
-/*********************************************************************************************************************************/
-
-// This function is taken from Numerical Recipes in C++ by Press et al., 2nd edition, pg. 479
-
-void PCOACommand::qtli(vector<double>& d, vector<double>& e, vector<vector<double> >& z) {
- try {
- int m, i, iter;
- double s, r, p, g, f, dd, c, b;
-
- int n = d.size();
- for(int i=1;i<=n;i++){
- e[i-1] = e[i];
- }
- e[n-1] = 0.0000;
-
- for(int l=0;l<n;l++){
- iter = 0;
- do {
- for(m=l;m<n-1;m++){
- dd = fabs(d[m]) + fabs(d[m+1]);
- if(fabs(e[m])+dd == dd) break;
- }
- if(m != l){
- if(iter++ == 30) cerr << "Too many iterations in tqli\n";
- g = (d[l+1]-d[l]) / (2.0 * e[l]);
- r = pythag(g, 1.0);
- g = d[m] - d[l] + e[l] / (g + SIGN(r,g));
- s = c = 1.0;
- p = 0.0000;
- for(i=m-1;i>=l;i--){
- f = s * e[i];
- b = c * e[i];
- e[i+1] = (r=pythag(f,g));
- if(r==0.0){
- d[i+1] -= p;
- e[m] = 0.0000;
- break;
- }
- s = f / r;
- c = g / r;
- g = d[i+1] - p;
- r = (d[i] - g) * s + 2.0 * c * b;
- d[i+1] = g + ( p = s * r);
- g = c * r - b;
- for(int k=0;k<n;k++){
- f = z[k][i+1];
- z[k][i+1] = s * z[k][i] + c * f;
- z[k][i] = c * z[k][i] - s * f;
- }
- }
- if(r == 0.00 && i >= l) continue;
- d[l] -= p;
- e[l] = g;
- e[m] = 0.0;
- }
- } while (m != l);
- }
-
- int k;
- for(int i=0;i<n;i++){
- p=d[k=i];
- for(int j=i;j<n;j++){
- if(d[j] >= p){
- p=d[k=j];
- }
- }
- if(k!=i){
- d[k]=d[i];
- d[i]=p;
- for(int j=0;j<n;j++){
- p=z[j][i];
- z[j][i] = z[j][k];
- z[j][k] = p;
- }
- }
- }
- }
- catch(exception& e) {
- m->errorOut(e, "PCOACommand", "qtli");
- exit(1);
- }
-}
-
-/*********************************************************************************************************************************/
-
void PCOACommand::output(string fnameRoot, vector<string> name_list, vector<vector<double> >& G, vector<double> d) {
try {
int rank = name_list.size();
*/
#include "command.hpp"
+#include "linearalgebra.h"
/*****************************************************************/
float cutoff, precision;
vector<string> outputNames;
map<string, vector<string> > outputTypes;
+ LinearAlgebra linearCalc;
void get_comment(istream&, char, char);
int read_phylip(istream&, int, vector<string>&, vector<vector<double> >&);
void read(string, vector<string>&, vector<vector<double> >&);
- double pythag(double, double);
- void matrix_mult(vector<vector<double> >, vector<vector<double> >, vector<vector<double> >&);
void recenter(double, vector<vector<double> >, vector<vector<double> >&);
- void tred2(vector<vector<double> >&, vector<double>&, vector<double>&);
- void qtli(vector<double>&, vector<double>&, vector<vector<double> >&);
void output(string, vector<string>, vector<vector<double> >&, vector<double>);
vector< vector<double> > calculateEuclidianDistance(vector<vector<double> >&, int);
double calcPearson(vector<vector<double> >&, vector<vector<double> >&);
//**********************************************************************************************************************
vector<string> SubSampleCommand::getValidParameters(){
try {
- string Array[] = {"fasta", "group", "list","shared","rabund", "name","sabund","size","groups","label","outputdir","inputdir"};
+ string Array[] = {"fasta", "group", "list","shared","rabund","persample", "name","sabund","size","groups","label","outputdir","inputdir"};
vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
return myArray;
}
else {
//valid paramters for this command
- string Array[] = {"fasta", "group", "list","shared","rabund", "sabund","name","size","groups","label","outputdir","inputdir"};
+ string Array[] = {"fasta", "group", "list","shared","rabund","persample", "sabund","name","size","groups","label","outputdir","inputdir"};
vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
OptionParser parser(option);
string temp = validParameter.validFile(parameters, "size", false); if (temp == "not found"){ temp = "0"; }
convert(temp, size);
+ temp = validParameter.validFile(parameters, "persample", false); if (temp == "not found"){ temp = "f"; }
+ persample = m->isTrue(temp);
+
+ if (groupfile == "") { persample = false; }
+
if ((namefile != "") && (fastafile == "")) { m->mothurOut("You may only use a namefile with a fastafile."); m->mothurOutEndLine(); abort = true; }
if ((fastafile == "") && (listfile == "") && (sabundfile == "") && (rabundfile == "") && (sharedfile == "")) {
void SubSampleCommand::help(){
try {
m->mothurOut("The sub.sample command is designed to be used as a way to normalize your data, or create a smaller set from your original set.\n");
- m->mothurOut("The sub.sample command parameters are fasta, name, list, group, rabund, sabund, shared, groups, size and label. You must provide a fasta, list, sabund, rabund or shared file as an input file.\n");
+ m->mothurOut("The sub.sample command parameters are fasta, name, list, group, rabund, sabund, shared, groups, size, persample and label. You must provide a fasta, list, sabund, rabund or shared file as an input file.\n");
m->mothurOut("The namefile is only used with the fasta file, not with the listfile, because the list file should contain all sequences.\n");
m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like included. The group names are separated by dashes.\n");
m->mothurOut("The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n");
m->mothurOut("The size parameter allows you indicate the size of your subsample.\n");
- m->mothurOut("The size parameter is not set: with shared file size=number of seqs in smallest sample, with all other files, 10% of number of seqs.\n");
+ m->mothurOut("The persample parameter allows you indicate you want to select subsample of the same size from each of your groups, default=false. It is only used with the list and fasta files if a groupfile is given.\n");
+ m->mothurOut("persample=false will select a random set of sequences of the size you select, but the number of seqs from each group may differ.\n");
+ m->mothurOut("The size parameter is not set: with shared file size=number of seqs in smallest sample, with all other files if a groupfile is given and persample=true, then size=number of seqs in smallest sample, otherwise size=10% of number of seqs.\n");
m->mothurOut("The sub.sample command should be in the following format: sub.sample(list=yourListFile, group=yourGroupFile, groups=yourGroups, label=yourLabels).\n");
m->mothurOut("Example sub.sample(list=abrecovery.fn.list, group=abrecovery.groups, groups=B-C, size=20).\n");
m->mothurOut("The default value for groups is all the groups in your groupfile, and all labels in your inputfile will be used.\n");
outputTypes["fasta"].push_back(outputFileName); outputNames.push_back(outputFileName);
//make sure that if your picked groups size is not too big
-
- if (pickedGroups) {
- int total = 0;
- for(int i = 0; i < Groups.size(); i++) {
- total += groupMap->getNumSeqs(Groups[i]);
+ int thisSize = names.size();
+ if (persample) {
+ if (size == 0) { //user has not set size, set size = smallest samples size
+ size = groupMap->getNumSeqs(Groups[0]);
+ for (int i = 1; i < Groups.size(); i++) {
+ int thisSize = groupMap->getNumSeqs(Groups[i]);
+
+ if (thisSize < size) { size = thisSize; }
+ }
+ }else { //make sure size is not too large
+ int smallestSize = groupMap->getNumSeqs(Groups[0]);
+ for (int i = 1; i < Groups.size(); i++) {
+ int thisSize = groupMap->getNumSeqs(Groups[i]);
+
+ if (thisSize < smallestSize) { smallestSize = thisSize; }
+ }
+ if (smallestSize < size) { size = smallestSize; m->mothurOut("You have selected a size that is larger than your smallest sample, using your samllest sample size, " + toString(smallestSize) + "."); m->mothurOutEndLine(); }
+ }
+
+ m->mothurOut("Sampling " + toString(size) + " from each group."); m->mothurOutEndLine();
+ }else {
+ if (pickedGroups) {
+ int total = 0;
+ for(int i = 0; i < Groups.size(); i++) {
+ total += groupMap->getNumSeqs(Groups[i]);
+ }
+
+ if (size == 0) { //user has not set size, set size = 10% samples size
+ size = int (total * 0.10);
+ }
+
+ if (total < size) {
+ if (size != 0) {
+ m->mothurOut("Your size is too large for the number of groups you selected. Adjusting to " + toString(int (total * 0.10)) + "."); m->mothurOutEndLine();
+ }
+ size = int (total * 0.10);
+ }
+
+ m->mothurOut("Sampling " + toString(size) + " from " + toString(total) + "."); m->mothurOutEndLine();
}
if (size == 0) { //user has not set size, set size = 10% samples size
- size = int (total * 0.10);
+ size = int (names.size() * 0.10);
}
- if (total < size) {
- if (size != 0) {
- m->mothurOut("Your size is too large for the number of groups you selected. Adjusting to " + toString(int (total * 0.10)) + "."); m->mothurOutEndLine();
- }
- size = int (total * 0.10);
+ if (size > thisSize) { m->mothurOut("Your fasta file only contains " + toString(thisSize) + " sequences. Setting size to " + toString(thisSize) + "."); m->mothurOutEndLine();
+ size = thisSize;
}
- m->mothurOut("Sampling " + toString(size) + " from " + toString(total) + "."); m->mothurOutEndLine();
- }
-
- if (size == 0) { //user has not set size, set size = 10% samples size
- size = int (names.size() * 0.10);
- }
-
- int thisSize = names.size();
- if (size > thisSize) { m->mothurOut("Your fasta file only contains " + toString(thisSize) + " sequences. Setting size to " + toString(thisSize) + "."); m->mothurOutEndLine();
- size = thisSize;
+ if (!pickedGroups) { m->mothurOut("Sampling " + toString(size) + " from " + toString(thisSize) + "."); m->mothurOutEndLine(); }
+
}
-
random_shuffle(names.begin(), names.end());
- if (!pickedGroups) { m->mothurOut("Sampling " + toString(size) + " from " + toString(thisSize) + "."); m->mothurOutEndLine(); }
-
- //randomly select a subset of those names to include in the subsample
set<string> subset; //dont want repeat sequence names added
- for (int j = 0; j < size; j++) {
-
- if (m->control_pressed) { return 0; }
-
- //get random sequence to add, making sure we have not already added it
- bool done = false;
- int myrand;
- while (!done) {
- myrand = (int)((float)(rand()) / (RAND_MAX / (thisSize-1) + 1));
+ if (persample) {
+ for (int i = 0; i < Groups.size(); i++) {
- if (subset.count(names[myrand]) == 0) {
+ //randomly select a subset of those names from this group to include in the subsample
+ for (int j = 0; j < size; j++) {
- if (groupfile != "") { //if there is a groupfile given fill in group info
- string group = groupMap->getGroup(names[myrand]);
- if (group == "not found") { m->mothurOut("[ERROR]: " + names[myrand] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; }
+ if (m->control_pressed) { return 0; }
+
+ //get random sequence to add, making sure we have not already added it
+ bool done = false;
+ int myrand;
+ while (!done) {
+ myrand = (int)((float)(rand()) / (RAND_MAX / (thisSize-1) + 1));
- if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups
- if (m->inUsersGroups(group, Groups)) {
+ if (subset.count(names[myrand]) == 0) {
+
+ string group = groupMap->getGroup(names[myrand]);
+ if (group == "not found") { m->mothurOut("[ERROR]: " + names[myrand] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; }
+
+ if (group == Groups[i]) { subset.insert(names[myrand]); break; }
+ }
+ }
+ }
+ }
+ }else {
+ //randomly select a subset of those names to include in the subsample
+ for (int j = 0; j < size; j++) {
+
+ if (m->control_pressed) { return 0; }
+
+ //get random sequence to add, making sure we have not already added it
+ bool done = false;
+ int myrand;
+ while (!done) {
+ myrand = (int)((float)(rand()) / (RAND_MAX / (thisSize-1) + 1));
+
+ if (subset.count(names[myrand]) == 0) {
+
+ if (groupfile != "") { //if there is a groupfile given fill in group info
+ string group = groupMap->getGroup(names[myrand]);
+ if (group == "not found") { m->mothurOut("[ERROR]: " + names[myrand] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; }
+
+ if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups
+ if (m->inUsersGroups(group, Groups)) {
+ subset.insert(names[myrand]); break;
+ }
+ }else{
subset.insert(names[myrand]); break;
}
- }else{
+ }else{ //save everyone, group
subset.insert(names[myrand]); break;
- }
- }else{ //save everyone, group
- subset.insert(names[myrand]); break;
- }
+ }
+ }
}
- }
- }
-
+ }
+ }
//read through fasta file outputting only the names on the subsample list
ifstream in;
m->openInputFile(fastafile, in);
}
//make sure that if your picked groups size is not too big
- if (pickedGroups) {
- int total = 0;
- for(int i = 0; i < Groups.size(); i++) {
- total += groupMap->getNumSeqs(Groups[i]);
- }
-
- if (size == 0) { //user has not set size, set size = 10% samples size
- size = int (total * 0.10);
- }
-
- if (total < size) {
- m->mothurOut("Your size is too large for the number of groups you selected. Adjusting to " + toString(int (total * 0.10)) + "."); m->mothurOutEndLine();
- size = int (total * 0.10);
+ if (persample) {
+ if (size == 0) { //user has not set size, set size = smallest samples size
+ size = groupMap->getNumSeqs(Groups[0]);
+ for (int i = 1; i < Groups.size(); i++) {
+ int thisSize = groupMap->getNumSeqs(Groups[i]);
+
+ if (thisSize < size) { size = thisSize; }
+ }
+ }else { //make sure size is not too large
+ int smallestSize = groupMap->getNumSeqs(Groups[0]);
+ for (int i = 1; i < Groups.size(); i++) {
+ int thisSize = groupMap->getNumSeqs(Groups[i]);
+
+ if (thisSize < smallestSize) { smallestSize = thisSize; }
+ }
+ if (smallestSize < size) { size = smallestSize; m->mothurOut("You have selected a size that is larger than your smallest sample, using your samllest sample size, " + toString(smallestSize) + "."); m->mothurOutEndLine(); }
}
- m->mothurOut("Sampling " + toString(size) + " from " + toString(total) + "."); m->mothurOutEndLine();
+ m->mothurOut("Sampling " + toString(size) + " from each group."); m->mothurOutEndLine();
}else{
-
- if (size == 0) { //user has not set size, set size = 10% samples size
- size = int (list->getNumSeqs() * 0.10);
- }
-
- int thisSize = list->getNumSeqs();
- if (size > thisSize) { m->mothurOut("Your list file only contains " + toString(thisSize) + " sequences. Setting size to " + toString(thisSize) + "."); m->mothurOutEndLine();
- size = thisSize;
+ if (pickedGroups) {
+ int total = 0;
+ for(int i = 0; i < Groups.size(); i++) {
+ total += groupMap->getNumSeqs(Groups[i]);
+ }
+
+ if (size == 0) { //user has not set size, set size = 10% samples size
+ size = int (total * 0.10);
+ }
+
+ if (total < size) {
+ m->mothurOut("Your size is too large for the number of groups you selected. Adjusting to " + toString(int (total * 0.10)) + "."); m->mothurOutEndLine();
+ size = int (total * 0.10);
+ }
+
+ m->mothurOut("Sampling " + toString(size) + " from " + toString(total) + "."); m->mothurOutEndLine();
+ }else{
+
+ if (size == 0) { //user has not set size, set size = 10% samples size
+ size = int (list->getNumSeqs() * 0.10);
+ }
+
+ int thisSize = list->getNumSeqs();
+ if (size > thisSize) { m->mothurOut("Your list file only contains " + toString(thisSize) + " sequences. Setting size to " + toString(thisSize) + "."); m->mothurOutEndLine();
+ size = thisSize;
+ }
+
+ m->mothurOut("Sampling " + toString(size) + " from " + toString(list->getNumSeqs()) + "."); m->mothurOutEndLine();
}
-
- m->mothurOut("Sampling " + toString(size) + " from " + toString(list->getNumSeqs()) + "."); m->mothurOutEndLine();
}
+ //fill names
for (int i = 0; i < list->getNumBins(); i++) {
string binnames = list->get(i);
}
random_shuffle(names.begin(), names.end());
-
+
//randomly select a subset of those names to include in the subsample
set<string> subset; //dont want repeat sequence names added
- for (int j = 0; j < size; j++) {
-
- if (m->control_pressed) { break; }
-
- //get random sequence to add, making sure we have not already added it
- bool done = false;
- int myrand;
- while (!done) {
- myrand = (int)((float)(rand()) / (RAND_MAX / (names.size()-1) + 1));
+ if (persample) {
+ for (int i = 0; i < Groups.size(); i++) {
- if (subset.count(names[myrand]) == 0) { subset.insert(names[myrand]); break; }
+ for (int j = 0; j < size; j++) {
+
+ if (m->control_pressed) { break; }
+
+ //get random sequence to add, making sure we have not already added it
+ bool done = false;
+ int myrand;
+ while (!done) {
+ myrand = (int)((float)(rand()) / (RAND_MAX / (names.size()-1) + 1));
+
+ if (subset.count(names[myrand]) == 0) { //you are not already added
+ if (groupMap->getGroup(names[myrand]) == Groups[i]) { subset.insert(names[myrand]); break; }
+ }
+ }
+ }
}
- }
+ }else{
+ for (int j = 0; j < size; j++) {
+
+ if (m->control_pressed) { break; }
+
+ //get random sequence to add, making sure we have not already added it
+ bool done = false;
+ int myrand;
+ while (!done) {
+ myrand = (int)((float)(rand()) / (RAND_MAX / (names.size()-1) + 1));
+
+ if (subset.count(names[myrand]) == 0) { subset.insert(names[myrand]); break; }
+ }
+ }
+ }
if (groupfile != "") {
//write out new groupfile
private:
GlobalData* globaldata;
- bool abort, pickedGroups, allLines;
+ bool abort, pickedGroups, allLines, persample;
string listfile, groupfile, sharedfile, rabundfile, sabundfile, fastafile, namefile;
set<string> labels; //holds labels to be used
string groups, label, outputDir;