*/
#include "matrixoutputcommand.h"
+#include "subsample.h"
//**********************************************************************************************************************
vector<string> MatrixOutputCommand::setParameters(){
try {
CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pshared);
CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
+ CommandParameter psubsample("subsample", "String", "", "", "", "", "",false,false); parameters.push_back(psubsample);
CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
CommandParameter pcalc("calc", "Multiple", "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan-kstest-sharednseqs-ochiai-anderberg-kulczynski-kulczynskicody-lennon-morisitahorn-braycurtis-whittaker-odum-canberra-structeuclidean-structchord-hellinger-manhattan-structpearson-soergel-spearman-structkulczynski-speciesprofile-hamming-structchi2-gower-memchi2-memchord-memeuclidean-mempearson", "jclass-thetayc", "", "", "",true,false); parameters.push_back(pcalc);
CommandParameter poutput("output", "Multiple", "lt-square", "lt", "", "", "",false,false); parameters.push_back(poutput);
CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
- CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
+ CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
+ CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
vector<string> myArray;
try {
string helpString = "";
ValidCalculators validCalculator;
- helpString += "The dist.shared command parameters are shared, groups, calc, output, processors and label. shared is a required, unless you have a valid current file.\n";
+ helpString += "The dist.shared command parameters are shared, groups, calc, output, processors, subsample, iters and label. shared is a required, unless you have a valid current file.\n";
helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like included used.\n";
helpString += "The group names are separated by dashes. The label parameter allows you to select what distance levels you would like distance matrices created for, and is also separated by dashes.\n";
+ helpString += "The iters parameter allows you to choose the number of times you would like to run the subsample.\n";
+ helpString += "The subsample parameter allows you to enter the size pergroup of the sample or you can set subsample=T and mothur will use the size of your smallest group.\n";
helpString += "The dist.shared command should be in the following format: dist.shared(groups=yourGroups, calc=yourCalcs, label=yourLabels).\n";
helpString += "The output parameter allows you to specify format of your distance matrix. Options are lt, and square. The default is lt.\n";
helpString += "Example dist.shared(groups=A-B-C, calc=jabund-sorabund).\n";
setParameters();
vector<string> tempOutNames;
outputTypes["phylip"] = tempOutNames;
+ outputTypes["subsample"] = tempOutNames;
}
catch(exception& e) {
m->errorOut(e, "MatrixOutputCommand", "MatrixOutputCommand");
//initialize outputTypes
vector<string> tempOutNames;
outputTypes["phylip"] = tempOutNames;
+ outputTypes["subsample"] = tempOutNames;
//if the user changes the input directory command factory will send this info to us in the output parameter
string inputDir = validParameter.validFile(parameters, "inputdir", false);
//remove citation from list of calcs
for (int i = 0; i < Estimators.size(); i++) { if (Estimators[i] == "citation") { Estimators.erase(Estimators.begin()+i); break; } }
}
-
+
+ temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; }
+ m->mothurConvert(temp, iters);
+
+ temp = validParameter.validFile(parameters, "subsample", false); if (temp == "not found") { temp = "F"; }
+ if (m->isNumeric1(temp)) { m->mothurConvert(temp, subsampleSize); subsample = true; }
+ else {
+ if (m->isTrue(temp)) { subsample = true; subsampleSize = -1; } //we will set it to smallest group later
+ else { subsample = false; }
+ }
+
+ if (subsample == false) { iters = 1; }
+
if (abort == false) {
ValidCalculators validCalculator;
lines[i].start = int (sqrt(float(i)/float(processors)) * numGroups);
lines[i].end = int (sqrt(float(i+1)/float(processors)) * numGroups);
}
+
+ if (subsample) {
+ if (subsampleSize == -1) { //user has not set size, set size = smallest samples size
+ subsampleSize = lookup[0]->getNumSeqs();
+ for (int i = 1; i < lookup.size(); i++) {
+ int thisSize = lookup[i]->getNumSeqs();
+
+ if (thisSize < subsampleSize) { subsampleSize = thisSize; }
+ }
+ }else {
+ m->clearGroups();
+ Groups.clear();
+ vector<SharedRAbundVector*> temp;
+ for (int i = 0; i < lookup.size(); i++) {
+ if (lookup[i]->getNumSeqs() < subsampleSize) {
+ m->mothurOut(lookup[i]->getGroup() + " contains " + toString(lookup[i]->getNumSeqs()) + ". Eliminating."); m->mothurOutEndLine();
+ delete lookup[i];
+ }else {
+ Groups.push_back(lookup[i]->getGroup());
+ temp.push_back(lookup[i]);
+ }
+ }
+ lookup = temp;
+ m->setGroups(Groups);
+ }
+ }
if (m->control_pressed) { delete input; for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } m->clearGroups(); return 0; }
try {
EstOutput data;
vector<SharedRAbundVector*> subset;
- vector< vector<seqDist> > calcDists; calcDists.resize(matrixCalculators.size()); //one for each calc, this will be used to make .dist files
-
+ vector< vector< vector<seqDist> > > calcDistsTotals; //each iter, one for each calc, then each groupCombos dists. this will be used to make .dist files
+
+ vector< vector<seqDist> > calcDists; calcDists.resize(matrixCalculators.size());
- if(processors == 1){
- driver(thisLookup, 0, numGroups, calcDists);
- }else{
- int process = 1;
- vector<int> processIDS;
+ for (int thisIter = 0; thisIter < iters; thisIter++) {
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
- //loop through and create all the processes you want
- while (process != processors) {
- int pid = fork();
-
- if (pid > 0) {
- processIDS.push_back(pid);
- process++;
- }else if (pid == 0){
- driver(thisLookup, lines[process].start, lines[process].end, calcDists);
-
- string tempdistFileName = m->getRootName(m->getSimpleName(sharedfile)) + toString(getpid()) + ".dist";
- ofstream outtemp;
- m->openOutputFile(tempdistFileName, outtemp);
-
- for (int i = 0; i < calcDists.size(); i++) {
- outtemp << calcDists[i].size() << endl;
-
- for (int j = 0; j < calcDists[i].size(); j++) {
- outtemp << calcDists[i][j].seq1 << '\t' << calcDists[i][j].seq2 << '\t' << calcDists[i][j].dist << endl;
- }
- }
- outtemp.close();
-
- exit(0);
- }else {
- m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
- for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
- exit(0);
- }
- }
-
- //parent do your part
- driver(thisLookup, lines[0].start, lines[0].end, calcDists);
-
- //force parent to wait until all the processes are done
- for (int i = 0; i < processIDS.size(); i++) {
- int temp = processIDS[i];
- wait(&temp);
- }
-
- for (int i = 0; i < processIDS.size(); i++) {
- string tempdistFileName = m->getRootName(m->getSimpleName(sharedfile)) + toString(processIDS[i]) + ".dist";
- ifstream intemp;
- m->openInputFile(tempdistFileName, intemp);
-
- for (int k = 0; k < calcDists.size(); k++) {
- int size = 0;
- intemp >> size; m->gobble(intemp);
-
- for (int j = 0; j < size; j++) {
- int seq1 = 0;
- int seq2 = 0;
- float dist = 1.0;
-
- intemp >> seq1 >> seq2 >> dist; m->gobble(intemp);
-
- seqDist tempDist(seq1, seq2, dist);
- calcDists[k].push_back(tempDist);
- }
- }
- intemp.close();
- m->mothurRemove(tempdistFileName);
- }
- #else
- //////////////////////////////////////////////////////////////////////////////////////////////////////
- //Windows version shared memory, so be careful when passing variables through the distSharedData struct.
- //Above fork() will clone, so memory is separate, but that's not the case with windows,
- //Taking advantage of shared memory to pass results vectors.
- //////////////////////////////////////////////////////////////////////////////////////////////////////
-
- vector<distSharedData*> pDataArray;
- DWORD dwThreadIdArray[processors-1];
- HANDLE hThreadArray[processors-1];
+ vector<SharedRAbundVector*> thisItersLookup = thisLookup;
- //Create processor worker threads.
- for( int i=1; i<processors; i++ ){
+ if (subsample) {
+ SubSample sample;
+ vector<string> tempLabels; //dont need since we arent printing the sampled sharedRabunds
+ thisItersLookup = sample.getSamplePreserve(thisLookup, tempLabels, subsampleSize);
+ }
+ cout << thisIter << endl;
+ if(processors == 1){
+ driver(thisItersLookup, 0, numGroups, calcDists);
+ }else{
+ int process = 1;
+ vector<int> processIDS;
+
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+ //loop through and create all the processes you want
+ while (process != processors) {
+ int pid = fork();
+
+ if (pid > 0) {
+ processIDS.push_back(pid);
+ process++;
+ }else if (pid == 0){
+
+ driver(thisItersLookup, lines[process].start, lines[process].end, calcDists);
+
+ string tempdistFileName = m->getRootName(m->getSimpleName(sharedfile)) + toString(getpid()) + ".dist";
+ ofstream outtemp;
+ m->openOutputFile(tempdistFileName, outtemp);
+
+ for (int i = 0; i < calcDists.size(); i++) {
+ outtemp << calcDists[i].size() << endl;
+
+ for (int j = 0; j < calcDists[i].size(); j++) {
+ outtemp << calcDists[i][j].seq1 << '\t' << calcDists[i][j].seq2 << '\t' << calcDists[i][j].dist << endl;
+ }
+ }
+ outtemp.close();
+
+ exit(0);
+ }else {
+ m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
+ for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
+ exit(0);
+ }
+ }
- //make copy of lookup so we don't get access violations
- vector<SharedRAbundVector*> newLookup;
- for (int k = 0; k < thisLookup.size(); k++) {
- SharedRAbundVector* temp = new SharedRAbundVector();
- temp->setLabel(thisLookup[k]->getLabel());
- temp->setGroup(thisLookup[k]->getGroup());
- newLookup.push_back(temp);
+ //parent do your part
+ driver(thisItersLookup, lines[0].start, lines[0].end, calcDists);
+
+ //force parent to wait until all the processes are done
+ for (int i = 0; i < processIDS.size(); i++) {
+ int temp = processIDS[i];
+ wait(&temp);
}
- //for each bin
- for (int k = 0; k < thisLookup[0]->getNumBins(); k++) {
- if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; }
- for (int j = 0; j < thisLookup.size(); j++) { newLookup[j]->push_back(thisLookup[j]->getAbundance(k), thisLookup[j]->getGroup()); }
+ for (int i = 0; i < processIDS.size(); i++) {
+ string tempdistFileName = m->getRootName(m->getSimpleName(sharedfile)) + toString(processIDS[i]) + ".dist";
+ ifstream intemp;
+ m->openInputFile(tempdistFileName, intemp);
+
+ for (int k = 0; k < calcDists.size(); k++) {
+ int size = 0;
+ intemp >> size; m->gobble(intemp);
+
+ for (int j = 0; j < size; j++) {
+ int seq1 = 0;
+ int seq2 = 0;
+ float dist = 1.0;
+
+ intemp >> seq1 >> seq2 >> dist; m->gobble(intemp);
+
+ seqDist tempDist(seq1, seq2, dist);
+ calcDists[k].push_back(tempDist);
+ }
+ }
+ intemp.close();
+ m->mothurRemove(tempdistFileName);
}
+ #else
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+ //Windows version shared memory, so be careful when passing variables through the distSharedData struct.
+ //Above fork() will clone, so memory is separate, but that's not the case with windows,
+ //Taking advantage of shared memory to pass results vectors.
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
- // Allocate memory for thread data.
- distSharedData* tempSum = new distSharedData(m, lines[i].start, lines[i].end, Estimators, newLookup);
- pDataArray.push_back(tempSum);
- processIDS.push_back(i);
+ vector<distSharedData*> pDataArray;
+ DWORD dwThreadIdArray[processors-1];
+ HANDLE hThreadArray[processors-1];
- hThreadArray[i-1] = CreateThread(NULL, 0, MyDistSharedThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
+ //Create processor worker threads.
+ for( int i=1; i<processors; i++ ){
+
+ //make copy of lookup so we don't get access violations
+ vector<SharedRAbundVector*> newLookup;
+ for (int k = 0; k < thisItersLookup.size(); k++) {
+ SharedRAbundVector* temp = new SharedRAbundVector();
+ temp->setLabel(thisItersLookup[k]->getLabel());
+ temp->setGroup(thisItersLookup[k]->getGroup());
+ newLookup.push_back(temp);
+ }
+
+ //for each bin
+ for (int k = 0; k < thisItersLookup[0]->getNumBins(); k++) {
+ if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; }
+ for (int j = 0; j < thisItersLookup.size(); j++) { newLookup[j]->push_back(thisItersLookup[j]->getAbundance(k), thisItersLookup[j]->getGroup()); }
+ }
+
+ // Allocate memory for thread data.
+ distSharedData* tempSum = new distSharedData(m, lines[i].start, lines[i].end, Estimators, newLookup);
+ pDataArray.push_back(tempSum);
+ processIDS.push_back(i);
+
+ hThreadArray[i-1] = CreateThread(NULL, 0, MyDistSharedThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
+ }
+
+ //parent do your part
+ driver(thisItersLookup, lines[0].start, lines[0].end, calcDists);
+
+ //Wait until all threads have terminated.
+ WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+
+ //Close all thread handles and free memory allocations.
+ for(int i=0; i < pDataArray.size(); i++){
+ for (int j = 0; j < pDataArray[i]->thisLookup.size(); j++) { delete pDataArray[i]->thisLookup[j]; }
+
+ for (int k = 0; k < calcDists.size(); k++) {
+ int size = pDataArray[i]->calcDists[k].size();
+ for (int j = 0; j < size; j++) { calcDists[k].push_back(pDataArray[i]->calcDists[k][j]); }
+ }
+
+ CloseHandle(hThreadArray[i]);
+ delete pDataArray[i];
+ }
+
+ #endif
}
- //parent do your part
- driver(thisLookup, lines[0].start, lines[0].end, calcDists);
-
- //Wait until all threads have terminated.
- WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+ calcDistsTotals.push_back(calcDists);
- //Close all thread handles and free memory allocations.
- for(int i=0; i < pDataArray.size(); i++){
- for (int j = 0; j < pDataArray[i]->thisLookup.size(); j++) { delete pDataArray[i]->thisLookup[j]; }
+ if (subsample) {
+ //clean up memory
+ // for (int i = 0; i < thisItersLookup.size(); i++) { delete thisItersLookup[i]; }
+ // thisItersLookup.clear();
+ }
+ }
+
+ if (iters != 1) {
+ //we need to find the average distance and standard deviation for each groups distance
+
+ vector< vector<seqDist> > calcAverages; calcAverages.resize(matrixCalculators.size());
+ for (int i = 0; i < calcAverages.size(); i++) { //initialize sums to zero.
+ calcAverages[i].resize(calcDists[i].size());
- for (int k = 0; k < calcDists.size(); k++) {
- int size = pDataArray[i]->calcDists[k].size();
- for (int j = 0; j < size; j++) { calcDists[k].push_back(pDataArray[i]->calcDists[k][j]); }
+ for (int j = 0; j < calcAverages[i].size(); j++) {
+ calcAverages[i][j].seq1 = calcDists[i][j].seq1;
+ calcAverages[i][j].seq2 = calcDists[i][j].seq2;
+ calcAverages[i][j].dist = 0.0;
+ }
+ }
+
+ for (int thisIter = 0; thisIter < iters; thisIter++) { //sum all groups dists for each calculator
+ for (int i = 0; i < calcAverages.size(); i++) { //initialize sums to zero.
+ for (int j = 0; j < calcAverages[i].size(); j++) {
+ calcAverages[i][j].dist += calcDistsTotals[thisIter][i][j].dist;
+ }
+ }
+ }
+
+ for (int i = 0; i < calcAverages.size(); i++) { //finds average.
+ for (int j = 0; j < calcAverages[i].size(); j++) {
+ calcAverages[i][j].dist /= (float) iters;
}
+ }
+
+ //find standard deviation
+ vector< vector<seqDist> > stdDev; stdDev.resize(matrixCalculators.size());
+ for (int i = 0; i < stdDev.size(); i++) { //initialize sums to zero.
+ stdDev[i].resize(calcDists[i].size());
- CloseHandle(hThreadArray[i]);
- delete pDataArray[i];
+ for (int j = 0; j < stdDev[i].size(); j++) {
+ stdDev[i][j].seq1 = calcDists[i][j].seq1;
+ stdDev[i][j].seq2 = calcDists[i][j].seq2;
+ stdDev[i][j].dist = 0.0;
+ }
+ }
+
+ for (int thisIter = 0; thisIter < iters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each
+ for (int i = 0; i < stdDev.size(); i++) {
+ for (int j = 0; j < stdDev[i].size(); j++) {
+ stdDev[i][j].dist += ((calcDistsTotals[thisIter][i][j].dist - calcAverages[i][j].dist) * (calcDistsTotals[thisIter][i][j].dist - calcAverages[i][j].dist));
+ }
+ }
}
- #endif
- }
+ for (int i = 0; i < stdDev.size(); i++) { //finds average.
+ for (int j = 0; j < stdDev[i].size(); j++) {
+ stdDev[i][j].dist /= (float) iters;
+ stdDev[i][j].dist = sqrt(stdDev[i][j].dist);
+ }
+ }
+
+ //print results
+ for (int i = 0; i < calcDists.size(); i++) {
+ vector< vector<float> > matrix; //square matrix to represent the distance
+ matrix.resize(thisLookup.size());
+ for (int k = 0; k < thisLookup.size(); k++) { matrix[k].resize(thisLookup.size(), 0.0); }
+
+ vector< vector<float> > stdmatrix; //square matrix to represent the stdDev
+ stdmatrix.resize(thisLookup.size());
+ for (int k = 0; k < thisLookup.size(); k++) { stdmatrix[k].resize(thisLookup.size(), 0.0); }
+
+
+ for (int j = 0; j < calcAverages[i].size(); j++) {
+ int row = calcAverages[i][j].seq1;
+ int column = calcAverages[i][j].seq2;
+ float dist = calcAverages[i][j].dist;
+ float stdDist = stdDev[i][j].dist;
+
+ matrix[row][column] = dist;
+ matrix[column][row] = dist;
+ stdmatrix[row][column] = stdDist;
+ stdmatrix[column][row] = stdDist;
+ }
+
+ string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + matrixCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + ".results";
+ outputNames.push_back(distFileName); outputTypes["subsample"].push_back(distFileName);
+ ofstream outDist;
+ m->openOutputFile(distFileName, outDist);
+ outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint);
+
+ outDist << "Group1\tGroup2\tAverageDist\tStdDev\n";
+ for (int m = 0; m < matrix.size(); m++) {
+ for (int n = 0; n < m; n++) {
+ outDist << lookup[m]->getGroup() << '\t' << lookup[n]->getGroup() << '\t';
+ outDist << matrix[m][n] << '\t' << stdmatrix[m][n] << endl;
+ }
+ }
+ outDist.close();
+ }
+
+ //output averages as distance matrix
+ calcDists = calcAverages;
+ }
+
+ for (int i = 0; i < calcDists.size(); i++) {
+ if (m->control_pressed) { break; }
+
+ //initialize matrix
+ vector< vector<float> > matrix; //square matrix to represent the distance
+ matrix.resize(thisLookup.size());
+ for (int k = 0; k < thisLookup.size(); k++) { matrix[k].resize(thisLookup.size(), 0.0); }
+
+ for (int j = 0; j < calcDists[i].size(); j++) {
+ int row = calcDists[i][j].seq1;
+ int column = calcDists[i][j].seq2;
+ float dist = calcDists[i][j].dist;
+
+ matrix[row][column] = dist;
+ matrix[column][row] = dist;
+ }
+
+ string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + matrixCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + "." + output + ".dist";
+ outputNames.push_back(distFileName); outputTypes["phylip"].push_back(distFileName);
+ ofstream outDist;
+ m->openOutputFile(distFileName, outDist);
+ outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint);
+
+ printSims(outDist, matrix);
+
+ outDist.close();
+ }
-
-
- for (int i = 0; i < calcDists.size(); i++) {
- if (m->control_pressed) { break; }
-
- //initialize matrix
- vector< vector<float> > matrix; //square matrix to represent the distance
- matrix.resize(thisLookup.size());
- for (int k = 0; k < thisLookup.size(); k++) { matrix[k].resize(thisLookup.size(), 0.0); }
-
- for (int j = 0; j < calcDists[i].size(); j++) {
- int row = calcDists[i][j].seq1;
- int column = calcDists[i][j].seq2;
- float dist = calcDists[i][j].dist;
-
- matrix[row][column] = dist;
- matrix[column][row] = dist;
- }
-
- string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + matrixCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + "." + output + ".dist";
- outputNames.push_back(distFileName); outputTypes["phylip"].push_back(distFileName);
- ofstream outDist;
- m->openOutputFile(distFileName, outDist);
- outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint);
-
- printSims(outDist, matrix);
-
- outDist.close();
- }
return 0;
}
--- /dev/null
+//
+// subsample.cpp
+// Mothur
+//
+// Created by Sarah Westcott on 4/2/12.
+// Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "subsample.h"
+
+//**********************************************************************************************************************
+// Builds a subsampled copy of thislookup. The caller's vectors are only read;
+// they are neither modified nor deleted here.
+//
+// thislookup : one SharedRAbundVector per group. Must be non-empty, because the
+//              bin count is taken from thislookup[0].
+// newLabels  : out-parameter. On a normal return it receives the bin labels that
+//              eliminateZeroOTUS left in m->currentBinLabels for the sampled data.
+// size       : number of draws per group. The sampling loop reads the first
+//              `size` entries of the shuffled order vector, so `size` must not
+//              exceed the number of entries pushed for that group.
+//
+// Returns newly allocated vectors, one per group, owned by the caller.
+// m->currentBinLabels is put back to its value on entry before a normal return.
+// If m->control_pressed is seen, the function returns early: groups that were
+// not reached yet are still NULL in the result, newLabels is not filled in, and
+// m->currentBinLabels is not restored.
+vector<SharedRAbundVector*> SubSample::getSamplePreserve(vector<SharedRAbundVector*>& thislookup, vector<string>& newLabels, int size) {
+    try {
+
+        vector<SharedRAbundVector*> newlookup; newlookup.resize(thislookup.size(), NULL);
+
+        //save mothurOut's binLabels to restore for next label
+        vector<string> saveBinLabels = m->currentBinLabels;
+
+        int numBins = thislookup[0]->getNumBins();
+        for (int i = 0; i < thislookup.size(); i++) {
+            int thisSize = thislookup[i]->getNumSeqs();
+            string thisgroup = thislookup[i]->getGroup();
+
+            //every group gets its own output vector, so that no entry of newlookup
+            //is left NULL when eliminateZeroOTUS dereferences them below
+            SharedRAbundVector* temp = new SharedRAbundVector(numBins);
+            temp->setLabel(thislookup[i]->getLabel());
+            temp->setGroup(thisgroup);
+
+            newlookup[i] = temp;
+
+            if (thisSize != size) {
+
+                //each bin index is pushed once per unit of abundance, then shuffled
+                OrderVector order;
+                for(int p=0;p<numBins;p++){
+                    for(int j=0;j<thislookup[i]->getAbundance(p);j++){
+                        order.push_back(p);
+                    }
+                }
+                random_shuffle(order.begin(), order.end());
+
+                //the first `size` shuffled entries are the sample
+                for (int j = 0; j < size; j++) {
+
+                    if (m->control_pressed) { return newlookup; }
+
+                    int bin = order.get(j);
+
+                    int abund = newlookup[i]->getAbundance(bin);
+                    newlookup[i]->set(bin, (abund+1), thisgroup);
+                }
+            }else {
+                //group already has exactly `size` sequences: nothing to sample,
+                //but the caller still needs an independent copy of its counts
+                for (int p = 0; p < numBins; p++) {
+
+                    if (m->control_pressed) { return newlookup; }
+
+                    int abund = thislookup[i]->getAbundance(p);
+                    if (abund != 0) { newlookup[i]->set(p, abund, thisgroup); }
+                }
+            }
+        }
+
+        //subsampling may have created some otus with no sequences in them
+        eliminateZeroOTUS(newlookup);
+
+        if (m->control_pressed) { return newlookup; }
+
+        //hand the sampled labels to the caller, then restore mothurOut's binLabels for the next label
+        newLabels = m->currentBinLabels;
+        m->currentBinLabels = saveBinLabels;
+
+        return newlookup;
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SubSample", "getSamplePreserve");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Subsamples thislookup in place: every group whose sequence count differs from
+// `size` is replaced by a freshly allocated vector holding `size` draws, and the
+// vector it replaces is deleted. Bins left empty in all groups are then removed
+// through eliminateZeroOTUS.
+//
+// thislookup : one SharedRAbundVector per group; must be non-empty because the
+//              bin count is read from thislookup[0].
+// size       : number of draws per group; the draw loop reads the first `size`
+//              entries of the shuffled order vector.
+//
+// Returns the bin labels describing the subsampled data. On a normal return
+// m->currentBinLabels is reset to the value it had on entry. When
+// m->control_pressed is seen the function returns m->currentBinLabels as it
+// stands at that moment, without restoring anything.
+vector<string> SubSample::getSample(vector<SharedRAbundVector*>& thislookup, int size) {
+    try {
+
+        //labels as they were on entry; put back before the normal return
+        const vector<string> labelsOnEntry = m->currentBinLabels;
+
+        const int binCount = thislookup[0]->getNumBins();
+        for (int g = 0; g < thislookup.size(); g++) {
+
+            //groups already at the requested size are left untouched
+            if (thislookup[g]->getNumSeqs() == size) { continue; }
+
+            SharedRAbundVector* original = thislookup[g];
+            const string groupName = original->getGroup();
+
+            //each bin index is pushed once per unit of abundance, then shuffled
+            OrderVector shuffled;
+            for (int bin = 0; bin < binCount; bin++) {
+                for (int copy = 0; copy < original->getAbundance(bin); copy++) {
+                    shuffled.push_back(bin);
+                }
+            }
+            random_shuffle(shuffled.begin(), shuffled.end());
+
+            //swap in a new vector that carries the same label and group
+            SharedRAbundVector* replacement = new SharedRAbundVector(binCount);
+            replacement->setLabel(original->getLabel());
+            replacement->setGroup(original->getGroup());
+
+            delete original;
+            thislookup[g] = replacement;
+
+            //the first `size` shuffled entries make up the sample
+            for (int draw = 0; draw < size; draw++) {
+
+                if (m->control_pressed) { return m->currentBinLabels; }
+
+                const int bin = shuffled.get(draw);
+                const int soFar = replacement->getAbundance(bin);
+                replacement->set(bin, (soFar+1), groupName);
+            }
+        }
+
+        //subsampling may have left some otus with no sequences in any group
+        eliminateZeroOTUS(thislookup);
+
+        if (m->control_pressed) { return m->currentBinLabels; }
+
+        //keep the labels of the sampled data, then restore mothurOut's binLabels for the next label
+        vector<string> sampledLabels = m->currentBinLabels;
+        m->currentBinLabels = labelsOnEntry;
+
+        return sampledLabels;
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SubSample", "getSample");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Removes every bin whose abundance is zero in all groups.
+//
+// thislookup : in/out. Each entry is replaced by a newly allocated vector with
+//              the same label and group that holds only the surviving bins; the
+//              vectors passed in are deleted. Must be non-empty because the bin
+//              count is read from thislookup[0].
+//
+// Side effect: m->currentBinLabels is replaced by the labels of the surviving
+// bins. A surviving bin keeps its existing label when m->currentBinLabels has
+// one at that index; otherwise it gets "Otu" plus its 1-based number, zero
+// padded to the width of the original bin count.
+//
+// Always returns 0. If m->control_pressed is seen, the partially built vectors
+// are freed and thislookup and m->currentBinLabels are left as they were.
+int SubSample::eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup) {
+    try {
+
+        //one empty vector per group, carrying over label and group
+        vector<SharedRAbundVector*> kept;
+        for (int g = 0; g < thislookup.size(); g++) {
+            SharedRAbundVector* blank = new SharedRAbundVector();
+            blank->setLabel(thislookup[g]->getLabel());
+            blank->setGroup(thislookup[g]->getGroup());
+            kept.push_back(blank);
+        }
+
+        vector<string> keptLabels;
+        const string binCountText = toString(thislookup[0]->getNumBins());
+        const int binCount = thislookup[0]->getNumBins();
+
+        for (int bin = 0; bin < binCount; bin++) {
+
+            if (m->control_pressed) {
+                for (int g = 0; g < kept.size(); g++) { delete kept[g]; }
+                return 0;
+            }
+
+            //a bin survives as soon as one group has a non-zero count in it
+            bool occupied = false;
+            for (int g = 0; g < thislookup.size() && !occupied; g++) {
+                occupied = (thislookup[g]->getAbundance(bin) != 0);
+            }
+            if (!occupied) { continue; }
+
+            for (int g = 0; g < thislookup.size(); g++) {
+                kept[g]->push_back(thislookup[g]->getAbundance(bin), thislookup[g]->getGroup());
+            }
+
+            //if there is a bin label use it, otherwise make one
+            string label;
+            if (bin < m->currentBinLabels.size()) {
+                label = m->currentBinLabels[bin];
+            }else {
+                const string binText = toString(bin+1);
+                label = "Otu";
+                if (binText.length() < binCountText.length()) {
+                    label.append(binCountText.length() - binText.length(), '0');
+                }
+                label += binText;
+            }
+
+            keptLabels.push_back(label);
+        }
+
+        //release the old vectors and publish the filtered ones
+        for (int g = 0; g < thislookup.size(); g++) { delete thislookup[g]; }
+        thislookup.clear();
+
+        thislookup = kept;
+        m->currentBinLabels = keptLabels;
+
+        return 0;
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SubSample", "eliminateZeroOTUS");
+        exit(1);
+    }
+}
+
+
+//**********************************************************************************************************************
+
+
#include "subsamplecommand.h"
#include "sharedutilities.h"
#include "deconvolutecommand.h"
+#include "subsample.h"
//**********************************************************************************************************************
vector<string> SubSampleCommand::setParameters(){
string thisOutputDir = outputDir;
if (outputDir == "") { thisOutputDir += m->hasPath(sharedfile); }
string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile)) + thislookup[0]->getLabel() + ".subsample" + m->getExtension(sharedfile);
-
-
- ofstream out;
+
+ SubSample sample;
+ vector<string> subsampledLabels = sample.getSample(thislookup, size);
+
+ if (m->control_pressed) { return 0; }
+
+ ofstream out;
m->openOutputFile(outputFileName, out);
outputTypes["shared"].push_back(outputFileName); outputNames.push_back(outputFileName);
- int numBins = thislookup[0]->getNumBins();
- for (int i = 0; i < thislookup.size(); i++) {
- int thisSize = thislookup[i]->getNumSeqs();
-
- if (thisSize != size) {
-
- string thisgroup = thislookup[i]->getGroup();
-
- OrderVector* order = new OrderVector();
- for(int p=0;p<numBins;p++){
- for(int j=0;j<thislookup[i]->getAbundance(p);j++){
- order->push_back(p);
- }
- }
- random_shuffle(order->begin(), order->end());
-
- SharedRAbundVector* temp = new SharedRAbundVector(numBins);
- temp->setLabel(thislookup[i]->getLabel());
- temp->setGroup(thislookup[i]->getGroup());
-
- delete thislookup[i];
- thislookup[i] = temp;
-
-
- for (int j = 0; j < size; j++) {
-
- if (m->control_pressed) { delete order; out.close(); return 0; }
-
- //get random number to sample from order between 0 and thisSize-1.
- //don't need this because of the random shuffle above
- //int myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0));
-
- int bin = order->get(j);
-
- int abund = thislookup[i]->getAbundance(bin);
- thislookup[i]->set(bin, (abund+1), thisgroup);
- }
- delete order;
- }
- }
-
- //subsampling may have created some otus with no sequences in them
- eliminateZeroOTUS(thislookup);
-
- if (m->control_pressed) { out.close(); return 0; }
-
+ m->currentBinLabels = subsampledLabels;
+
thislookup[0]->printHeaders(out);
for (int i = 0; i < thislookup.size(); i++) {
out << thislookup[i]->getLabel() << '\t' << thislookup[i]->getGroup() << '\t';
thislookup[i]->print(out);
}
-
out.close();
-
- //save mothurOut's binLabels to restore for next label
+
+
+ //save mothurOut's binLabels to restore for next label
m->currentBinLabels = saveBinLabels;
return 0;
}
}
//**********************************************************************************************************************
-int SubSampleCommand::eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup) {
- try {
-
- vector<SharedRAbundVector*> newLookup;
- for (int i = 0; i < thislookup.size(); i++) {
- SharedRAbundVector* temp = new SharedRAbundVector();
- temp->setLabel(thislookup[i]->getLabel());
- temp->setGroup(thislookup[i]->getGroup());
- newLookup.push_back(temp);
- }
-
- //for each bin
- vector<string> newBinLabels;
- string snumBins = toString(thislookup[0]->getNumBins());
- for (int i = 0; i < thislookup[0]->getNumBins(); i++) {
- if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; }
-
- //look at each sharedRabund and make sure they are not all zero
- bool allZero = true;
- for (int j = 0; j < thislookup.size(); j++) {
- if (thislookup[j]->getAbundance(i) != 0) { allZero = false; break; }
- }
-
- //if they are not all zero add this bin
- if (!allZero) {
- for (int j = 0; j < thislookup.size(); j++) {
- newLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup());
- }
- //if there is a bin label use it otherwise make one
- string binLabel = "Otu";
- string sbinNumber = toString(i+1);
- if (sbinNumber.length() < snumBins.length()) {
- int diff = snumBins.length() - sbinNumber.length();
- for (int h = 0; h < diff; h++) { binLabel += "0"; }
- }
- binLabel += sbinNumber;
- if (i < m->currentBinLabels.size()) { binLabel = m->currentBinLabels[i]; }
-
- newBinLabels.push_back(binLabel);
- }
- }
-
- for (int j = 0; j < thislookup.size(); j++) { delete thislookup[j]; }
- thislookup.clear();
-
- thislookup = newLookup;
- m->currentBinLabels = newBinLabels;
-
- return 0;
-
- }
- catch(exception& e) {
- m->errorOut(e, "SubSampleCommand", "eliminateZeroOTUS");
- exit(1);
- }
-}
-
-//**********************************************************************************************************************