From: westcott Date: Fri, 5 Nov 2010 20:02:24 +0000 (+0000) Subject: fixed metastats, added resize to cluster.classic, added code to kill children if... X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=89f19f9c6ab89c2f6c7c6921a328ae87bce6f8e3 fixed metastats, added resize to cluster.classic, added code to kill children if parent is unable to spawn and dies. --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 1c282d9..855dad8 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -1124,7 +1124,6 @@ }; buildConfigurationList = 1DEB919308733D9F0010E9CD /* Build configuration list for PBXProject "Mothur" */; compatibilityVersion = "Xcode 3.0"; - developmentRegion = English; hasScannedForEncodings = 1; knownRegions = ( English, diff --git a/clusterclassic.cpp b/clusterclassic.cpp index 41c1647..0e48690 100644 --- a/clusterclassic.cpp +++ b/clusterclassic.cpp @@ -226,7 +226,7 @@ int ClusterClassic::readPhylipFile(string filename, NameAssignment* nameMap) { //sets smallCol and smallRow, returns distance double ClusterClassic::getSmallCell() { try { - + smallDist = aboveCutoff; smallRow = 1; smallCol = 0; @@ -279,6 +279,10 @@ void ClusterClassic::clusterBins(){ rabund->set(smallRow, rabund->get(smallRow)+rabund->get(smallCol)); rabund->set(smallCol, 0); + for (int i = smallCol+1; i < rabund->size(); i++) { + rabund->set((i-1), rabund->get(i)); + } + rabund->resize((rabund->size()-1)); rabund->setLabel(toString(smallDist)); // cout << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol) << endl; @@ -296,6 +300,10 @@ void ClusterClassic::clusterNames(){ list->set(smallRow, list->get(smallRow)+','+list->get(smallCol)); list->set(smallCol, ""); + for (int i = smallCol+1; i < list->size(); i++) { + list->set((i-1), list->get(i)); + } + list->resize((list->size()-1)); list->setLabel(toString(smallDist)); // cout << '\t' << list->get(smallRow) << '\t' << list->get(smallCol) << endl; @@ -308,36 +316,21 @@ void ClusterClassic::clusterNames(){ /***********************************************************************/ void ClusterClassic::update(double& cutOFF){ try { -//cout << "before update " << endl; //print(); getSmallCell(); int r, c; r = smallRow; c = smallCol; - //because we only store lt, we need to make sure we grab the right location - //if (smallRow < smallCol) { c = smallRow; r = smallCol; } //smallRow is really our column value - //else { r = smallRow; c = smallCol; } //smallRow is the row value - - //reset rows smallest distance - //rowSmallDists[r].dist = aboveCutoff; rowSmallDists[r].row = 0; rowSmallDists[r].col = 0; - //rowSmallDists[c].dist = aboveCutoff; rowSmallDists[c].row = 0; rowSmallDists[c].col = 0; - - //if your rows smallest distance is from smallRow or smallCol, reset - //for(int i=0;i r) { distRow = dMatrix[i][r]; } else { distRow = dMatrix[r][i]; } - + if (i > c) { distCol = dMatrix[i][c]; dMatrix[i][c] = aboveCutoff; } //like removeCell else { distCol = dMatrix[c][i]; dMatrix[c][i] = aboveCutoff; } - + if(method == "furthest"){ newDist = max(distRow, distCol); } @@ -352,39 +345,33 @@ void ClusterClassic::update(double& cutOFF){ else if (method == "nearest"){ newDist = min(distRow, distCol); } - + //cout << "newDist = " << newDist << endl; if (i > r) { dMatrix[i][r] = newDist; } else { dMatrix[r][i] = newDist; } - //if (newDist < rowSmallDists[i].dist) { rowSmallDists[i].dist = newDist; rowSmallDists[i].col = r; rowSmallDists[i].row = i; } } - //cout << "rowsmall = " << i << '\t' << rowSmallDists[i].dist << endl; } clusterBins(); clusterNames(); - - //find new small for 2 rows we just merged - //colDist temp(0,0,100.0); - //rowSmallDists[r] = temp; - - //for (int i = 0; i < dMatrix[r].size(); i++) { - // if (dMatrix[r][i] < rowSmallDists[r].dist) { rowSmallDists[r].dist = dMatrix[r][i]; rowSmallDists[r].col = r; rowSmallDists[r].row = i; } - //} - //for (int i = dMatrix[r].size()+1; i < dMatrix.size(); i++) { - // if (dMatrix[i][dMatrix[r].size()] < rowSmallDists[r].dist) { rowSmallDists[r].dist = dMatrix[i][dMatrix[r].size()]; rowSmallDists[r].col = r; rowSmallDists[r].row = i; } - //} - //rowSmallDists[c] = temp; - //for (int i = 0; i < dMatrix[c].size(); i++) { - // if (dMatrix[c][i] < rowSmallDists[c].dist) { rowSmallDists[c].dist = dMatrix[c][i]; rowSmallDists[c].col = c; rowSmallDists[c].row = i; } - //} - //for (int i = dMatrix[c].size()+1; i < dMatrix.size(); i++) { - // if (dMatrix[i][dMatrix[c].size()] < rowSmallDists[c].dist) { rowSmallDists[c].dist = dMatrix[i][dMatrix[c].size()]; rowSmallDists[c].col = c; rowSmallDists[c].row = i; } - //} + //resize each row + for(int i=0;ierrorOut(e, "ClusterClassic", "update"); diff --git a/clusterdoturcommand.cpp b/clusterdoturcommand.cpp index 4987d4e..56fbd5c 100644 --- a/clusterdoturcommand.cpp +++ b/clusterdoturcommand.cpp @@ -227,7 +227,7 @@ int ClusterDoturCommand::execute(){ int estart = time(NULL); - while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 0)){ + while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){ if (m->control_pressed) { delete cluster; delete list; delete rabund; sabundFile.close();rabundFile.close();listFile.close(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear(); return 0; } cluster->update(cutoff); diff --git a/commandfactory.cpp b/commandfactory.cpp index 927f374..3ac3781 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -96,6 +96,7 @@ #include "deuniqueseqscommand.h" #include "pairwiseseqscommand.h" #include "clusterdoturcommand.h" +#include "subsamplecommand.h" /*******************************************************/ @@ -197,6 +198,7 @@ CommandFactory::CommandFactory(){ commands["fastq.info"] = "fastq.info"; commands["deunique.seqs"] = "deunique.seqs"; commands["cluster.classic"] = "cluster.classic"; + commands["sub.sample"] = "sub.sample"; commands["pairwise.seqs"] = "MPIEnabled"; commands["pipeline.pds"] = "MPIEnabled"; commands["classify.seqs"] = "MPIEnabled"; @@ -344,6 +346,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "deunique.seqs") { command = new DeUniqueSeqsCommand(optionString); } else if(commandName == "pairwise.seqs") { command = new PairwiseSeqsCommand(optionString); } else if(commandName == "cluster.classic") { command = new ClusterDoturCommand(optionString); } + else if(commandName == "sub.sample") { command = new SubSampleCommand(optionString); } else { command = new NoCommand(optionString); } return command; @@ -458,6 +461,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString, str else if(commandName == "deunique.seqs") { pipecommand = new DeUniqueSeqsCommand(optionString); } else if(commandName == "pairwise.seqs") { pipecommand = new PairwiseSeqsCommand(optionString); } else if(commandName == "cluster.classic") { pipecommand = new ClusterDoturCommand(optionString); } + else if(commandName == "sub.sample") { pipecommand = new SubSampleCommand(optionString); } else { pipecommand = new NoCommand(optionString); } return pipecommand; @@ -560,6 +564,7 @@ Command* CommandFactory::getCommand(string commandName){ else if(commandName == "deunique.seqs") { shellcommand = new DeUniqueSeqsCommand(); } else if(commandName == "pairwise.seqs") { shellcommand = new PairwiseSeqsCommand(); } else if(commandName == "cluster.classic") { shellcommand = new ClusterDoturCommand(); } + else if(commandName == "sub.sample") { shellcommand = new SubSampleCommand(); } else { shellcommand = new NoCommand(); } return shellcommand; diff --git a/datavector.hpp b/datavector.hpp index 0b09e0d..a5f96eb 100644 --- a/datavector.hpp +++ b/datavector.hpp @@ -33,6 +33,7 @@ public: virtual void resize(int) = 0; virtual int size() = 0; virtual void print(ostream&) = 0; + virtual void clear() = 0; void setLabel(string l) { label = l; } string getLabel() { return label; } diff --git a/getseqscommand.cpp b/getseqscommand.cpp index 4b93d1f..9fbfb93 100644 --- a/getseqscommand.cpp +++ b/getseqscommand.cpp @@ -14,7 +14,7 @@ //********************************************************************************************************************** vector GetSeqsCommand::getValidParameters(){ try { - string Array[] = {"fasta","name", "group", "alignreport", "accnos", "dups", "list","taxonomy","outputdir","inputdir"}; + string Array[] = {"fasta","name", "group", "qfile","alignreport", "accnos", "dups", "list","taxonomy","outputdir","inputdir"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); return myArray; } @@ -35,6 +35,7 @@ GetSeqsCommand::GetSeqsCommand(){ outputTypes["group"] = tempOutNames; outputTypes["alignreport"] = tempOutNames; outputTypes["list"] = tempOutNames; + outputTypes["qfile"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "GetSeqsCommand", "GetSeqsCommand"); @@ -74,7 +75,7 @@ GetSeqsCommand::GetSeqsCommand(string option) { else { //valid paramters for this command - string Array[] = {"fasta","name", "group", "alignreport", "accnos", "dups", "list","taxonomy","outputdir","inputdir"}; + string Array[] = {"fasta","name", "group", "alignreport", "qfile", "accnos", "dups", "list","taxonomy","outputdir","inputdir"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -96,6 +97,7 @@ GetSeqsCommand::GetSeqsCommand(string option) { outputTypes["group"] = tempOutNames; outputTypes["alignreport"] = tempOutNames; outputTypes["list"] = tempOutNames; + outputTypes["qfile"] = tempOutNames; //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -160,6 +162,14 @@ GetSeqsCommand::GetSeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["taxonomy"] = inputDir + it->second; } } + + it = parameters.find("qfile"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["qfile"] = inputDir + it->second; } + } } @@ -192,11 +202,15 @@ GetSeqsCommand::GetSeqsCommand(string option) { if (taxfile == "not open") { abort = true; } else if (taxfile == "not found") { taxfile = ""; } + qualfile = validParameter.validFile(parameters, "qfile", true); + if (qualfile == "not open") { abort = true; } + else if (qualfile == "not found") { qualfile = ""; } + string usedDups = "true"; string temp = validParameter.validFile(parameters, "dups", false); if (temp == "not found") { temp = "false"; usedDups = ""; } dups = m->isTrue(temp); - if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy or listfile."); m->mothurOutEndLine(); abort = true; } + if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (qualfile == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy, quality or listfile."); m->mothurOutEndLine(); abort = true; } if ((usedDups != "") && (namefile == "")) { m->mothurOut("You may only use dups with the name option."); m->mothurOutEndLine(); abort = true; } @@ -212,9 +226,9 @@ GetSeqsCommand::GetSeqsCommand(string option) { void GetSeqsCommand::help(){ try { - m->mothurOut("The get.seqs command reads an .accnos file and any of the following file types: fasta, name, group, list, taxonomy or alignreport file.\n"); + m->mothurOut("The get.seqs command reads an .accnos file and any of the following file types: fasta, name, group, list, taxonomy, quality or alignreport file.\n"); m->mothurOut("It outputs a file containing only the sequences in the .accnos file.\n"); - m->mothurOut("The get.seqs command parameters are accnos, fasta, name, group, list, taxonomy, alignreport and dups. You must provide accnos and at least one of the other parameters.\n"); + m->mothurOut("The get.seqs command parameters are accnos, fasta, name, group, list, taxonomy, qfile, alignreport and dups. You must provide accnos and at least one of the other parameters.\n"); m->mothurOut("The dups parameter allows you to add the entire line from a name file if you add any name from the line. default=false. \n"); m->mothurOut("The get.seqs command should be in the following format: get.seqs(accnos=yourAccnos, fasta=yourFasta).\n"); m->mothurOut("Example get.seqs(accnos=amazon.accnos, fasta=amazon.fasta).\n"); @@ -245,6 +259,7 @@ int GetSeqsCommand::execute(){ if (alignfile != "") { readAlign(); } if (listfile != "") { readList(); } if (taxfile != "") { readTax(); } + if (qualfile != "") { readQual(); } if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } @@ -313,6 +328,71 @@ int GetSeqsCommand::readFasta(){ } } //********************************************************************************************************************** +int GetSeqsCommand::readQual(){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(qualfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(qualfile)) + "pick" + m->getExtension(qualfile); + ofstream out; + m->openOutputFile(outputFileName, out); + + + ifstream in; + m->openInputFile(qualfile, in); + string name; + + bool wroteSomething = false; + + + while(!in.eof()){ + string saveName = ""; + string name = ""; + string scores = ""; + + in >> name; + + if (name.length() != 0) { + saveName = name.substr(1); + while (!in.eof()) { + char c = in.get(); + if (c == 10 || c == 13){ break; } + else { name += c; } + } + m->gobble(in); + } + + while(in){ + char letter= in.get(); + if(letter == '>'){ in.putback(letter); break; } + else{ scores += letter; } + } + + m->gobble(in); + + if (names.count(saveName) != 0) { + wroteSomething = true; + + out << name << endl << scores; + } + + m->gobble(in); + } + in.close(); + out.close(); + + + if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); } + outputNames.push_back(outputFileName); outputTypes["qfile"].push_back(outputFileName); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "GetSeqsCommand", "readQual"); + exit(1); + } +} +//********************************************************************************************************************** int GetSeqsCommand::readList(){ try { string thisOutputDir = outputDir; diff --git a/getseqscommand.h b/getseqscommand.h index ea93b5a..b249072 100644 --- a/getseqscommand.h +++ b/getseqscommand.h @@ -29,7 +29,7 @@ class GetSeqsCommand : public Command { private: set names; vector outputNames; - string accnosfile, fastafile, namefile, groupfile, alignfile, listfile, taxfile, outputDir; + string accnosfile, fastafile, namefile, groupfile, alignfile, listfile, taxfile, qualfile, outputDir; bool abort, dups; map > outputTypes; @@ -40,6 +40,7 @@ class GetSeqsCommand : public Command { int readAccnos(); int readList(); int readTax(); + int readQual(); }; diff --git a/metastats.h b/metastats.h index 2f3cc62..63fb0ee 100644 --- a/metastats.h +++ b/metastats.h @@ -31,7 +31,7 @@ void calc_twosample_ts(double *Pmatrix,int *g,int *nc,int *nr,double *Ts,double *Tinitial,double *counter1); void meanvar(double *pmatrix,int *g,int *nr,int *nc,double *storage); void start(double *Imatrix,int *g,int *nr,int *nc,double *testing, - double storage[][9]); + double**); int metastat_main (char*, int, int, double, int, double**, int); diff --git a/metastats2.c b/metastats2.c index 136c042..b8db243 100644 --- a/metastats2.c +++ b/metastats2.c @@ -22,18 +22,23 @@ int metastat_main (char* outputFileName, int numRows, int numCols, double thresh } // Initialize the matrices - size = row*col; -//printf("size = %d\n.", size); - double matrix[row][col]; - double pmatrix[size],pmatrix2[size],permuted[size]; - double storage[row][9]; -//printf("here\n.", size); - for (i=0;i.999999999){ @@ -198,11 +211,14 @@ int metastat_main (char* outputFileName, int numRows, int numCols, double thresh } } else{ - +printf("here before testp\n"); testp(permuted_ttests, permutes, permuted,pmatrix, nc, nr, gvalue,tinitial,pvalues); // Checks to make sure the matrix isn't sparse. - double sparse[row], sparse2[row]; + double * sparse, * sparse2; + sparse = (double *) malloc(size*sizeof(double *)); + sparse2 = (double *) malloc(size*sizeof(double *)); + for(i=0;i.999999999){ @@ -271,8 +287,12 @@ int metastat_main (char* outputFileName, int numRows, int numCols, double thresh } // Calculates the mean of counts (not normalized) - double temp[row][2]; - + double ** temp; + temp = malloc(row*sizeof(double *)); + for (i = 0; i::iterator begin(); vector::iterator end(); diff --git a/qualityscores.cpp b/qualityscores.cpp index fa78b69..f76d982 100644 --- a/qualityscores.cpp +++ b/qualityscores.cpp @@ -45,7 +45,7 @@ QualityScores::QualityScores(ifstream& qFile, int l){ else { seqName = seqName.substr(1); } - + //m->getline(qFile, line); //istringstream qualStream(line); @@ -57,6 +57,40 @@ QualityScores::QualityScores(ifstream& qFile, int l){ //seqLength = qScores.size(); + /*while(!in.eof()){ + string saveName = ""; + string name = ""; + string scores = ""; + + in >> name; + //cout << name << endl; + if (name.length() != 0) { + saveName = name.substr(1); + while (!in.eof()) { + char c = in.get(); + if (c == 10 || c == 13){ break; } + else { name += c; } + } + m->gobble(in); + } + + while(in){ + char letter= in.get(); + if(letter == '>'){ in.putback(letter); break; } + else{ scores += letter; } + } + + //istringstream iss (scores,istringstream::in); + + //int count = 0; int tempScore; + //while (iss) { iss >> tempScore; count++; } + //cout << saveName << '\t' << count << endl; + + m->gobble(in); + }*/ + + + for(int i=0;i> score; qScores.push_back(score); @@ -70,6 +104,7 @@ QualityScores::QualityScores(ifstream& qFile, int l){ } } +/**************************************************************************************************/ /**************************************************************************************************/ diff --git a/rabundvector.cpp b/rabundvector.cpp index 43dbfac..6cbaa0d 100644 --- a/rabundvector.cpp +++ b/rabundvector.cpp @@ -114,7 +114,15 @@ int RAbundVector::get(int index){ return data[index]; } +/***********************************************************************/ +void RAbundVector::clear(){ + numBins = 0; + maxRank = 0; + numSeqs = 0; + data.clear(); + +} /***********************************************************************/ void RAbundVector::push_back(int binSize){ diff --git a/rabundvector.hpp b/rabundvector.hpp index fab02a9..9516564 100644 --- a/rabundvector.hpp +++ b/rabundvector.hpp @@ -41,6 +41,7 @@ public: int sum(); int sum(int); int numNZ(); + void clear(); vector::reverse_iterator rbegin(); vector::reverse_iterator rend(); diff --git a/removeseqscommand.cpp b/removeseqscommand.cpp index f1804ed..384a360 100644 --- a/removeseqscommand.cpp +++ b/removeseqscommand.cpp @@ -14,7 +14,7 @@ //********************************************************************************************************************** vector RemoveSeqsCommand::getValidParameters(){ try { - string Array[] = {"fasta","name", "group", "alignreport", "accnos", "list","taxonomy","outputdir","inputdir", "dups" }; + string Array[] = {"fasta","name", "group", "alignreport", "accnos", "qfile","list","taxonomy","outputdir","inputdir", "dups" }; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); return myArray; } @@ -35,6 +35,7 @@ RemoveSeqsCommand::RemoveSeqsCommand(){ outputTypes["group"] = tempOutNames; outputTypes["alignreport"] = tempOutNames; outputTypes["list"] = tempOutNames; + outputTypes["qfile"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "RemoveSeqsCommand", "RemoveSeqsCommand"); @@ -74,7 +75,7 @@ RemoveSeqsCommand::RemoveSeqsCommand(string option) { else { //valid paramters for this command - string Array[] = {"fasta","name", "group", "alignreport", "accnos", "list","taxonomy","outputdir","inputdir", "dups" }; + string Array[] = {"fasta","name", "group", "alignreport", "accnos", "qfile", "list","taxonomy","outputdir","inputdir", "dups" }; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -96,6 +97,7 @@ RemoveSeqsCommand::RemoveSeqsCommand(string option) { outputTypes["group"] = tempOutNames; outputTypes["alignreport"] = tempOutNames; outputTypes["list"] = tempOutNames; + outputTypes["qfile"] = tempOutNames; //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -160,6 +162,14 @@ RemoveSeqsCommand::RemoveSeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["taxonomy"] = inputDir + it->second; } } + + it = parameters.find("qfile"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["qfile"] = inputDir + it->second; } + } } @@ -191,6 +201,10 @@ RemoveSeqsCommand::RemoveSeqsCommand(string option) { taxfile = validParameter.validFile(parameters, "taxonomy", true); if (taxfile == "not open") { abort = true; } else if (taxfile == "not found") { taxfile = ""; } + + qualfile = validParameter.validFile(parameters, "qfile", true); + if (qualfile == "not open") { abort = true; } + else if (qualfile == "not found") { qualfile = ""; } string usedDups = "true"; @@ -201,7 +215,7 @@ RemoveSeqsCommand::RemoveSeqsCommand(string option) { } dups = m->isTrue(temp); - if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "")) { m->mothurOut("You must provide at least one of the following: fasta, name, group, taxonomy, alignreport or list."); m->mothurOutEndLine(); abort = true; } + if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (qualfile == "")) { m->mothurOut("You must provide at least one of the following: fasta, name, group, taxonomy, quality, alignreport or list."); m->mothurOutEndLine(); abort = true; } if ((usedDups != "") && (namefile == "")) { m->mothurOut("You may only use dups with the name option."); m->mothurOutEndLine(); abort = true; } } @@ -216,9 +230,9 @@ RemoveSeqsCommand::RemoveSeqsCommand(string option) { void RemoveSeqsCommand::help(){ try { - m->mothurOut("The remove.seqs command reads an .accnos file and at least one of the following file types: fasta, name, group, list, taxonomy or alignreport file.\n"); + m->mothurOut("The remove.seqs command reads an .accnos file and at least one of the following file types: fasta, name, group, list, taxonomy, quality or alignreport file.\n"); m->mothurOut("It outputs a file containing the sequences NOT in the .accnos file.\n"); - m->mothurOut("The remove.seqs command parameters are accnos, fasta, name, group, list, taxonomy, alignreport and dups. You must provide accnos and at least one of the file parameters.\n"); + m->mothurOut("The remove.seqs command parameters are accnos, fasta, name, group, list, taxonomy, qfile, alignreport and dups. You must provide accnos and at least one of the file parameters.\n"); m->mothurOut("The dups parameter allows you to remove the entire line from a name file if you remove any name from the line. default=true. \n"); m->mothurOut("The remove.seqs command should be in the following format: remove.seqs(accnos=yourAccnos, fasta=yourFasta).\n"); m->mothurOut("Example remove.seqs(accnos=amazon.accnos, fasta=amazon.fasta).\n"); @@ -249,6 +263,7 @@ int RemoveSeqsCommand::execute(){ if (alignfile != "") { readAlign(); } if (listfile != "") { readList(); } if (taxfile != "") { readTax(); } + if (qualfile != "") { readQual(); } if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } @@ -304,7 +319,7 @@ int RemoveSeqsCommand::readFasta(){ out.close(); if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); } - outputTypes["fasta"].push_back(outputFileName); + outputTypes["fasta"].push_back(outputFileName); outputNames.push_back(outputFileName); return 0; @@ -315,6 +330,71 @@ int RemoveSeqsCommand::readFasta(){ } } //********************************************************************************************************************** +int RemoveSeqsCommand::readQual(){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(qualfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(qualfile)) + "pick" + m->getExtension(qualfile); + ofstream out; + m->openOutputFile(outputFileName, out); + + + ifstream in; + m->openInputFile(qualfile, in); + string name; + + bool wroteSomething = false; + + + while(!in.eof()){ + string saveName = ""; + string name = ""; + string scores = ""; + + in >> name; + + if (name.length() != 0) { + saveName = name.substr(1); + while (!in.eof()) { + char c = in.get(); + if (c == 10 || c == 13){ break; } + else { name += c; } + } + m->gobble(in); + } + + while(in){ + char letter= in.get(); + if(letter == '>'){ in.putback(letter); break; } + else{ scores += letter; } + } + + m->gobble(in); + + if (names.count(saveName) == 0) { + wroteSomething = true; + + out << name << endl << scores; + } + + m->gobble(in); + } + in.close(); + out.close(); + + + if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); } + outputNames.push_back(outputFileName); outputTypes["qfile"].push_back(outputFileName); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "RemoveSeqsCommand", "readQual"); + exit(1); + } +} +//********************************************************************************************************************** int RemoveSeqsCommand::readList(){ try { string thisOutputDir = outputDir; @@ -375,7 +455,7 @@ int RemoveSeqsCommand::readList(){ out.close(); if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); } - outputTypes["list"].push_back(outputFileName); + outputTypes["list"].push_back(outputFileName); outputNames.push_back(outputFileName); return 0; @@ -461,7 +541,7 @@ int RemoveSeqsCommand::readName(){ out.close(); if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); } - outputTypes["name"].push_back(outputFileName); + outputTypes["name"].push_back(outputFileName); outputNames.push_back(outputFileName); return 0; } @@ -505,7 +585,7 @@ int RemoveSeqsCommand::readGroup(){ out.close(); if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); } - outputTypes["group"].push_back(outputFileName); + outputTypes["group"].push_back(outputFileName); outputNames.push_back(outputFileName); return 0; } @@ -547,7 +627,7 @@ int RemoveSeqsCommand::readTax(){ out.close(); if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); } - outputTypes["taxonomy"].push_back(outputFileName); + outputTypes["taxonomy"].push_back(outputFileName); outputNames.push_back(outputFileName); return 0; } @@ -613,7 +693,7 @@ int RemoveSeqsCommand::readAlign(){ out.close(); if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); } - outputTypes["alignreport"].push_back(outputFileName); + outputTypes["alignreport"].push_back(outputFileName); outputNames.push_back(outputFileName); return 0; diff --git a/removeseqscommand.h b/removeseqscommand.h index 8321212..de1e3d9 100644 --- a/removeseqscommand.h +++ b/removeseqscommand.h @@ -28,7 +28,7 @@ class RemoveSeqsCommand : public Command { private: set names; - string accnosfile, fastafile, namefile, groupfile, alignfile, listfile, taxfile, outputDir; + string accnosfile, fastafile, namefile, groupfile, alignfile, listfile, taxfile, qualfile, outputDir; bool abort, dups; vector outputNames; map > outputTypes; @@ -40,6 +40,7 @@ class RemoveSeqsCommand : public Command { void readAccnos(); int readList(); int readTax(); + int readQual(); }; diff --git a/sabundvector.cpp b/sabundvector.cpp index 6e4401b..1bceec2 100644 --- a/sabundvector.cpp +++ b/sabundvector.cpp @@ -153,7 +153,13 @@ void SAbundVector::print(string prefix, ostream& output){ } output << endl; } - +/***********************************************************************/ +void SAbundVector::clear(){ + numBins = 0; + maxRank = 0; + numSeqs = 0; + data.clear(); +} /***********************************************************************/ void SAbundVector::print(ostream& output){ try { diff --git a/sabundvector.hpp b/sabundvector.hpp index 769a35d..561294c 100644 --- a/sabundvector.hpp +++ b/sabundvector.hpp @@ -40,6 +40,7 @@ public: int sum(); void resize(int); int size(); + void clear(); void print(ostream&); void print(string, ostream&); diff --git a/sharedordervector.cpp b/sharedordervector.cpp index 3944212..15ee7fc 100644 --- a/sharedordervector.cpp +++ b/sharedordervector.cpp @@ -171,7 +171,14 @@ void SharedOrderVector::print(ostream& output){ } } +/***********************************************************************/ +void SharedOrderVector::clear(){ + numBins = 0; + maxRank = 0; + numSeqs = 0; + data.clear(); +} /***********************************************************************/ void SharedOrderVector::resize(int){ diff --git a/sharedordervector.h b/sharedordervector.h index 3568450..6aa8ba1 100644 --- a/sharedordervector.h +++ b/sharedordervector.h @@ -24,6 +24,7 @@ struct individual { bool operator()(const individual& i1, const individual& i2) { return (i1.abundance > i2.abundance); } + individual() { group = ""; bin = 0; abundance = 0; } }; struct individualFloat { @@ -33,6 +34,7 @@ struct individualFloat { bool operator()(const individual& i1, const individual& i2) { return (i1.abundance > i2.abundance); } + individualFloat() { group = ""; bin = 0; abundance = 0.0; } }; @@ -66,6 +68,7 @@ public: vector::iterator end(); void push_back(int, int, string); //OTU, abundance, group MUST CALL UPDATE STATS AFTER PUSHBACK!!! void updateStats(); + void clear(); int getNumBins(); int getNumSeqs(); diff --git a/sharedrabundfloatvector.cpp b/sharedrabundfloatvector.cpp index 84b1cf5..52ffbb8 100644 --- a/sharedrabundfloatvector.cpp +++ b/sharedrabundfloatvector.cpp @@ -131,11 +131,20 @@ void SharedRAbundFloatVector::set(int binNumber, float newBinSize, string groupn numSeqs += (newBinSize - oldBinSize); } catch(exception& e) { - m->errorOut(e, "SharedRAbundVector", "set"); + m->errorOut(e, "SharedRAbundFloatVector", "set"); exit(1); } } +/***********************************************************************/ +void SharedRAbundFloatVector::clear(){ + numBins = 0; + maxRank = 0; + numSeqs = 0; + data.clear(); + for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; lookup[i] = NULL; } + lookup.clear(); +} /***********************************************************************/ float SharedRAbundFloatVector::getAbundance(int index){ return data[index].abundance; diff --git a/sharedrabundfloatvector.h b/sharedrabundfloatvector.h index f826d9c..a3407f4 100644 --- a/sharedrabundfloatvector.h +++ b/sharedrabundfloatvector.h @@ -51,6 +51,7 @@ public: void push_back(float, string); //abundance, groupname void pop_back(); void resize(int); + void clear(); int size(); void print(ostream&); diff --git a/sharedrabundvector.cpp b/sharedrabundvector.cpp index 9328cbd..9276b7b 100644 --- a/sharedrabundvector.cpp +++ b/sharedrabundvector.cpp @@ -205,6 +205,16 @@ vector SharedRAbundVector::getData(){ } /***********************************************************************/ +void SharedRAbundVector::clear(){ + numBins = 0; + maxRank = 0; + numSeqs = 0; + data.clear(); + for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; lookup[i] = NULL; } + lookup.clear(); +} +/***********************************************************************/ + void SharedRAbundVector::push_back(int binSize, string groupName){ try { individual newGuy; @@ -486,15 +496,15 @@ SharedOrderVector SharedRAbundVector::getSharedOrderVector() { OrderVector SharedRAbundVector::getOrderVector(map* nameMap = NULL) { try { OrderVector ov; - - for(int i=0;i::reverse_iterator rbegin(); vector::reverse_iterator rend(); diff --git a/sharedsabundvector.cpp b/sharedsabundvector.cpp index 4aea589..1df608b 100644 --- a/sharedsabundvector.cpp +++ b/sharedsabundvector.cpp @@ -234,6 +234,15 @@ SharedOrderVector SharedSAbundVector::getSharedOrderVector() { exit(1); } } +/***********************************************************************/ + +void SharedSAbundVector::clear(){ + numBins = 0; + maxRank = 0; + numSeqs = 0; + data.clear(); +} + /***********************************************************************/ OrderVector SharedSAbundVector::getOrderVector(map* hold = NULL){ try { diff --git a/sharedsabundvector.h b/sharedsabundvector.h index 29b3745..cd78a2e 100644 --- a/sharedsabundvector.h +++ b/sharedsabundvector.h @@ -45,6 +45,7 @@ public: void pop_back(); void resize(int); int size(); + void clear(); void print(ostream&); diff --git a/subsamplecommand.cpp b/subsamplecommand.cpp index 04f6c7c..036341f 100644 --- a/subsamplecommand.cpp +++ b/subsamplecommand.cpp @@ -12,7 +12,7 @@ //********************************************************************************************************************** vector SubSampleCommand::getValidParameters(){ try { - string Array[] = {"groups","label","outputdir","inputdir"}; + string Array[] = {"fasta", "group", "list","shared","rabund", "name","sabund","size","groups","label","outputdir","inputdir"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); return myArray; } @@ -31,6 +31,9 @@ SubSampleCommand::SubSampleCommand(){ outputTypes["list"] = tempOutNames; outputTypes["rabund"] = tempOutNames; outputTypes["sabund"] = tempOutNames; + outputTypes["fasta"] = tempOutNames; + outputTypes["name"] = tempOutNames; + outputTypes["group"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "SubSampleCommand", "GetRelAbundCommand"); @@ -40,7 +43,8 @@ SubSampleCommand::SubSampleCommand(){ //********************************************************************************************************************** vector SubSampleCommand::getRequiredParameters(){ try { - vector myArray; + string Array[] = {"fasta","list","shared","rabund", "sabund","or"}; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); return myArray; } catch(exception& e) { @@ -51,8 +55,7 @@ vector SubSampleCommand::getRequiredParameters(){ //********************************************************************************************************************** vector SubSampleCommand::getRequiredFiles(){ try { - string Array[] = {"shared","list","rabund","sabund","or"}; - vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + vector myArray; return myArray; } catch(exception& e) { @@ -73,8 +76,8 @@ SubSampleCommand::SubSampleCommand(string option) { else { //valid paramters for this command - string AlignArray[] = {"groups","label","outputdir","inputdir"}; - vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); + string Array[] = {"fasta", "group", "list","shared","rabund", "sabund","name","size","groups","label","outputdir","inputdir"}; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); map parameters = parser.getParameters(); @@ -82,7 +85,8 @@ SubSampleCommand::SubSampleCommand(string option) { ValidParameters validParameter; //check to make sure all parameters are valid for command - for (map::iterator it = parameters.begin(); it != parameters.end(); it++) { + map::iterator it; + for (it = parameters.begin(); it != parameters.end(); it++) { if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } } @@ -92,16 +96,105 @@ SubSampleCommand::SubSampleCommand(string option) { outputTypes["list"] = tempOutNames; outputTypes["rabund"] = tempOutNames; outputTypes["sabund"] = tempOutNames; + outputTypes["fasta"] = tempOutNames; + outputTypes["name"] = tempOutNames; + outputTypes["group"] = tempOutNames; //if the user changes the output directory command factory will send this info to us in the output parameter - outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ - outputDir = ""; - outputDir += m->hasPath(globaldata->inputFileName); //if user entered a file with a path then preserve it + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it = parameters.find("list"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["list"] = inputDir + it->second; } + } + + it = parameters.find("fasta"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["fasta"] = inputDir + it->second; } + } + + it = parameters.find("shared"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["shared"] = inputDir + it->second; } + } + + it = parameters.find("group"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["group"] = inputDir + it->second; } + } + + it = parameters.find("sabund"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["sabund"] = inputDir + it->second; } + } + + it = parameters.find("rabund"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["rabund"] = inputDir + it->second; } + } + + it = parameters.find("name"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["name"] = inputDir + it->second; } + } } - //make sure the user has already run the read.otu command - if ((globaldata->getSharedFile() == "") && (globaldata->getListFile() == "") && (globaldata->getRabundFile() == "") && (globaldata->getSabundFile() == "")) { m->mothurOut("You must read a list, sabund, rabund or shared file before you can use the sub.sample command."); m->mothurOutEndLine(); abort = true; } - + //check for required parameters + listfile = validParameter.validFile(parameters, "list", true); + if (listfile == "not open") { listfile = ""; abort = true; } + else if (listfile == "not found") { listfile = ""; } + + sabundfile = validParameter.validFile(parameters, "sabund", true); + if (sabundfile == "not open") { sabundfile = ""; abort = true; } + else if (sabundfile == "not found") { sabundfile = ""; } + + rabundfile = validParameter.validFile(parameters, "rabund", true); + if (rabundfile == "not open") { rabundfile = ""; abort = true; } + else if (rabundfile == "not found") { rabundfile = ""; } + + fastafile = validParameter.validFile(parameters, "fasta", true); + if (fastafile == "not open") { fastafile = ""; abort = true; } + else if (fastafile == "not found") { fastafile = ""; } + + sharedfile = validParameter.validFile(parameters, "shared", true); + if (sharedfile == "not open") { sharedfile = ""; abort = true; } + else if (sharedfile == "not found") { sharedfile = ""; } + + namefile = validParameter.validFile(parameters, "name", true); + if (namefile == "not open") { namefile = ""; abort = true; } + else if (namefile == "not found") { namefile = ""; } + + groupfile = validParameter.validFile(parameters, "group", true); + if (groupfile == "not open") { groupfile = ""; abort = true; } + else if (groupfile == "not found") { groupfile = ""; } + + //check for optional parameter and set defaults // ...at some point should added some additional type checking... label = validParameter.validFile(parameters, "label", false); @@ -111,12 +204,6 @@ SubSampleCommand::SubSampleCommand(string option) { else { allLines = 1; } } - //if the user has not specified any labels use the ones from read.otu - if (label == "") { - allLines = globaldata->allLines; - labels = globaldata->labels; - } - groups = validParameter.validFile(parameters, "groups", false); if (groups == "not found") { groups = ""; pickedGroups = false; } else { @@ -125,6 +212,20 @@ SubSampleCommand::SubSampleCommand(string option) { globaldata->Groups = Groups; } + string temp = validParameter.validFile(parameters, "size", false); if (temp == "not found"){ temp = "0"; } + convert(temp, size); + + if ((namefile != "") && (fastafile == "")) { m->mothurOut("You may only use a namefile with a fastafile."); m->mothurOutEndLine(); abort = true; } + + if ((fastafile == "") && (listfile == "") && (sabundfile == "") && (rabundfile == "") && (sharedfile == "")) { + m->mothurOut("You must provide a fasta, list, sabund, rabund or shared file as an input file."); m->mothurOutEndLine(); abort = true; } + + if (pickedGroups && ((groupfile == "") && (sharedfile == ""))) { + m->mothurOut("You cannot pick groups without a valid group file or shared file."); m->mothurOutEndLine(); abort = true; } + + if ((groupfile != "") && ((fastafile == "") && (listfile == ""))) { + m->mothurOut("Group file only valid with listfile or fastafile."); m->mothurOutEndLine(); abort = true; } + } } @@ -138,15 +239,16 @@ SubSampleCommand::SubSampleCommand(string option) { void SubSampleCommand::help(){ try { - m->mothurOut("The get.relabund command can only be executed after a successful read.otu command of a list and group or shared file.\n"); - m->mothurOut("The get.relabund command parameters are groups, scale and label. No parameters are required.\n"); + m->mothurOut("The sub.sample command is designed to be used as a way to normalize your data, or create a smaller set from your original set.\n"); + m->mothurOut("The sub.sample command parameters are fasta, name, list, group, rabund, sabund, shared, groups, size and label. You must provide a fasta, list, sabund, rabund or shared file as an input file.\n"); m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like included. The group names are separated by dashes.\n"); m->mothurOut("The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n"); - m->mothurOut("The scale parameter allows you to select what scale you would like to use. Choices are totalgroup, totalotu, averagegroup, averageotu, default is totalgroup.\n"); - m->mothurOut("The get.relabund command should be in the following format: get.relabund(groups=yourGroups, label=yourLabels).\n"); - m->mothurOut("Example get.relabund(groups=A-B-C, scale=averagegroup).\n"); + m->mothurOut("The size parameter allows you indicate the size of your subsample.\n"); + m->mothurOut("The size parameter is not set: with shared file size=number of seqs in smallest sample, with all other files, 10% of number of seqs.\n"); + m->mothurOut("The sub.sample command should be in the following format: sub.sample(list=yourListFile, group=yourGroupFile, groups=yourGroups, label=yourLabels).\n"); + m->mothurOut("Example sub.sample(list=abrecovery.fn.list, group=abrecovery.groups, groups=B-C, size=20).\n"); m->mothurOut("The default value for groups is all the groups in your groupfile, and all labels in your inputfile will be used.\n"); - m->mothurOut("The get.relabund command outputs a .relabund file.\n"); + m->mothurOut("The sub.sample command outputs a .subsample file.\n"); m->mothurOut("Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n\n"); } @@ -158,8 +260,7 @@ void SubSampleCommand::help(){ //********************************************************************************************************************** -SubSampleCommand::~SubSampleCommand(){ -} +SubSampleCommand::~SubSampleCommand(){} //********************************************************************************************************************** @@ -168,47 +269,15 @@ int SubSampleCommand::execute(){ if (abort == true) { return 0; } - string outputFileName = outputDir + m->getRootName(m->getSimpleName(globaldata->inputFileName)) + "subsample" + m->getExtension(globaldata->inputFileName); - ofstream out; - m->openOutputFile(outputFileName, out); - out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint); - - string format = globaldata->getFormat(); - - read = new ReadOTUFile(globaldata->inputFileName); - read->read(&*globaldata); - input = globaldata->ginput; - - if (format == "sharedfile") { - lookup = input->getSharedRAbundVectors(); - outputTypes["shared"].push_back(outputFileName); - getSubSampleShared(lookup, out); - }else if (format == "list") { - list = globaldata->glist; - outputTypes["list"].push_back(outputFileName); - //getSubSamplesList(); - }else if (format == "rabund") { - rabund = globaldata->rabund; - outputTypes["rabund"].push_back(outputFileName); - //getSubSamplesRabund(); - - }else if (format == "sabund") { - sabund = globaldata->sabund; - outputTypes["sabund"].push_back(outputFileName); - //getSubSamplesSabund(); - } - - out.close(); - - //reset groups parameter - delete input; globaldata->ginput = NULL; - delete read; - - if (m->control_pressed) { outputTypes.clear(); remove(outputFileName.c_str()); return 0;} - + if (sharedfile != "") { getSubSampleShared(); } + //if (listfile != "") { getSubSampleList(); } + //if (rabund != "") { getSubSampleRabund(); } + //if (sabundfile != "") { getSubSampleSabund(); } + //if (fastafile != "") { getSubSampleFasta(); } + m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); - m->mothurOut(outputFileName); m->mothurOutEndLine(); outputNames.push_back(outputFileName); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } m->mothurOutEndLine(); return 0; @@ -219,26 +288,35 @@ int SubSampleCommand::execute(){ } } //********************************************************************************************************************** -int SubSampleCommand::getSubSampleShared(vector& thislookup, ofstream& filename) { +int SubSampleCommand::getSubSampleShared() { try { + + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(sharedfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile)) + "subsample" + m->getExtension(sharedfile); + + ofstream out; + m->openOutputFile(outputFileName, out); + outputTypes["shared"].push_back(outputFileName); outputNames.push_back(outputFileName); + + InputData* input = new InputData(sharedfile, "sharedfile"); + vector lookup = input->getSharedRAbundVectors(); + string lastLabel = lookup[0]->getLabel(); //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. set processedLabels; set userLabels = labels; - string lastLabel = lookup[0]->getLabel(); //as long as you are not at the end of the file or done wih the lines you want while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { - if (m->control_pressed) { return 0; } + if (m->control_pressed) { out.close(); return 0; } if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){ m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine(); - //process lookup - - + processShared(lookup, out); processedLabels.insert(lookup[0]->getLabel()); userLabels.erase(lookup[0]->getLabel()); @@ -252,7 +330,8 @@ int SubSampleCommand::getSubSampleShared(vector& thislookup lookup = input->getSharedRAbundVectors(lastLabel); m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine(); - //process lookup + processShared(lookup, out); + processedLabels.insert(lookup[0]->getLabel()); userLabels.erase(lookup[0]->getLabel()); @@ -269,7 +348,7 @@ int SubSampleCommand::getSubSampleShared(vector& thislookup } - if (m->control_pressed) { return 0; } + if (m->control_pressed) { out.close(); return 0; } //output error messages about any remaining user labels set::iterator it; @@ -291,15 +370,12 @@ int SubSampleCommand::getSubSampleShared(vector& thislookup m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine(); - //process lookup - - + processShared(lookup, out); for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } } - //reset groups parameter - globaldata->Groups.clear(); + out.close(); return 0; @@ -309,8 +385,74 @@ int SubSampleCommand::getSubSampleShared(vector& thislookup exit(1); } } - - +//********************************************************************************************************************** +int SubSampleCommand::processShared(vector& thislookup, ofstream& out) { + try { + + if (pickedGroups) { eliminateZeroOTUS(thislookup); } + + if (size == 0) { //user has not set size, set size = smallest samples size + size = thislookup[0]->getNumSeqs(); + for (int i = 1; i < thislookup.size(); i++) { + int thisSize = thislookup[i]->getNumSeqs(); + + if (thisSize < size) { size = thisSize; } + } + } + + int numBins = thislookup[0]->getNumBins(); + for (int i = 0; i < thislookup.size(); i++) { + int thisSize = thislookup[i]->getNumSeqs(); + + if (thisSize != size) { + + string thisgroup = thislookup[i]->getGroup(); + + OrderVector* order = new OrderVector(); + for(int p=0;pgetAbundance(p);j++){ + order->push_back(p); + } + } + random_shuffle(order->begin(), order->end()); + + SharedRAbundVector* temp = new SharedRAbundVector(numBins); + temp->setLabel(thislookup[i]->getLabel()); + temp->setGroup(thislookup[i]->getGroup()); + + delete thislookup[i]; + thislookup[i] = temp; + + + for (int j = 0; j < size; j++) { + //get random number to sample from order between 0 and thisSize-1. + int myrand = (int)((float)(rand()) / (RAND_MAX / (thisSize-1) + 1)); + + int bin = order->get(myrand); + + int abund = thislookup[i]->getAbundance(bin); + thislookup[i]->set(bin, (abund+1), thisgroup); + } + delete order; + } + } + + //subsampling may have created some otus with no sequences in them + eliminateZeroOTUS(thislookup); + + for (int i = 0; i < thislookup.size(); i++) { + out << thislookup[i]->getLabel() << '\t' << thislookup[i]->getGroup() << '\t'; + thislookup[i]->print(out); + } + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "SubSampleCommand", "eliminateZeroOTUS"); + exit(1); + } +} //********************************************************************************************************************** int SubSampleCommand::eliminateZeroOTUS(vector& thislookup) { try { @@ -326,7 +468,7 @@ int SubSampleCommand::eliminateZeroOTUS(vector& thislookup) //for each bin for (int i = 0; i < thislookup[0]->getNumBins(); i++) { if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; } - + //look at each sharedRabund and make sure they are not all zero bool allZero = true; for (int j = 0; j < thislookup.size(); j++) { @@ -340,13 +482,14 @@ int SubSampleCommand::eliminateZeroOTUS(vector& thislookup) } } } - + for (int j = 0; j < thislookup.size(); j++) { delete thislookup[j]; } - + thislookup.clear(); + thislookup = newLookup; return 0; - + } catch(exception& e) { m->errorOut(e, "SubSampleCommand", "eliminateZeroOTUS"); @@ -357,3 +500,4 @@ int SubSampleCommand::eliminateZeroOTUS(vector& thislookup) //********************************************************************************************************************** + diff --git a/subsamplecommand.h b/subsamplecommand.h index 98085ee..f068ac3 100644 --- a/subsamplecommand.h +++ b/subsamplecommand.h @@ -11,11 +11,12 @@ */ #include "command.hpp" -#include "inputdata.h" -#include "readotu.h" +#include "globaldata.hpp" #include "sharedrabundvector.h" +#include "listvector.hpp" +#include "rabundvector.hpp" +#include "inputdata.h" -class GlobalData; class SubSampleCommand : public Command { @@ -32,21 +33,19 @@ public: private: GlobalData* globaldata; - ReadOTUFile* read; - InputData* input; - vector lookup; - ListVector* list; - RAbundVector* rabund; - SAbundVector* sabund; - bool abort, allLines, pickedGroups; + bool abort, pickedGroups, allLines; + string listfile, groupfile, sharedfile, rabundfile, sabundfile, fastafile, namefile; set labels; //holds labels to be used string groups, label, outputDir; vector Groups, outputNames; map > outputTypes; + int size; int eliminateZeroOTUS(vector&); - int getSubSampleShared(vector&, ofstream&); + int getSubSampleShared(); + int processShared(vector&, ofstream&); + }; #endif