From f5023c911c377e5320c5110c78af98dd8841ef58 Mon Sep 17 00:00:00 2001 From: westcott Date: Mon, 2 Nov 2009 15:43:43 +0000 Subject: [PATCH] added groups option to read.otu, added qtrim option to trim.seqs, fixed bug in get.oturep that started bin numbers at 0, finished work on hcluster command. --- getsharedotucommand.cpp | 11 +++++--- hcluster.cpp | 12 ++++----- hcluster.h | 8 +----- hclustercommand.cpp | 3 +-- mothur.h | 8 ++++++ sharedcommand.cpp | 59 ++++++++++++++++++++++++++++++++++++----- sharedcommand.h | 3 +++ sharedlistvector.cpp | 31 +++++++++++++++------- sharedrabundvector.h | 2 +- trimseqscommand.cpp | 18 ++++++++++--- trimseqscommand.h | 2 +- 11 files changed, 116 insertions(+), 41 deletions(-) diff --git a/getsharedotucommand.cpp b/getsharedotucommand.cpp index 56f030d..6d13472 100644 --- a/getsharedotucommand.cpp +++ b/getsharedotucommand.cpp @@ -93,6 +93,7 @@ void GetSharedOTUCommand::help(){ mothurOut("The get.sharedotu command outputs a .names file for each distance level containing a list of sequences in the OTUs shared by the groups specified.\n"); mothurOut("The get.sharedotu command should be in the following format: get.sabund(label=yourLabels, groups=yourGroups, fasta=yourFastafile, output=yourOutput).\n"); mothurOut("Example get.sharedotu(list=amazon.fn.list, label=unique-0.01, group=forest-pasture, fasta=amazon.fasta, output=accnos).\n"); + mothurOut("The output to the screen is the distance and the number of otus at that distance for the groups you specified.\n"); mothurOut("The default value for label is all labels in your inputfile. The default for groups is all groups in your file.\n"); mothurOut("Note: No spaces between parameter labels (i.e. label), '=' and parameters (i.e.yourLabel).\n\n"); } @@ -227,6 +228,7 @@ void GetSharedOTUCommand::process(ListVector* shared) { openOutputFile(outputFileNames, outNames); bool wroteSomething = false; + int num = 0; //go through each bin, find out if shared for (int i = 0; i < shared->getNumBins(); i++) { @@ -248,7 +250,7 @@ void GetSharedOTUCommand::process(ListVector* shared) { //find group string seqGroup = groupMap->getGroup(name); if (output != "accnos") { - namesOfSeqsInThisBin.push_back((name + "|" + seqGroup + "|" + toString(i))); + namesOfSeqsInThisBin.push_back((name + "|" + seqGroup + "|" + toString(i+1))); }else { namesOfSeqsInThisBin.push_back(name); } if (seqGroup == "not found") { mothurOut(name + " is not in your groupfile. Please correct."); mothurOutEndLine(); exit(1); } @@ -264,7 +266,7 @@ void GetSharedOTUCommand::process(ListVector* shared) { if (sharedByAll) { string seqGroup = groupMap->getGroup(names); if (output != "accnos") { - namesOfSeqsInThisBin.push_back((names + "|" + seqGroup + "|" + toString(i))); + namesOfSeqsInThisBin.push_back((names + "|" + seqGroup + "|" + toString(i+1))); }else { namesOfSeqsInThisBin.push_back(names); } if (seqGroup == "not found") { mothurOut(names + " is not in your groupfile. Please correct."); mothurOutEndLine(); exit(1); } @@ -285,6 +287,7 @@ void GetSharedOTUCommand::process(ListVector* shared) { if (sharedByAll) { wroteSomething = true; + num++; //output list of names for (int j = 0; j < namesOfSeqsInThisBin.size(); j++) { @@ -310,7 +313,7 @@ void GetSharedOTUCommand::process(ListVector* shared) { if (!wroteSomething) { remove(outputFileNames.c_str()); - string outputString = " - No otus shared by groups"; + string outputString = "\t" + toString(num) + " - No otus shared by groups"; string groupString = ""; for (int h = 0; h < Groups.size(); h++) { @@ -319,7 +322,7 @@ void GetSharedOTUCommand::process(ListVector* shared) { outputString += groupString + "."; mothurOut(outputString); mothurOutEndLine(); - }else { mothurOutEndLine(); } + }else { mothurOut("\t" + toString(num)); mothurOutEndLine(); } //if fasta file provided output new fasta file if ((fastafile != "") && wroteSomething) { diff --git a/hcluster.cpp b/hcluster.cpp index 4e83777..2d980d7 100644 --- a/hcluster.cpp +++ b/hcluster.cpp @@ -235,19 +235,17 @@ void HCluster::updateArrayandLinkTable() { } //cout << "here3" << endl; linkTable[colSpot].erase(size); - linkTable.erase(linkTable.begin()+rowSpot); //delete col + //linkTable.erase(linkTable.begin()+rowSpot); //delete row //printInfo(); //update activerows activeLinks.erase(smallRow); activeLinks.erase(smallCol); - - if(rowSpot>colSpot) { activeLinks[size] = colSpot; } - else{ activeLinks[size] = colSpot; } + activeLinks[size] = colSpot; //adjust everybody elses spot since you deleted - time vs. space - for (it = activeLinks.begin(); it != activeLinks.end(); it++) { - if (it->second > rowSpot) { activeLinks[it->first]--; } - } + //for (it = activeLinks.begin(); it != activeLinks.end(); it++) { + //if (it->second > rowSpot) { activeLinks[it->first]--; } + //} //cout << "here4" << endl; diff --git a/hcluster.h b/hcluster.h index ff02b1c..6b2862b 100644 --- a/hcluster.h +++ b/hcluster.h @@ -15,19 +15,13 @@ class RAbundVector; class ListVector; -/************************************************************/ -struct clusterNode { - int numSeq; - int parent; - int smallChild; //used to make linkTable work with list and rabund. represents bin number of this cluster node - clusterNode(int num, int par, int kid) : numSeq(num), parent(par), smallChild(kid) {}; -}; /***********************************************************************/ class HCluster { public: HCluster(RAbundVector*, ListVector*); + ~HCluster(){}; bool update(int, int, float); //string getTag(); diff --git a/hclustercommand.cpp b/hclustercommand.cpp index 6e3ec24..e140f8c 100644 --- a/hclustercommand.cpp +++ b/hclustercommand.cpp @@ -168,8 +168,6 @@ int HClusterCommand::execute(){ mothurOut("Error: no list vector!"); mothurOutEndLine(); return 0; } - - float previousDist = 0.00000; float rndPreviousDist = 0.00000; oldRAbund = *rabund; @@ -260,6 +258,7 @@ int HClusterCommand::execute(){ rabundFile.close(); listFile.close(); + delete cluster; //if (isTrue(timing)) { mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster. "); mothurOutEndLine(); //} diff --git a/mothur.h b/mothur.h index 84dbdb3..23b9f18 100644 --- a/mothur.h +++ b/mothur.h @@ -78,6 +78,14 @@ struct ThreadNode { IntNode* right; }; +/************************************************************/ +struct clusterNode { + int numSeq; + int parent; + int smallChild; //used to make linkTable work with list and rabund. represents bin number of this cluster node + clusterNode(int num, int par, int kid) : numSeq(num), parent(par), smallChild(kid) {}; +}; + /***********************************************************************/ // snagged from http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.2 diff --git a/sharedcommand.cpp b/sharedcommand.cpp index 727a04b..0dcdfc0 100644 --- a/sharedcommand.cpp +++ b/sharedcommand.cpp @@ -23,20 +23,27 @@ SharedCommand::SharedCommand(){ groupMap = globaldata->gGroupmap; + //if hte user has not specified any groups then use them all + if (globaldata->Groups.size() == 0) { + groups = groupMap->namesOfGroups; + }else{ //they have specified groups + groups = globaldata->Groups; + } + //fill filehandles with neccessary ofstreams int i; ofstream* temp; - for (i=0; igetNumGroups(); i++) { + for (i=0; inamesOfGroups[i]] = temp; + filehandles[groups[i]] = temp; } //set fileroot fileroot = getRootName(globaldata->getListFile()); //clears file before we start to write to it below - for (int i=0; igetNumGroups(); i++) { - remove((fileroot + groupMap->namesOfGroups[i] + ".rabund").c_str()); + for (int i=0; igetLabel(); vector lookup; - if (SharedList->getNumSeqs() != groupMap->getNumSeqs()) { + if ((globaldata->Groups.size() == 0) && (SharedList->getNumSeqs() != groupMap->getNumSeqs())) { //if the user has not specified any groups and their files don't match exit with error mothurOut("Your group file contains " + toString(groupMap->getNumSeqs()) + " sequences and list file contains " + toString(SharedList->getNumSeqs()) + " sequences. Please correct."); mothurOutEndLine(); out.close(); @@ -82,15 +89,36 @@ int SharedCommand::execute(){ return 1; } + //if user has specified groups make new groupfile for them + if (globaldata->Groups.size() != 0) { //make new group file + string groups = ""; + for (int i = 0; i < globaldata->Groups.size(); i++) { + groups += globaldata->Groups[i] + "."; + } + + string newGroupFile = getRootName(globaldata->inputFileName) + groups + "groups"; + ofstream outGroups; + openOutputFile(newGroupFile, outGroups); + + vector names = groupMap->getNamesSeqs(); + string groupName; + for (int i = 0; i < names.size(); i++) { + groupName = groupMap->getGroup(names[i]); + if (isValidGroup(groupName, globaldata->Groups)) { + outGroups << names[i] << '\t' << groupName << endl; + } + } + outGroups.close(); + } + //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. set processedLabels; set userLabels = globaldata->labels; while((SharedList != NULL) && ((globaldata->allLines == 1) || (userLabels.size() != 0))) { - if(globaldata->allLines == 1 || globaldata->labels.count(SharedList->getLabel()) == 1){ - + lookup = SharedList->getSharedRAbundVector(); mothurOut(lookup[0]->getLabel()); mothurOutEndLine(); @@ -280,3 +308,20 @@ SharedCommand::~SharedCommand(){ } //********************************************************************************************************************** + +bool SharedCommand::isValidGroup(string groupname, vector groups) { + try { + for (int i = 0; i < groups.size(); i++) { + if (groupname == groups[i]) { return true; } + } + + return false; + } + catch(exception& e) { + errorOut(e, "SharedCommand", "isValidGroup"); + exit(1); + } +} +/************************************************************/ + + diff --git a/sharedcommand.h b/sharedcommand.h index f0c2488..b305e9a 100644 --- a/sharedcommand.h +++ b/sharedcommand.h @@ -34,11 +34,14 @@ public: private: void printSharedData(vector); void createMisMatchFile(); + bool isValidGroup(string, vector); + GlobalData* globaldata; ReadOTUFile* read; SharedListVector* SharedList; InputData* input; GroupMap* groupMap; + vector groups; ofstream out; string filename, fileroot; bool firsttime; diff --git a/sharedlistvector.cpp b/sharedlistvector.cpp index a113ebe..6cac9b1 100644 --- a/sharedlistvector.cpp +++ b/sharedlistvector.cpp @@ -264,22 +264,23 @@ vector SharedListVector::getSharedRAbundVector() { SharedUtil* util; util = new SharedUtil(); vector lookup; + vector lookup2; map finder; + map::iterator it; string group, names, name; - + util->setGroups(globaldata->Groups, globaldata->gGroupmap->namesOfGroups); - delete util; - for (int i = 0; i < globaldata->Groups.size(); i++) { + for (int i = 0; i < globaldata->gGroupmap->namesOfGroups.size(); i++) { SharedRAbundVector* temp = new SharedRAbundVector(data.size()); - finder[globaldata->Groups[i]] = temp; - finder[globaldata->Groups[i]]->setLabel(label); - finder[globaldata->Groups[i]]->setGroup(globaldata->Groups[i]); + finder[globaldata->gGroupmap->namesOfGroups[i]] = temp; + finder[globaldata->gGroupmap->namesOfGroups[i]]->setLabel(label); + finder[globaldata->gGroupmap->namesOfGroups[i]]->setGroup(globaldata->gGroupmap->namesOfGroups[i]); //*temp = getSharedRAbundVector(globaldata->Groups[i]); - lookup.push_back(finder[globaldata->Groups[i]]); + lookup.push_back(finder[globaldata->gGroupmap->namesOfGroups[i]]); } - +//cout << "after blanks" << endl; //fill vectors for(int i=0;i SharedListVector::getSharedRAbundVector() { name = names.substr(0,names.find_first_of(',')); names = names.substr(names.find_first_of(',')+1, names.length()); group = groupmap->getGroup(name); +//cout << i << '\t' << name << '\t' << group << endl; if(group == "not found") { mothurOut("Error: Sequence '" + name + "' was not found in the group file, please correct."); mothurOutEndLine(); exit(1); } finder[group]->set(i, finder[group]->getAbundance(i) + 1, group); //i represents what bin you are in } //get last name group = groupmap->getGroup(names); +//cout << i << '\t' << names << '\t' << group << endl; if(group == "not found") { mothurOut("Error: Sequence '" + names + "' was not found in the group file, please correct."); mothurOutEndLine(); exit(1); } finder[group]->set(i, finder[group]->getAbundance(i) + 1, group); //i represents what bin you are in } - return lookup; + if (globaldata->Groups.size() == globaldata->gGroupmap->namesOfGroups.size()) { //no groups specified + lookup2 = lookup; + }else{ //delete unwanted groups + for (int i = 0; i < globaldata->Groups.size(); i++) { + SharedRAbundVector* temp = new SharedRAbundVector(*finder[globaldata->Groups[i]]); + lookup2.push_back(temp); + delete finder[globaldata->Groups[i]]; //so we don't get dup memory + } + } + + return lookup2; } catch(exception& e) { errorOut(e, "SharedListVector", "getSharedRAbundVector"); diff --git a/sharedrabundvector.h b/sharedrabundvector.h index 0c71394..14ec1b8 100644 --- a/sharedrabundvector.h +++ b/sharedrabundvector.h @@ -30,7 +30,7 @@ public: SharedRAbundVector(); SharedRAbundVector(int); //SharedRAbundVector(string, vector); - SharedRAbundVector(const SharedRAbundVector& bv) : DataVector(bv), data(bv.data), maxRank(bv.maxRank), numBins(bv.numBins), numSeqs(bv.numSeqs){}; + SharedRAbundVector(const SharedRAbundVector& bv) : DataVector(bv), data(bv.data), maxRank(bv.maxRank), numBins(bv.numBins), numSeqs(bv.numSeqs), group(bv.group), index(bv.index){}; SharedRAbundVector(ifstream&); ~SharedRAbundVector(); diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index 455cdc2..5ca70a9 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -21,7 +21,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option){ else { //valid paramters for this command - string AlignArray[] = {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", "qthreshold", "qaverage", "allfiles"}; + string AlignArray[] = {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", "qthreshold", "qaverage", "allfiles", "qtrim"}; vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); @@ -72,6 +72,9 @@ TrimSeqsCommand::TrimSeqsCommand(string option){ temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; } convert(temp, qThreshold); + + temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "F"; } + qtrim = isTrue(temp); temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; } convert(temp, qAverage); @@ -104,7 +107,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option){ void TrimSeqsCommand::help(){ try { mothurOut("The trim.seqs command reads a fastaFile and creates .....\n"); - mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength and maxlength.\n"); + mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, qtrim and allfiles.\n"); mothurOut("The fasta parameter is required.\n"); mothurOut("The flip parameter .... The default is 0.\n"); mothurOut("The oligos parameter .... The default is "".\n"); @@ -112,11 +115,17 @@ void TrimSeqsCommand::help(){ mothurOut("The maxhomop parameter .... The default is 0.\n"); mothurOut("The minlength parameter .... The default is 0.\n"); mothurOut("The maxlength parameter .... The default is 0.\n"); + mothurOut("The qfile parameter .....\n"); + mothurOut("The qthreshold parameter .... The default is 0.\n"); + mothurOut("The qaverage parameter .... The default is 0.\n"); + mothurOut("The allfiles parameter .... The default is F.\n"); + mothurOut("The qtrim parameter .... The default is F.\n"); mothurOut("The trim.seqs command should be in the following format: \n"); mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n"); mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n"); mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n"); - mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n"); + mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n"); + mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n"); } catch(exception& e) { @@ -170,6 +179,9 @@ int TrimSeqsCommand::execute(){ if(qFileName != ""){ if(qThreshold != 0) { success = stripQualThreshold(currSeq, qFile); } else if(qAverage != 0) { success = cullQualAverage(currSeq, qFile); } + if ((!qtrim) && (origSeq.length() != currSeq.getUnaligned().length())) { + success = 0; //if you don't want to trim and the sequence does not meet quality requirements, move to scrap + } if(!success) { trashCode += 'q'; } } if(barcodes.size() != 0){ diff --git a/trimseqscommand.h b/trimseqscommand.h index 87b813f..ca02961 100644 --- a/trimseqscommand.h +++ b/trimseqscommand.h @@ -36,7 +36,7 @@ private: bool abort; string fastaFile, oligoFile, qFileName; - bool flip, allFiles; + bool flip, allFiles, qtrim; int numFPrimers, numRPrimers, maxAmbig, maxHomoP, minLength, maxLength, qThreshold, qAverage; vector forPrimer, revPrimer; map barcodes; -- 2.39.2