From: westcott Date: Tue, 19 Jan 2010 13:48:32 +0000 (+0000) Subject: added warning about average neighbor merges around cutoff. fixed little bugs. X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=832d53a9dfac6b1795735eec643d8cf627b0d8e3 added warning about average neighbor merges around cutoff. fixed little bugs. --- diff --git a/averagelinkage.cpp b/averagelinkage.cpp index 7a4cb88..db2c51e 100644 --- a/averagelinkage.cpp +++ b/averagelinkage.cpp @@ -11,8 +11,8 @@ /***********************************************************************/ -AverageLinkage::AverageLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm) : - Cluster(rav, lv, dm) +AverageLinkage::AverageLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm, float c) : + Cluster(rav, lv, dm, c) { saveRow = -1; saveCol = -1; @@ -37,8 +37,16 @@ bool AverageLinkage::updateDistance(MatData& colCell, MatData& rowCell) { saveRow = smallRow; saveCol = smallCol; } - + + float oldColCell = colCell->dist; + colCell->dist = (colBin * colCell->dist + rowBin * rowCell->dist) / totalBin; + + //warn user if merge with value above cutoff produces a value below cutoff + if ((colCell->dist < cutoff) && ((oldColCell > cutoff) || (rowCell->dist > cutoff)) ) { + mothurOut("Warning: merging " + toString(oldColCell) + " with " + toString(rowCell->dist) + ", new value = " + toString(colCell->dist) + ". Results will differ from those if cutoff was used in the read.dist command."); mothurOutEndLine(); + } + return(true); } catch(exception& e) { diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index 5224897..39dda9f 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -81,7 +81,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option){ namefile = validParameter.validFile(parameters, "name", false); - if (fastaFileName == "not found") { namefile = ""; } + if (namefile == "not found") { namefile = ""; } else { splitAtDash(namefile, namefileNames); @@ -297,7 +297,7 @@ int ClassifySeqsCommand::execute(){ #endif //make taxonomy tree from new taxonomy file PhyloTree taxaBrowser; - + ifstream in; openInputFile(tempTaxonomyFile, in); diff --git a/cluster.cpp b/cluster.cpp index 429e19a..00d8083 100644 --- a/cluster.cpp +++ b/cluster.cpp @@ -14,8 +14,8 @@ /***********************************************************************/ -Cluster::Cluster(RAbundVector* rav, ListVector* lv, SparseMatrix* dm) : -rabund(rav), list(lv), dMatrix(dm) +Cluster::Cluster(RAbundVector* rav, ListVector* lv, SparseMatrix* dm, float c) : +rabund(rav), list(lv), dMatrix(dm), cutoff(c) { /* cout << "sizeof(MatData): " << sizeof(MatData) << endl; diff --git a/cluster.hpp b/cluster.hpp index a6024e7..38897c9 100644 --- a/cluster.hpp +++ b/cluster.hpp @@ -13,7 +13,7 @@ typedef vector MatVec; class Cluster { public: - Cluster(RAbundVector*, ListVector*, SparseMatrix*); + Cluster(RAbundVector*, ListVector*, SparseMatrix*, float); virtual void update(); virtual string getTag() = 0; virtual void setMapWanted(bool m); @@ -37,6 +37,7 @@ protected: int smallCol; float smallDist; bool mapWanted; + float cutoff; map seq2Bin; vector seqVec; // contains vectors of cells related to a certain sequence @@ -50,7 +51,7 @@ protected: class CompleteLinkage : public Cluster { public: - CompleteLinkage(RAbundVector*, ListVector*, SparseMatrix*); + CompleteLinkage(RAbundVector*, ListVector*, SparseMatrix*, float); bool updateDistance(MatData& colCell, MatData& rowCell); string getTag(); @@ -62,7 +63,7 @@ private: class SingleLinkage : public Cluster { public: - SingleLinkage(RAbundVector*, ListVector*, SparseMatrix*); + SingleLinkage(RAbundVector*, ListVector*, SparseMatrix*, float); void update(); bool updateDistance(MatData& colCell, MatData& rowCell); string getTag(); @@ -75,7 +76,7 @@ private: class AverageLinkage : public Cluster { public: - AverageLinkage(RAbundVector*, ListVector*, SparseMatrix*); + AverageLinkage(RAbundVector*, ListVector*, SparseMatrix*, float); bool updateDistance(MatData& colCell, MatData& rowCell); string getTag(); diff --git a/clustercommand.cpp b/clustercommand.cpp index baf4224..93ba1f9 100644 --- a/clustercommand.cpp +++ b/clustercommand.cpp @@ -81,9 +81,9 @@ ClusterCommand::ClusterCommand(string option){ } //create cluster - if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix); } - else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix); } - else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix); } + if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff); } + else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff); } + else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff); } tag = cluster->getTag(); fileroot = getRootName(globaldata->inputFileName); diff --git a/completelinkage.cpp b/completelinkage.cpp index 09ba963..a09af90 100644 --- a/completelinkage.cpp +++ b/completelinkage.cpp @@ -3,8 +3,8 @@ /***********************************************************************/ -CompleteLinkage::CompleteLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm) : - Cluster(rav, lv, dm) +CompleteLinkage::CompleteLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm, float c) : + Cluster(rav, lv, dm, c) {} /***********************************************************************/ diff --git a/formatcolumn.cpp b/formatcolumn.cpp index e68d85d..a11601b 100644 --- a/formatcolumn.cpp +++ b/formatcolumn.cpp @@ -47,9 +47,8 @@ void FormatColumnMatrix::read(NameAssignment* nameMap){ if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } if (distance == -1) { distance = 1000000; } - - if(distance < cutoff && itA != itB){ - + + if((distance < cutoff) && (itA != itB)){ if(refRow == refCol){ // in other words, if we haven't loaded refRow and refCol... refRow = itA->second; refCol = itB->second; @@ -71,7 +70,7 @@ void FormatColumnMatrix::read(NameAssignment* nameMap){ } out.close(); fileHandle.close(); - + string squareFile; if(lt == 0){ // oops, it was square squareFile = filename; @@ -129,10 +128,13 @@ void FormatColumnMatrix::read(NameAssignment* nameMap){ rowMap.clear(); //save row you just read - rowMap[second] = dist; - + if (dist < cutoff) { + rowMap[second] = dist; + } }else{ - rowMap[second] = dist; + if (dist < cutoff) { + rowMap[second] = dist; + } } } diff --git a/formatmatrix.h b/formatmatrix.h index a02ef6c..4ef36d9 100644 --- a/formatmatrix.h +++ b/formatmatrix.h @@ -15,6 +15,43 @@ #include "nameassignment.hpp" +//********************************************************************************************************************** +// This class takes a distance matrix file and converts it to a file where each row contains all distances below the cutoff +// for a given sequence. + +// Example: + /* 5 + A + B 0.01 + C 0.015 0.03 + D 0.03 0.02 0.02 + E 0.04 0.05 0.03 0.02 + + becomes + + 0 4 1 0.01 2 0.015 3 0.03 4 0.04 + 1 4 0 0.01 2 0.03 3 0.02 4 0.05 + 2 4 0 0.015 1 0.03 3 0.02 4 0.03 + 3 4 0 0.03 1 0.02 2 0.02 4 0.02 + 4 4 0 0.04 1 0.05 2 0.03 3 0.02 + + column 1 - sequence name converted to row number + column 2 - numDists under cutoff + rest of line - sequence row -> distance, sequence row -> distance + + if you had a cutoff of 0.03 then the file would look like, + + 0 3 1 0.01 2 0.015 3 0.03 + 1 3 0 0.01 2 0.03 3 0.02 + 2 4 0 0.015 1 0.03 3 0.02 4 0.03 + 3 4 0 0.03 1 0.02 2 0.02 4 0.02 + 4 2 2 0.03 3 0.02 + + This class also creates a vector of ints, rowPos. + + rowPos[0] = position in the file of distances related to sequence 0. + If a sequence is excluded by the cutoff, it's rowPos = -1. +*/ //********************************************************************************************************************** class FormatMatrix { diff --git a/getoturepcommand.cpp b/getoturepcommand.cpp index 9667755..d37ee19 100644 --- a/getoturepcommand.cpp +++ b/getoturepcommand.cpp @@ -48,7 +48,7 @@ GetOTURepCommand::GetOTURepCommand(string option){ help(); abort = true; } else { //valid paramters for this command - string Array[] = {"fasta","list","label","name", "group", "sorted", "phylip","column","large","cutoff"}; + string Array[] = {"fasta","list","label","name", "group", "sorted", "phylip","column","large","cutoff","precision"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -139,11 +139,12 @@ GetOTURepCommand::GetOTURepCommand(string option){ void GetOTURepCommand::help(){ try { - mothurOut("The get.oturep command can only be executed after a successful read.dist command.\n"); - mothurOut("The get.oturep command parameters are list, fasta, name, group, large, sorted and label. The fasta and list parameters are required.\n"); + mothurOut("The get.oturep command parameters are phylip, column, list, fasta, name, group, large, cutoff, precision, sorted and label. The fasta and list parameters are required, as well as phylip or column and name.\n"); mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n"); - mothurOut("The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n"); - mothurOut("Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups, name=amazon.names).\n"); + mothurOut("The phylip or column parameter is required, but only one may be used. If you use a column file the name filename is required. \n"); + mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n"); + mothurOut("The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n"); + mothurOut("Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n"); mothurOut("The default value for label is all labels in your inputfile.\n"); mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n"); mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n"); @@ -328,8 +329,10 @@ int GetOTURepCommand::execute(){ } //close and remove formatted matrix file - inRow.close(); - remove(distFile.c_str()); + if (large) { + inRow.close(); + //remove(distFile.c_str()); + } globaldata->gListVector = NULL; delete input; globaldata->ginput = NULL; diff --git a/hcluster.cpp b/hcluster.cpp index 3b972c0..259036f 100644 --- a/hcluster.cpp +++ b/hcluster.cpp @@ -27,7 +27,9 @@ HCluster::HCluster(RAbundVector* rav, ListVector* lv, string m, string d, NameAs if (method != "average") { openInputFile(distfile, filehandle); - }else{ firstRead = true; } + }else{ + processFile(); + } } catch(exception& e) { errorOut(e, "HCluster", "HCluster"); @@ -358,7 +360,6 @@ vector HCluster::getSeqs(){ if(method != "average") { sameSeqs = getSeqsFNNN(); }else{ - if (firstRead) { processFile(); } sameSeqs = getSeqsAN(); } @@ -755,8 +756,6 @@ void HCluster::processFile() { remove(distfile.c_str()); rename(outTemp.c_str(), distfile.c_str()); - - firstRead = false; } catch(exception& e) { errorOut(e, "HCluster", "processFile"); diff --git a/hcluster.h b/hcluster.h index 679abbc..bff4328 100644 --- a/hcluster.h +++ b/hcluster.h @@ -71,7 +71,7 @@ protected: int smallCol; float smallDist, cutoff; map seq2Bin; - bool mapWanted, exitedBreak, firstRead; + bool mapWanted, exitedBreak; seqDist next; string method, distfile; ifstream filehandle; diff --git a/hclustercommand.cpp b/hclustercommand.cpp index bdec48e..c39b002 100644 --- a/hclustercommand.cpp +++ b/hclustercommand.cpp @@ -178,9 +178,6 @@ int HClusterCommand::execute(){ print_start = true; start = time(NULL); - - //ifstream in; - //openInputFile(distfile, in); cluster = new HCluster(rabund, list, method, distfile, globaldata->nameMap, cutoff); vector seqs; seqs.resize(1); // to start loop @@ -192,19 +189,17 @@ int HClusterCommand::execute(){ for (int i = 0; i < seqs.size(); i++) { //-1 means skip me if (seqs[i].seq1 != seqs[i].seq2) { - bool clustered = cluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist); + cluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist); float rndDist = roundDist(seqs[i].dist, precision); - if (clustered) { - if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){ - printData("unique"); - } - else if((rndDist != rndPreviousDist)){ - printData(toString(rndPreviousDist, length-1)); - } + if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){ + printData("unique"); } - + else if((rndDist != rndPreviousDist)){ + printData(toString(rndPreviousDist, length-1)); + } + previousDist = seqs[i].dist; rndPreviousDist = rndDist; oldRAbund = *rabund; @@ -212,8 +207,6 @@ int HClusterCommand::execute(){ } } } - - //in.close(); if(previousDist <= 0.0000){ printData("unique"); diff --git a/mgclustercommand.cpp b/mgclustercommand.cpp index fad0ed6..81e80cc 100644 --- a/mgclustercommand.cpp +++ b/mgclustercommand.cpp @@ -153,9 +153,9 @@ int MGClusterCommand::execute(){ delete read; //create cluster - if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, distMatrix); } - else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix); } - else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix); } + if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff); } + else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix, cutoff); } + else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix, cutoff); } cluster->setMapWanted(true); //cluster using cluster classes diff --git a/mothur.cpp b/mothur.cpp index 3bdbfcd..7fd5d1e 100644 --- a/mothur.cpp +++ b/mothur.cpp @@ -20,8 +20,11 @@ int main(int argc, char *argv[]){ try { //remove old logfile - string logFileName = "mothur.logFile"; - remove(logFileName.c_str()); + string log = "mothur.logFile"; + remove(log.c_str()); + + time_t ltime = time(NULL); /* calendar time */ + string logFileName = "mothur." + toString(asctime( localtime(<ime) )) + ".logfile"; //version #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) @@ -106,7 +109,9 @@ int main(int argc, char *argv[]){ while(bail == 0) { bail = mothur->getInput(); } delete mothur; - + + rename(log.c_str(), logFileName.c_str()); //logfile with timestamp + return 0; } catch(exception& e) { diff --git a/mothur.h b/mothur.h index 8f9f94f..8bd8d7f 100644 --- a/mothur.h +++ b/mothur.h @@ -72,7 +72,6 @@ using namespace std; #define isnan(x) ((x) != (x)) #define isinf(x) (fabs(x) == std::numeric_limits::infinity()) - typedef unsigned long ull; struct IntNode { diff --git a/readdistcommand.cpp b/readdistcommand.cpp index 1c134f5..9d38e76 100644 --- a/readdistcommand.cpp +++ b/readdistcommand.cpp @@ -208,7 +208,7 @@ int ReadDistCommand::execute(){ numDists = globaldata->gSparseMatrix->getNNodes(); //cout << "matrix contains " << numDists << " distances." << endl; - int lines = cutoff / (1.0/precision); + /* int lines = cutoff / (1.0/precision); vector dist_cutoff(lines+1,0); for (int i = 0; i <= lines;i++) { dist_cutoff[i] = (i + 0.5) / precision; @@ -224,7 +224,7 @@ int ReadDistCommand::execute(){ } } } - +*/ // string dist_string = "Dist:"; // string count_string = "Count: "; //for (int i = 0; i <= lines;i++) { diff --git a/readphylip.cpp b/readphylip.cpp index fe278ab..5f92275 100644 --- a/readphylip.cpp +++ b/readphylip.cpp @@ -59,13 +59,13 @@ void ReadPhylipMatrix::read(NameAssignment* nameMap){ } Progress* reading; - + if(square == 0){ reading = new Progress("Reading matrix: ", nseqs * (nseqs - 1) / 2); int index = 0; - + for(int i=1;i> name; matrixNames.push_back(name); @@ -156,7 +156,7 @@ void ReadPhylipMatrix::read(NameAssignment* nameMap){ list->setLabel("0"); fileHandle.close(); - if(nameMap != NULL){ + /* if(nameMap != NULL){ for(int i=0;ierase(matrixNames[i]); } @@ -164,7 +164,7 @@ void ReadPhylipMatrix::read(NameAssignment* nameMap){ //should probably tell them what is missing if we missed something mothurOut("missed something\t" + toString(nameMap->size())); mothurOutEndLine(); } - } + } */ } catch(exception& e) { diff --git a/sharedcommand.cpp b/sharedcommand.cpp index c827207..89f57eb 100644 --- a/sharedcommand.cpp +++ b/sharedcommand.cpp @@ -288,10 +288,10 @@ void SharedCommand::createMisMatchFile() { string name = names.substr(0,names.find_first_of(',')); names = names.substr(names.find_first_of(',')+1, names.length()); string group = groupMap->getGroup(name); - + cout << name << endl; if(group == "not found") { outMisMatch << name << endl; } } - + cout << names << endl; //get last name string group = groupMap->getGroup(names); if(group == "not found") { outMisMatch << names << endl; } diff --git a/singlelinkage.cpp b/singlelinkage.cpp index c27b14e..9f015db 100644 --- a/singlelinkage.cpp +++ b/singlelinkage.cpp @@ -5,8 +5,8 @@ /***********************************************************************/ -SingleLinkage::SingleLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm) : -Cluster(rav, lv, dm) +SingleLinkage::SingleLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm, float c) : +Cluster(rav, lv, dm, c) {}