From 68d53ac6682978c5a023c1a3a4e5df6f8302d9fa Mon Sep 17 00:00:00 2001 From: Sarah Westcott Date: Thu, 18 Jul 2013 13:16:16 -0700 Subject: [PATCH] added adjust parameter to mgcluster. fixed bug in classify.otu with countable. fixed bug in otu.heirarchy with otulabels. fixed bug with summary.seqs overflow when calculating the mean values. --- Mothur.xcodeproj/project.pbxproj | 2 +- aligncommand.cpp | 1 + averagelinkage.cpp | 4 +-- classifyotucommand.cpp | 7 +++--- cluster.cpp | 42 +++++++++++++++++++++++--------- cluster.hpp | 12 ++++----- clustercommand.cpp | 22 ++++++++++++----- clustercommand.h | 1 + clustersplitcommand.cpp | 7 +++--- completelinkage.cpp | 4 +-- mgclustercommand.cpp | 22 ++++++++++++----- mgclustercommand.h | 4 +-- otuhierarchycommand.cpp | 33 ++++++++----------------- screenseqscommand.h | 4 +-- seqsummarycommand.cpp | 11 +++++---- shhhercommand.cpp | 3 ++- singlelinkage.cpp | 4 +-- sparsedistancematrix.cpp | 39 +++++++++++++++++++++++++++++ sparsedistancematrix.h | 4 +++ weightedlinkage.cpp | 4 +-- 20 files changed, 153 insertions(+), 77 deletions(-) diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index b09bab4..0b945f5 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -377,7 +377,7 @@ outputFiles = ( "$(TARGET_BUILD_DIR)/$(INPUT_FILE_BASE).o", ); - script = "/usr/local/bin/gfortran -g -m64 -c ${PROJECT_DIR}/${INPUT_FILE_NAME} -o ${TARGET_BUILD_DIR}/${INPUT_FILE_BASE}.o"; + script = "/usr/local/gfortran/bin/gfortran -g -m64 -c ${PROJECT_DIR}/${INPUT_FILE_NAME} -o ${TARGET_BUILD_DIR}/${INPUT_FILE_BASE}.o"; }; /* End PBXBuildRule section */ diff --git a/aligncommand.cpp b/aligncommand.cpp index f9c0436..f757a79 100644 --- a/aligncommand.cpp +++ b/aligncommand.cpp @@ -558,6 +558,7 @@ int AlignCommand::driver(linePair* filePos, string alignFName, string reportFNam if (m->control_pressed) { break; } Sequence* candidateSeq = new Sequence(inFASTA); m->gobble(inFASTA); + cout << candidateSeq->getAligned() << endl; report.setCandidate(candidateSeq); int origNumBases = candidateSeq->getNumBases(); diff --git a/averagelinkage.cpp b/averagelinkage.cpp index e9ff3b3..8627253 100644 --- a/averagelinkage.cpp +++ b/averagelinkage.cpp @@ -11,8 +11,8 @@ /***********************************************************************/ -AverageLinkage::AverageLinkage(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string s) : -Cluster(rav, lv, dm, c, s) +AverageLinkage::AverageLinkage(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string s, float a) : +Cluster(rav, lv, dm, c, s, a) { saveRow = -1; saveCol = -1; diff --git a/classifyotucommand.cpp b/classifyotucommand.cpp index 170c234..76d7504 100644 --- a/classifyotucommand.cpp +++ b/classifyotucommand.cpp @@ -616,9 +616,10 @@ int ClassifyOtuCommand::process(ListVector* processList) { //add this bins taxonomy to summary if (basis == "sequence") { for(int j = 0; j < names.size(); j++) { - int numReps = 1; - if (countfile != "") { numReps = ct->getNumSeqs(names[j]); } - for(int k = 0; k < numReps; k++) { taxaSum->addSeqToTree(names[j], noConfidenceConTax); } + //int numReps = 1; + //if (countfile != "") { numReps = ct->getNumSeqs(names[j]); } + //for(int k = 0; k < numReps; k++) { taxaSum->addSeqToTree(names[j], noConfidenceConTax); } + taxaSum->addSeqToTree(names[j], noConfidenceConTax); } }else { //otu map containsGroup; diff --git a/cluster.cpp b/cluster.cpp index 0a70fbf..6b69e4d 100644 --- a/cluster.cpp +++ b/cluster.cpp @@ -13,8 +13,8 @@ /***********************************************************************/ -Cluster::Cluster(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string f) : -rabund(rav), list(lv), dMatrix(dm), method(f) +Cluster::Cluster(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string f, float cs) : +rabund(rav), list(lv), dMatrix(dm), method(f), adjust(cs) { try { @@ -85,7 +85,21 @@ void Cluster::update(double& cutOFF){ changed = updateDistance(dMatrix->seqVec[smallCol][j], dMatrix->seqVec[smallRow][i]); dMatrix->updateCellCompliment(smallCol, j); break; - }else if (dMatrix->seqVec[smallCol][j].index < search) { j+=nColCells; } //we don't have a distance for this cell + }else if (dMatrix->seqVec[smallCol][j].index < search) { //we don't have a distance for this cell + if (adjust != -1.0) { //adjust + merged = true; + PDistCell value(search, adjust); //create a distance for the missing value + int location = dMatrix->addCellSorted(smallCol, value); + changed = updateDistance(dMatrix->seqVec[smallCol][location], dMatrix->seqVec[smallRow][i]); + dMatrix->updateCellCompliment(smallCol, location); + nColCells++; + foundCol.push_back(0); //add a new found column + //adjust value + for (int k = foundCol.size()-1; k > location; k--) { foundCol[k] = foundCol[k-1]; } + foundCol[location] = 1; + } + j+=nColCells; + } } } //if not merged it you need it for warning @@ -105,14 +119,20 @@ void Cluster::update(double& cutOFF){ // Special handling for singlelinkage case, not sure whether this // could be avoided for (int i=nColCells-1;i>=0;i--) { - if (foundCol[i] == 0) { - if (method == "average" || method == "weighted") { - if (dMatrix->seqVec[smallCol][i].index != smallRow) { //if you are not hte smallest distance - if (cutOFF > dMatrix->seqVec[smallCol][i].dist) { - cutOFF = dMatrix->seqVec[smallCol][i].dist; - } - } - } + if (foundCol[i] == 0) { + if (adjust != -1.0) { //adjust + PDistCell value(smallCol, adjust); //create a distance for the missing value + changed = updateDistance(dMatrix->seqVec[smallCol][i], value); + dMatrix->updateCellCompliment(smallCol, i); + }else { + if (method == "average" || method == "weighted") { + if (dMatrix->seqVec[smallCol][i].index != smallRow) { //if you are not hte smallest distance + if (cutOFF > dMatrix->seqVec[smallCol][i].dist) { + cutOFF = dMatrix->seqVec[smallCol][i].dist; + } + } + } + } dMatrix->rmCell(smallCol, i); } } diff --git a/cluster.hpp b/cluster.hpp index 26a01b9..23a3d97 100644 --- a/cluster.hpp +++ b/cluster.hpp @@ -13,7 +13,7 @@ class ListVector; class Cluster { public: - Cluster(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string); + Cluster(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string, float); virtual ~Cluster() {} virtual void update(double&); virtual string getTag() = 0; @@ -33,7 +33,7 @@ protected: ull smallRow; ull smallCol; - float smallDist; + float smallDist, adjust; bool mapWanted; float cutoff; map seq2Bin; @@ -48,7 +48,7 @@ protected: class CompleteLinkage : public Cluster { public: - CompleteLinkage(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string); + CompleteLinkage(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string, float); bool updateDistance(PDistCell& colCell, PDistCell& rowCell); string getTag(); @@ -60,7 +60,7 @@ private: class SingleLinkage : public Cluster { public: - SingleLinkage(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string); + SingleLinkage(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string, float); void update(double&); bool updateDistance(PDistCell& colCell, PDistCell& rowCell); string getTag(); @@ -73,7 +73,7 @@ private: class AverageLinkage : public Cluster { public: - AverageLinkage(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string); + AverageLinkage(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string, float); bool updateDistance(PDistCell& colCell, PDistCell& rowCell); string getTag(); @@ -90,7 +90,7 @@ private: class WeightedLinkage : public Cluster { public: - WeightedLinkage(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string); + WeightedLinkage(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string, float); bool updateDistance(PDistCell& colCell, PDistCell& rowCell); string getTag(); diff --git a/clustercommand.cpp b/clustercommand.cpp index 2ac2eaa..9412965 100644 --- a/clustercommand.cpp +++ b/clustercommand.cpp @@ -29,6 +29,7 @@ vector ClusterCommand::setParameters(){ CommandParameter psim("sim", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(psim); CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); + //CommandParameter padjust("adjust", "String", "", "F", "", "", "","",false,false); parameters.push_back(padjust); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); vector myArray; @@ -45,7 +46,8 @@ string ClusterCommand::getHelpString(){ try { string helpString = ""; helpString += "The cluster command parameter options are phylip, column, name, count, method, cuttoff, hard, precision, sim, showabund and timing. Phylip or column and name are required, unless you have a valid current file.\n"; - helpString += "The cluster command should be in the following format: \n"; + //helpString += "The adjust parameter is used to handle missing distances. If you set a cutoff, adjust=f by default. If not, adjust=t by default. Adjust=f, means ignore missing distances and adjust cutoff as needed with the average neighbor method. Adjust=t, will treat missing distances as 1.0. You can also set the value the missing distances should be set to, adjust=0.5 would give missing distances a value of 0.5.\n"; + helpString += "The cluster command should be in the following format: \n"; helpString += "cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n"; helpString += "The acceptable cluster methods are furthest, nearest, average and weighted. If no method is provided then average is assumed.\n"; return helpString; @@ -229,10 +231,18 @@ ClusterCommand::ClusterCommand(string option) { temp = validParameter.validFile(parameters, "sim", false); if (temp == "not found") { temp = "F"; } sim = m->isTrue(temp); + //bool cutoffSet = false; temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; } + //else { cutoffSet = true; } m->mothurConvert(temp, cutoff); - cutoff += (5 / (precision * 10.0)); + cutoff += (5 / (precision * 10.0)); + + //temp = validParameter.validFile(parameters, "adjust", false); if (temp == "not found") { temp = "F"; } + //if (m->isNumeric1(temp)) { m->mothurConvert(temp, adjust); } + //else if (m->isTrue(temp)) { adjust = 1.0; } + //else { adjust = -1.0; } + adjust=-1.0; method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "average"; } @@ -325,10 +335,10 @@ int ClusterCommand::execute(){ } //create cluster - if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); } - else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); } - else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); } - else if(method == "weighted"){ cluster = new WeightedLinkage(rabund, list, matrix, cutoff, method); } + if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method, adjust); } + else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method, adjust); } + else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method, adjust); } + else if(method == "weighted"){ cluster = new WeightedLinkage(rabund, list, matrix, cutoff, method, adjust); } tag = cluster->getTag(); if (outputDir == "") { outputDir += m->hasPath(distfile); } diff --git a/clustercommand.h b/clustercommand.h index 96b7c08..5786da2 100644 --- a/clustercommand.h +++ b/clustercommand.h @@ -56,6 +56,7 @@ private: string method, fileroot, tag, outputDir, phylipfile, columnfile, namefile, format, distfile, countfile; double cutoff; + float adjust; string showabund, timing; int precision, length; ofstream sabundFile, rabundFile, listFile; diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp index 270ea62..b02bd20 100644 --- a/clustersplitcommand.cpp +++ b/clustersplitcommand.cpp @@ -1330,9 +1330,10 @@ string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine(); //create cluster - if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); } - else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); } - else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); } + float adjust = -1.0; + if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method, adjust); } + else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method, adjust); } + else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method, adjust); } tag = cluster->getTag(); if (outputDir == "") { outputDir += m->hasPath(thisDistFile); } diff --git a/completelinkage.cpp b/completelinkage.cpp index 06ed2db..0a3c7b3 100644 --- a/completelinkage.cpp +++ b/completelinkage.cpp @@ -3,8 +3,8 @@ /***********************************************************************/ -CompleteLinkage::CompleteLinkage(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string s) : - Cluster(rav, lv, dm, c, s) +CompleteLinkage::CompleteLinkage(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string s, float a) : + Cluster(rav, lv, dm, c, s, a) {} /***********************************************************************/ diff --git a/mgclustercommand.cpp b/mgclustercommand.cpp index e3287f7..97f0afd 100644 --- a/mgclustercommand.cpp +++ b/mgclustercommand.cpp @@ -23,6 +23,7 @@ vector MGClusterCommand::setParameters(){ CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard); CommandParameter pmin("min", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pmin); CommandParameter pmerge("merge", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pmerge); + CommandParameter padjust("adjust", "String", "", "F", "", "", "","",false,false); parameters.push_back(padjust); CommandParameter phcluster("hcluster", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(phcluster); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); @@ -40,7 +41,7 @@ vector MGClusterCommand::setParameters(){ string MGClusterCommand::getHelpString(){ try { string helpString = ""; - helpString += "The mgcluster command parameter options are blast, name, cutoff, precision, hard, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n"; + helpString += "The mgcluster command parameter options are blast, name, cutoff, precision, hard, method, merge, min, length, penalty, adjust and hcluster. The blast parameter is required.\n"; helpString += "The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n"; helpString += "This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n"; helpString += "The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n"; @@ -48,6 +49,7 @@ string MGClusterCommand::getHelpString(){ helpString += "The acceptable mgcluster methods are furthest, nearest and average. If no method is provided then average is assumed.\n"; helpString += "The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n"; helpString += "The length parameter is used to specify the minimum overlap required. The default is 5.\n"; + helpString += "The adjust parameter is used to handle missing distances. If you set a cutoff, adjust=f by default. If not, adjust=t by default. Adjust=f, means ignore missing distances and adjust cutoff as needed with the average neighbor method. Adjust=t, will treat missing distances as 1.0. You can also set the value the missing distances should be set to, adjust=0.5 would give missing distances a value of 0.5.\n"; helpString += "The penalty parameter is used to adjust the error rate. The default is 0.10.\n"; helpString += "The merge parameter allows you to shut off merging based on overlaps and just cluster. By default merge is true, meaning you want to merge.\n"; helpString += "The hcluster parameter allows you to use the hcluster algorithm when clustering. This may be neccessary if your file is too large to fit into RAM. The default is false.\n"; @@ -184,7 +186,10 @@ MGClusterCommand::MGClusterCommand(string option) { precisionLength = temp.length(); m->mothurConvert(temp, precision); - temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.70"; } + cutoffSet = false; + temp = validParameter.validFile(parameters, "cutoff", false); + if (temp == "not found") { temp = "0.70"; } + else { cutoffSet = true; } m->mothurConvert(temp, cutoff); cutoff += (5 / (precision * 10.0)); @@ -210,7 +215,12 @@ MGClusterCommand::MGClusterCommand(string option) { hclusterWanted = m->isTrue(temp); temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "T"; } - hard = m->isTrue(temp); + hard = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "adjust", false); if (temp == "not found") { if (cutoffSet) { temp = "F"; }else { temp="T"; } } + if (m->isNumeric1(temp)) { m->mothurConvert(temp, adjust); } + else if (m->isTrue(temp)) { adjust = 1.0; } + else { adjust = -1.0; } } } @@ -302,9 +312,9 @@ int MGClusterCommand::execute(){ delete read; //create cluster - if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff, method); } - else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method); } - else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method); } + if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff, method, adjust); } + else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method, adjust); } + else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method, adjust); } cluster->setMapWanted(true); Seq2Bin = cluster->getSeqtoBin(); oldSeq2Bin = Seq2Bin; diff --git a/mgclustercommand.h b/mgclustercommand.h index 008bd22..3865bb2 100644 --- a/mgclustercommand.h +++ b/mgclustercommand.h @@ -56,9 +56,9 @@ private: string blastfile, method, namefile, countfile, overlapFile, distFile, outputDir; ofstream sabundFile, rabundFile, listFile; double cutoff; - float penalty; + float penalty, adjust; int precision, length, precisionLength; - bool abort, minWanted, hclusterWanted, merge, hard; + bool abort, minWanted, hclusterWanted, merge, hard, cutoffSet; void printData(ListVector*); ListVector* mergeOPFs(map, float); diff --git a/otuhierarchycommand.cpp b/otuhierarchycommand.cpp index dc026de..a294a77 100644 --- a/otuhierarchycommand.cpp +++ b/otuhierarchycommand.cpp @@ -180,18 +180,10 @@ int OtuHierarchyCommand::execute(){ if (m->control_pressed) { return 0; } - string names = lists[0].get(i); - - //parse bin - while (names.find_first_of(',') != -1) { - string name = names.substr(0,names.find_first_of(',')); - names = names.substr(names.find_first_of(',')+1, names.length()); - littleBins[name] = i; - } - - //get last name - littleBins[names] = i; - } + string bin = lists[0].get(i); + vector names; m->splitAtComma(bin, names); + for (int j = 0; j < names.size(); j++) { littleBins[names[j]] = i; } + } ofstream out; map variables; @@ -207,24 +199,19 @@ int OtuHierarchyCommand::execute(){ if (m->control_pressed) { outputTypes.clear(); out.close(); m->mothurRemove(outputFileName); return 0; } - string names = lists[1].get(i); + string binnames = lists[1].get(i); + vector names; m->splitAtComma(binnames, names); + //output column 1 - if (output == "name") { out << names << '\t'; } - else { out << i << '\t'; } + if (output == "name") { out << binnames << '\t'; } + else { out << (i+1) << '\t'; } map bins; //bin numbers in little that are in this bin in big map::iterator it; //parse bin - while (names.find_first_of(',') != -1) { - string name = names.substr(0,names.find_first_of(',')); - names = names.substr(names.find_first_of(',')+1, names.length()); - bins[littleBins[name]] = littleBins[name]; - } - - //get last name - bins[littleBins[names]] = littleBins[names]; + for (int j = 0; j < names.size(); j++) { bins[littleBins[names[j]]] = littleBins[names[j]]; } string col2 = ""; for (it = bins.begin(); it != bins.end(); it++) { diff --git a/screenseqscommand.h b/screenseqscommand.h index aeaddae..18a55ac 100644 --- a/screenseqscommand.h +++ b/screenseqscommand.h @@ -74,8 +74,8 @@ private: bool abort; string fastafile, namefile, groupfile, alignreport, outputDir, qualfile, taxonomy, countfile, contigsreport, summaryfile; - int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, processors, criteria, minOverlap, oStart, oEnd, mismatches, maxN, maxInsert; - float minSim, minScore; + int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, processors, minOverlap, oStart, oEnd, mismatches, maxN, maxInsert; + float minSim, minScore, criteria; vector outputNames; vector optimize; map nameMap; diff --git a/seqsummarycommand.cpp b/seqsummarycommand.cpp index 27bb8d9..e6b037b 100644 --- a/seqsummarycommand.cpp +++ b/seqsummarycommand.cpp @@ -364,7 +364,7 @@ int SeqSummaryCommand::execute(){ int size = startPosition.size(); //find means - double meanStartPosition, meanEndPosition, meanSeqLength, meanAmbigBases, meanLongHomoPolymer; + unsigned long long meanStartPosition, meanEndPosition, meanSeqLength, meanAmbigBases, meanLongHomoPolymer; meanStartPosition = 0; meanEndPosition = 0; meanSeqLength = 0; meanAmbigBases = 0; meanLongHomoPolymer = 0; for (int i = 0; i < size; i++) { meanStartPosition += startPosition[i]; @@ -374,8 +374,9 @@ int SeqSummaryCommand::execute(){ meanLongHomoPolymer += longHomoPolymer[i]; } - //this is an int divide so the remainder is lost - meanStartPosition /= (float) size; meanEndPosition /= (float) size; meanLongHomoPolymer /= (float) size; meanSeqLength /= (float) size; meanAmbigBases /= (float) size; + double meanstartPosition, meanendPosition, meanseqLength, meanambigBases, meanlongHomoPolymer; + + meanstartPosition /= (double) size; meanendPosition /= (double) size; meanlongHomoPolymer /= (double) size; meanseqLength /= (double) size; meanambigBases /= (double) size; int ptile0_25 = int(size * 0.025); int ptile25 = int(size * 0.250); @@ -399,7 +400,7 @@ int SeqSummaryCommand::execute(){ m->mothurOut("75%-tile:\t" + toString(startPosition[ptile75]) + "\t" + toString(endPosition[ptile75]) + "\t" + toString(seqLength[ptile75]) + "\t" + toString(ambigBases[ptile75]) + "\t" + toString(longHomoPolymer[ptile75]) + "\t" + toString(ptile75+1)); m->mothurOutEndLine(); m->mothurOut("97.5%-tile:\t" + toString(startPosition[ptile97_5]) + "\t" + toString(endPosition[ptile97_5]) + "\t" + toString(seqLength[ptile97_5]) + "\t" + toString(ambigBases[ptile97_5]) + "\t" + toString(longHomoPolymer[ptile97_5]) + "\t" + toString(ptile97_5+1)); m->mothurOutEndLine(); m->mothurOut("Maximum:\t" + toString(startPosition[ptile100]) + "\t" + toString(endPosition[ptile100]) + "\t" + toString(seqLength[ptile100]) + "\t" + toString(ambigBases[ptile100]) + "\t" + toString(longHomoPolymer[ptile100]) + "\t" + toString(ptile100+1)); m->mothurOutEndLine(); - m->mothurOut("Mean:\t" + toString(meanStartPosition) + "\t" + toString(meanEndPosition) + "\t" + toString(meanSeqLength) + "\t" + toString(meanAmbigBases) + "\t" + toString(meanLongHomoPolymer)); m->mothurOutEndLine(); + m->mothurOut("Mean:\t" + toString(meanstartPosition) + "\t" + toString(meanendPosition) + "\t" + toString(meanseqLength) + "\t" + toString(meanambigBases) + "\t" + toString(meanlongHomoPolymer)); m->mothurOutEndLine(); if ((namefile == "") && (countfile == "")) { m->mothurOut("# of Seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); } else { m->mothurOut("# of unique seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); m->mothurOut("total # of seqs:\t" + toString(startPosition.size())); m->mothurOutEndLine(); } @@ -543,7 +544,7 @@ int SeqSummaryCommand::MPICreateSummary(int start, int num, vector& startPo } //for each sequence this sequence represents - for (int i = 0; i < num; i++) { + for (int j = 0; j < num; j++) { startPosition.push_back(current.getStartPos()); endPosition.push_back(current.getEndPos()); seqLength.push_back(current.getNumBases()); diff --git a/shhhercommand.cpp b/shhhercommand.cpp index 9bd437a..b25b9de 100644 --- a/shhhercommand.cpp +++ b/shhhercommand.cpp @@ -2727,7 +2727,8 @@ int ShhherCommand::cluster(string filename, string distFileName, string namesFil RAbundVector* rabund = new RAbundVector(list->getRAbundVector()); - Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); + float adjust = -1.0; + Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest", adjust); string tag = cluster->getTag(); double clusterCutoff = cutoff; diff --git a/singlelinkage.cpp b/singlelinkage.cpp index 3af1ea0..3bed931 100644 --- a/singlelinkage.cpp +++ b/singlelinkage.cpp @@ -5,8 +5,8 @@ /***********************************************************************/ -SingleLinkage::SingleLinkage(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string s) : -Cluster(rav, lv, dm, c, s) +SingleLinkage::SingleLinkage(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string s, float a) : +Cluster(rav, lv, dm, c, s, a) {} diff --git a/sparsedistancematrix.cpp b/sparsedistancematrix.cpp index 03e9fa8..417e24a 100644 --- a/sparsedistancematrix.cpp +++ b/sparsedistancematrix.cpp @@ -89,6 +89,30 @@ void SparseDistanceMatrix::addCell(ull row, PDistCell cell){ exit(1); } } +/***********************************************************************/ +int SparseDistanceMatrix::addCellSorted(ull row, PDistCell cell){ + try { + numNodes+=2; + if(cell.dist < smallDist){ smallDist = cell.dist; } + + seqVec[row].push_back(cell); + PDistCell temp(row, cell.dist); + seqVec[cell.index].push_back(temp); + + sortSeqVec(row); + sortSeqVec(cell.index); + + int location = -1; //find location of new cell when sorted + for (int i = 0; i < seqVec[row].size(); i++) { if (seqVec[row][i].index == cell.index) { location = i; break; } } + + return location; + } + catch(exception& e) { + m->errorOut(e, "SparseDistanceMatrix", "addCellSorted"); + exit(1); + } +} + /***********************************************************************/ ull SparseDistanceMatrix::getSmallestCell(ull& row){ @@ -151,3 +175,18 @@ int SparseDistanceMatrix::sortSeqVec(){ } /***********************************************************************/ +int SparseDistanceMatrix::sortSeqVec(int index){ + try { + + //saves time in getSmallestCell, by making it so you dont search the repeats + sort(seqVec[index].begin(), seqVec[index].end(), compareIndexes); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SparseDistanceMatrix", "sortSeqVec"); + exit(1); + } +} +/***********************************************************************/ + diff --git a/sparsedistancematrix.h b/sparsedistancematrix.h index f18fdd7..6c6bbe5 100644 --- a/sparsedistancematrix.h +++ b/sparsedistancematrix.h @@ -46,17 +46,21 @@ public: void resize(ull n) { seqVec.resize(n); } void clear(); void addCell(ull, PDistCell); + int addCellSorted(ull, PDistCell); vector > seqVec; + private: PDistCell smallCell; //The cell with the smallest distance int numNodes; bool sorted; int sortSeqVec(); + int sortSeqVec(int); float smallDist, aboveCutoff; MothurOut* m; + }; /***********************************************************************/ diff --git a/weightedlinkage.cpp b/weightedlinkage.cpp index c1e4d51..b79986b 100644 --- a/weightedlinkage.cpp +++ b/weightedlinkage.cpp @@ -10,8 +10,8 @@ /***********************************************************************/ -WeightedLinkage::WeightedLinkage(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string s) : - Cluster(rav, lv, dm, c, s) +WeightedLinkage::WeightedLinkage(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string s, float a) : + Cluster(rav, lv, dm, c, s, a) { saveRow = -1; saveCol = -1; -- 2.39.2