From: westcott Date: Fri, 28 May 2010 12:20:41 +0000 (+0000) Subject: changed hard parameter in cluster commands X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=47faf5463d44570ad66148384763db1c8238b563 changed hard parameter in cluster commands --- diff --git a/clustercommand.cpp b/clustercommand.cpp index 9ed67e0..30c45e1 100644 --- a/clustercommand.cpp +++ b/clustercommand.cpp @@ -62,7 +62,7 @@ ClusterCommand::ClusterCommand(string option) { temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; } convert(temp, cutoff); - if (!hard) { cutoff += (5 / (precision * 10.0)); } + cutoff += (5 / (precision * 10.0)); method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "furthest"; } @@ -181,7 +181,12 @@ int ClusterCommand::execute(){ cluster->update(cutoff); float dist = matrix->getSmallDist(); - float rndDist = roundDist(dist, precision); + float rndDist; + if (hard) { + rndDist = ceilDist(dist, precision); + }else{ + rndDist = roundDist(dist, precision); + } if(previousDist <= 0.0000 && dist != previousDist){ printData("unique"); diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp index e9ab530..4b3c27b 100644 --- a/clustersplitcommand.cpp +++ b/clustersplitcommand.cpp @@ -27,7 +27,7 @@ ClusterSplitCommand::ClusterSplitCommand(string option) { else { //valid paramters for this command - string Array[] = {"phylip","column","name","cutoff","precision","method","showabund","timing","hard","processors","outputdir","inputdir"}; + string Array[] = {"phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","showabund","timing","hard","processors","outputdir","inputdir"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -76,6 +76,14 @@ ClusterSplitCommand::ClusterSplitCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("taxonomy"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["taxonomy"] = inputDir + it->second; } + } } //check for required parameters @@ -93,11 +101,15 @@ ClusterSplitCommand::ClusterSplitCommand(string option) { if (namefile == "not open") { abort = true; } else if (namefile == "not found") { namefile = ""; } + taxFile = validParameter.validFile(parameters, "taxonomy", true); + if (taxFile == "not open") { abort = true; } + else if (taxFile == "not found") { taxFile = ""; } + if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; } else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; } if (columnfile != "") { - if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; } + if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; } } //check for optional parameter and set defaults @@ -119,13 +131,19 @@ ClusterSplitCommand::ClusterSplitCommand(string option) { temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; } convert(temp, cutoff); - if (!hard) { cutoff += (5 / (precision * 10.0)); } + cutoff += (5 / (precision * 10.0)); - method = validParameter.validFile(parameters, "method", false); - if (method == "not found") { method = "furthest"; } + method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "furthest"; } + + splitmethod = validParameter.validFile(parameters, "splitmethod", false); if (splitmethod == "not found") { method = "distance"; } if ((method == "furthest") || (method == "nearest") || (method == "average")) { } else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; } + + if ((splitmethod == "distance") || (splitmethod == "classify")) { } + else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance or classify."); m->mothurOutEndLine(); abort = true; } + + if ((splitmethod == "classify") && (taxFile == "")) { m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true; } showabund = validParameter.validFile(parameters, "showabund", false); if (showabund == "not found") { showabund = "T"; } @@ -145,11 +163,13 @@ ClusterSplitCommand::ClusterSplitCommand(string option) { void ClusterSplitCommand::help(){ try { - m->mothurOut("The cluster command can only be executed after a successful read.dist command.\n"); - m->mothurOut("The cluster command parameter options are method, cuttoff, hard, precision, showabund and timing. No parameters are required.\n"); - m->mothurOut("The cluster command should be in the following format: \n"); - m->mothurOut("cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n"); - m->mothurOut("The acceptable cluster methods are furthest, nearest and average. If no method is provided then furthest is assumed.\n\n"); + m->mothurOut("The cluster.split command parameter options are cutoff, splitcutoff, precision, method, splitmethod, phylip, column, name, showabund, timing. Phylip or column and name are required.\n"); + m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n"); + m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n"); + m->mothurOut("The cluster.split command should be in the following format: \n"); + m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n"); + m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify) \n"); + } catch(exception& e) { m->errorOut(e, "ClusterSplitCommand", "help"); @@ -203,7 +223,7 @@ int ClusterSplitCommand::execute(){ time_t estart = time(NULL); //split matrix into non-overlapping groups - SplitMatrix* split = new SplitMatrix(distfile, namefile, cutoff); + SplitMatrix* split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod); split->split(); if (m->control_pressed) { delete split; return 0; } @@ -212,11 +232,11 @@ int ClusterSplitCommand::execute(){ vector< map > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size delete split; + if (m->control_pressed) { return 0; } + m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine(); estart = time(NULL); - if (m->control_pressed) { return 0; } - //****************** break up files between processes and cluster each file set ******************************// vector listFileNames; set labels; @@ -586,7 +606,12 @@ vector ClusterSplitCommand::cluster(vector< map > distNa cluster->update(cutoff); float dist = matrix->getSmallDist(); - float rndDist = roundDist(dist, precision); + float rndDist; + if (hard) { + rndDist = ceilDist(dist, precision); + }else{ + rndDist = roundDist(dist, precision); + } if(previousDist <= 0.0000 && dist != previousDist){ oldList.setLabel("unique"); diff --git a/clustersplitcommand.h b/clustersplitcommand.h index 4d1f435..631a8b6 100644 --- a/clustersplitcommand.h +++ b/clustersplitcommand.h @@ -32,7 +32,7 @@ private: vector processIDS; //processid vector outputNames; - string method, fileroot, tag, outputDir, phylipfile, columnfile, namefile, distfile, format, showabund, timing; + string method, fileroot, tag, outputDir, phylipfile, columnfile, namefile, distfile, format, showabund, timing, splitmethod, taxFile; double cutoff, splitcutoff; int precision, length, processors; bool print_start, abort, hard; diff --git a/hclustercommand.cpp b/hclustercommand.cpp index dc14e16..b4d6017 100644 --- a/hclustercommand.cpp +++ b/hclustercommand.cpp @@ -110,7 +110,7 @@ HClusterCommand::HClusterCommand(string option) { temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; } convert(temp, cutoff); - if (!hard) { cutoff += (5 / (precision * 10.0)); } + cutoff += (5 / (precision * 10.0)); method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "furthest"; } @@ -163,7 +163,7 @@ void HClusterCommand::help(){ m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n"); m->mothurOut("The hcluster command should be in the following format: \n"); m->mothurOut("hcluster(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n"); - m->mothurOut("The acceptable hcluster methods are furthest and nearest, but we hope to add average in the future.\n\n"); + m->mothurOut("The acceptable hcluster methods are furthest, nearest and average.\n\n"); } catch(exception& e) { m->errorOut(e, "HClusterCommand", "help"); @@ -279,8 +279,14 @@ int HClusterCommand::execute(){ return 0; } - - float rndDist = roundDist(seqs[i].dist, precision); + + float rndDist; + if (hard) { + rndDist = ceilDist(seqs[i].dist, precision); + }else{ + rndDist = roundDist(seqs[i].dist, precision); + } + if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){ printData("unique"); diff --git a/makefile b/makefile index fdcef27..f50b554 100644 --- a/makefile +++ b/makefile @@ -1691,7 +1691,7 @@ install : mothur $(CC) $(CC_OPTIONS) splitabundcommand.cpp -c $(INCLUDE) -o ./splitabundcommand.o # Item # 206 -- splitmatrix -- -./splitmatrix.o : splitmatrix.o +./splitmatrix.o : splitmatrix.cpp $(CC) $(CC_OPTIONS) splitmatrix.cpp -c $(INCLUDE) -o ./splitmatrix.o # Item # 207 -- splitmatrix -- diff --git a/mgclustercommand.cpp b/mgclustercommand.cpp index 6eb9c7d..023f214 100644 --- a/mgclustercommand.cpp +++ b/mgclustercommand.cpp @@ -20,7 +20,7 @@ MGClusterCommand::MGClusterCommand(string option) { else { //valid paramters for this command - string Array[] = {"blast", "method", "name", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge","outputdir","inputdir"}; + string Array[] = {"blast", "method", "name", "hard", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge","outputdir","inputdir"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -104,6 +104,9 @@ MGClusterCommand::MGClusterCommand(string option) { temp = validParameter.validFile(parameters, "hcluster", false); if (temp == "not found") { temp = "false"; } hclusterWanted = isTrue(temp); + + temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; } + hard = isTrue(temp); } } @@ -214,7 +217,13 @@ int MGClusterCommand::execute(){ } float dist = distMatrix->getSmallDist(); - float rndDist = roundDist(dist, precision); + float rndDist; + if (hard) { + rndDist = ceilDist(dist, precision); + }else{ + rndDist = roundDist(dist, precision); + } + if(previousDist <= 0.0000 && dist != previousDist){ oldList.setLabel("unique"); @@ -329,7 +338,12 @@ int MGClusterCommand::execute(){ return 0; } - float rndDist = roundDist(seqs[i].dist, precision); + float rndDist; + if (hard) { + rndDist = ceilDist(seqs[i].dist, precision); + }else{ + rndDist = roundDist(seqs[i].dist, precision); + } if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){ oldList.setLabel("unique"); diff --git a/mgclustercommand.h b/mgclustercommand.h index a1dac22..4ef0bd9 100644 --- a/mgclustercommand.h +++ b/mgclustercommand.h @@ -43,7 +43,7 @@ private: double cutoff; float penalty; int precision, length, precisionLength; - bool abort, minWanted, hclusterWanted, merge; + bool abort, minWanted, hclusterWanted, merge, hard; void printData(ListVector*); ListVector* mergeOPFs(map, float); diff --git a/mothur.h b/mothur.h index ca19aea..5d12977 100644 --- a/mothur.h +++ b/mothur.h @@ -254,6 +254,13 @@ inline float roundDist(float dist, int precision){ return int(dist * precision + 0.5)/float(precision); +} +/***********************************************************************/ + +inline float ceilDist(float dist, int precision){ + + return int(ceil(dist * precision))/float(precision); + } /***********************************************************************/ diff --git a/phylotree.h b/phylotree.h index a961721..50bbb1d 100644 --- a/phylotree.h +++ b/phylotree.h @@ -45,10 +45,12 @@ public: TaxNode get(string seqName); string getName(int i); int getIndex(string seqName); - string getFullTaxonomy(string); //pass a sequence name return taxonomy - int getMaxLevel() { return maxLevel; } - int getNumSeqs() { return numSeqs; } + + int getMaxLevel() { return maxLevel; } + int getNumSeqs() { return numSeqs; } + int getNumNodes() { return tree.size(); } + bool ErrorCheck(vector); private: diff --git a/readdistcommand.cpp b/readdistcommand.cpp index 414f330..4ea630f 100644 --- a/readdistcommand.cpp +++ b/readdistcommand.cpp @@ -22,7 +22,7 @@ ReadDistCommand::ReadDistCommand(string option) { else { //valid paramters for this command - string Array[] = {"phylip", "column", "name", "cutoff","hard", "precision", "group","outputdir","inputdir","sim"}; + string Array[] = {"phylip", "column", "name", "cutoff", "precision", "group","outputdir","inputdir","sim"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -126,12 +126,9 @@ ReadDistCommand::ReadDistCommand(string option) { sim = isTrue(temp); globaldata->sim = sim; - temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; } - hard = isTrue(temp); - temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; } convert(temp, cutoff); - if (!hard) { cutoff += (5 / (precision * 10.0)); } + cutoff += (5 / (precision * 10.0)); if (abort == false) { distFileName = globaldata->inputFileName; diff --git a/readdistcommand.h b/readdistcommand.h index 6241f97..937ca3f 100644 --- a/readdistcommand.h +++ b/readdistcommand.h @@ -42,7 +42,7 @@ private: string phylipfile, columnfile, namefile, groupfile, outputDir; NameAssignment* nameMap; - bool abort, sim, hard; + bool abort, sim; }; diff --git a/splitmatrix.cpp b/splitmatrix.cpp index c52c287..0b59fa2 100644 --- a/splitmatrix.cpp +++ b/splitmatrix.cpp @@ -8,19 +8,44 @@ */ #include "splitmatrix.h" +#include "phylotree.h" /***********************************************************************/ -SplitMatrix::SplitMatrix(string distfile, string name, float c){ +SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t){ m = MothurOut::getInstance(); distFile = distfile; cutoff = c; namefile = name; + method = t; + taxFile = tax; } /***********************************************************************/ int SplitMatrix::split(){ + try { + + if (method == "distance") { + splitDistance(); + }else if (method == "classify") { + splitClassify(); + }else { + m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine(); + map temp; + temp[distFile] = namefile; + dists.push_back(temp); + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SplitMatrix", "split"); + exit(1); + } +} +/***********************************************************************/ +int SplitMatrix::splitDistance(){ try { vector > groups; @@ -36,6 +61,8 @@ int SplitMatrix::split(){ dFile >> seqA >> seqB >> dist; + if (m->control_pressed) { outFile.close(); dFile.close(); for(int i=0;i 0){ remove((distFile + "." + toString(i) + ".temp").c_str()); } } return 0; } + if(dist < cutoff){ //cout << "in cutoff: " << dist << endl; int groupIDA = -1; @@ -172,7 +199,7 @@ int SplitMatrix::split(){ smallNameFile.close(); } } - + //names of singletons if (nameMap.size() != 0) { singleton = namefile + ".extra.temp"; @@ -193,12 +220,160 @@ int SplitMatrix::split(){ dists.push_back(temp); } } + + if (m->control_pressed) { + for (int i = 0; i < dists.size(); i++) { + remove((dists[i].begin()->first).c_str()); + remove((dists[i].begin()->second).c_str()); + } + dists.clear(); + } + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "SplitMatrix", "splitDistance"); + exit(1); + } +} + +/***********************************************************************/ +int SplitMatrix::splitClassify(){ + try { + map seqGroup; + map::iterator it; + map::iterator it2; + + int numGroups = 0; + + //build tree from users taxonomy file + PhyloTree* phylo = new PhyloTree(); + + ifstream in; + openInputFile(taxFile, in); + + //read in users taxonomy file and add sequences to tree + string seqname, tax; + while(!in.eof()){ + in >> seqname >> tax; gobble(in); + + phylo->addSeqToTree(seqname, tax); + } + in.close(); + + phylo->assignHeirarchyIDs(0); + + //make sure the cutoff is not greater than maxlevel + if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); } + + //for each node in tree + for (int i = 0; i < phylo->getNumNodes(); i++) { + + //is this node within the cutoff + TaxNode taxon = phylo->get(i); + + if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences + if (taxon.children.size() > 1) { //if this taxon just has one seq its a singleton + for (it = taxon.children.begin(); it != taxon.children.end(); it++) { + seqGroup[it->first] = numGroups; + } + numGroups++; + } + } + } + + ifstream dFile; + openInputFile(distFile, dFile); + ofstream outFile; + + for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case + remove((distFile + "." + toString(i) + ".temp").c_str()); + } + + //for each distance + while(dFile){ + string seqA, seqB; + float dist; + + if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str()); } } + + dFile >> seqA >> seqB >> dist; gobble(dFile); + + //if both sequences are in the same group then they are within the cutoff + it = seqGroup.find(seqA); + it2 = seqGroup.find(seqB); + + if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons + if (it->second == it2->second) { //they are from the same group so add the distance + openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile); + outFile << seqA << '\t' << seqB << '\t' << dist << endl; + outFile.close(); + } + } + } + dFile.close(); + + + for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case + remove((namefile + "." + toString(i) + ".temp").c_str()); + } + + ifstream bigNameFile; + openInputFile(namefile, bigNameFile); + + singleton = namefile + ".extra.temp"; + ofstream remainingNames; + openOutputFile(singleton, remainingNames); + + bool wroteExtra = false; + + string name, nameList; + while(!bigNameFile.eof()){ + bigNameFile >> name >> nameList; gobble(bigNameFile); + + //did this sequence get assigned a group + it = seqGroup.find(name); + + if (it != seqGroup.end()) { + openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile); + outFile << name << '\t' << nameList << endl; + outFile.close(); + }else{ + wroteExtra = true; + remainingNames << name << '\t' << nameList << endl; + } + } + bigNameFile.close(); + remainingNames.close(); + + if (!wroteExtra) { + remove(singleton.c_str()); + singleton = "none"; + } + + for(int i=0;i temp; + temp[tempDistFile] = tempNameFile; + dists.push_back(temp); + } + + if (m->control_pressed) { + for (int i = 0; i < dists.size(); i++) { + remove((dists[i].begin()->first).c_str()); + remove((dists[i].begin()->second).c_str()); + } + dists.clear(); + } + return 0; } catch(exception& e) { - m->errorOut(e, "SplitMatrix", "split"); + m->errorOut(e, "SplitMatrix", "splitClassify"); exit(1); } } diff --git a/splitmatrix.h b/splitmatrix.h index 5974ff1..e38f8fc 100644 --- a/splitmatrix.h +++ b/splitmatrix.h @@ -19,18 +19,21 @@ class SplitMatrix { public: - SplitMatrix(string, string, float); //column formatted distance file, namesfile, cutoff + SplitMatrix(string, string, string, float, string); //column formatted distance file, namesfile, cutoff, method ~SplitMatrix(); int split(); vector< map > getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size string getSingletonNames() { return singleton; } //returns namesfile containing singletons private: - string distFile, namefile, singleton; + MothurOut* m; + + string distFile, namefile, singleton, method, taxFile; vector< map< string, string> > dists; float cutoff; - - MothurOut* m; + + int splitDistance(); + int splitClassify(); }; /******************************************************/