temp = validParameter.validFile(parameters, "cutoff", false);
if (temp == "not found") { temp = "10"; }
convert(temp, cutoff);
- if (!hard) { cutoff += (5 / (precision * 10.0)); }
+ cutoff += (5 / (precision * 10.0));
method = validParameter.validFile(parameters, "method", false);
if (method == "not found") { method = "furthest"; }
cluster->update(cutoff);
float dist = matrix->getSmallDist();
- float rndDist = roundDist(dist, precision);
+ float rndDist;
+ if (hard) {
+ rndDist = ceilDist(dist, precision);
+ }else{
+ rndDist = roundDist(dist, precision);
+ }
if(previousDist <= 0.0000 && dist != previousDist){
printData("unique");
else {
//valid paramters for this command
- string Array[] = {"phylip","column","name","cutoff","precision","method","showabund","timing","hard","processors","outputdir","inputdir"};
+ string Array[] = {"phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","showabund","timing","hard","processors","outputdir","inputdir"};
vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
OptionParser parser(option);
//if the user has not given a path then, add inputdir. else leave path alone.
if (path == "") { parameters["name"] = inputDir + it->second; }
}
+
+ it = parameters.find("taxonomy");
+ //user has given a template file
+ if(it != parameters.end()){
+ path = hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
+ }
}
//check for required parameters
if (namefile == "not open") { abort = true; }
else if (namefile == "not found") { namefile = ""; }
+ taxFile = validParameter.validFile(parameters, "taxonomy", true);
+ if (taxFile == "not open") { abort = true; }
+ else if (taxFile == "not found") { taxFile = ""; }
+
if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; }
else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
if (columnfile != "") {
- if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }
+ if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
}
//check for optional parameter and set defaults
temp = validParameter.validFile(parameters, "cutoff", false);
if (temp == "not found") { temp = "10"; }
convert(temp, cutoff);
- if (!hard) { cutoff += (5 / (precision * 10.0)); }
+ cutoff += (5 / (precision * 10.0));
- method = validParameter.validFile(parameters, "method", false);
- if (method == "not found") { method = "furthest"; }
+ method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "furthest"; }
+
+ splitmethod = validParameter.validFile(parameters, "splitmethod", false); if (splitmethod == "not found") { method = "distance"; }
if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
+
+ if ((splitmethod == "distance") || (splitmethod == "classify")) { }
+ else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance or classify."); m->mothurOutEndLine(); abort = true; }
+
+ if ((splitmethod == "classify") && (taxFile == "")) { m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true; }
showabund = validParameter.validFile(parameters, "showabund", false);
if (showabund == "not found") { showabund = "T"; }
void ClusterSplitCommand::help(){
try {
- m->mothurOut("The cluster command can only be executed after a successful read.dist command.\n");
- m->mothurOut("The cluster command parameter options are method, cuttoff, hard, precision, showabund and timing. No parameters are required.\n");
- m->mothurOut("The cluster command should be in the following format: \n");
- m->mothurOut("cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n");
- m->mothurOut("The acceptable cluster methods are furthest, nearest and average. If no method is provided then furthest is assumed.\n\n");
+ m->mothurOut("The cluster.split command parameter options are cutoff, splitcutoff, precision, method, splitmethod, phylip, column, name, showabund, timing. Phylip or column and name are required.\n");
+ m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
+ m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
+ m->mothurOut("The cluster.split command should be in the following format: \n");
+ m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n");
+ m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify) \n");
+
}
catch(exception& e) {
m->errorOut(e, "ClusterSplitCommand", "help");
time_t estart = time(NULL);
//split matrix into non-overlapping groups
- SplitMatrix* split = new SplitMatrix(distfile, namefile, cutoff);
+ SplitMatrix* split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod);
split->split();
if (m->control_pressed) { delete split; return 0; }
vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
delete split;
+ if (m->control_pressed) { return 0; }
+
m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
estart = time(NULL);
- if (m->control_pressed) { return 0; }
-
//****************** break up files between processes and cluster each file set ******************************//
vector<string> listFileNames;
set<string> labels;
cluster->update(cutoff);
float dist = matrix->getSmallDist();
- float rndDist = roundDist(dist, precision);
+ float rndDist;
+ if (hard) {
+ rndDist = ceilDist(dist, precision);
+ }else{
+ rndDist = roundDist(dist, precision);
+ }
if(previousDist <= 0.0000 && dist != previousDist){
oldList.setLabel("unique");
vector<int> processIDS; //processid
vector<string> outputNames;
- string method, fileroot, tag, outputDir, phylipfile, columnfile, namefile, distfile, format, showabund, timing;
+ string method, fileroot, tag, outputDir, phylipfile, columnfile, namefile, distfile, format, showabund, timing, splitmethod, taxFile;
double cutoff, splitcutoff;
int precision, length, processors;
bool print_start, abort, hard;
temp = validParameter.validFile(parameters, "cutoff", false);
if (temp == "not found") { temp = "10"; }
convert(temp, cutoff);
- if (!hard) { cutoff += (5 / (precision * 10.0)); }
+ cutoff += (5 / (precision * 10.0));
method = validParameter.validFile(parameters, "method", false);
if (method == "not found") { method = "furthest"; }
m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
m->mothurOut("The hcluster command should be in the following format: \n");
m->mothurOut("hcluster(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n");
- m->mothurOut("The acceptable hcluster methods are furthest and nearest, but we hope to add average in the future.\n\n");
+ m->mothurOut("The acceptable hcluster methods are furthest, nearest and average.\n\n");
}
catch(exception& e) {
m->errorOut(e, "HClusterCommand", "help");
return 0;
}
-
- float rndDist = roundDist(seqs[i].dist, precision);
+
+ float rndDist;
+ if (hard) {
+ rndDist = ceilDist(seqs[i].dist, precision);
+ }else{
+ rndDist = roundDist(seqs[i].dist, precision);
+ }
+
if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
printData("unique");
$(CC) $(CC_OPTIONS) splitabundcommand.cpp -c $(INCLUDE) -o ./splitabundcommand.o\r
\r
# Item # 206 -- splitmatrix --\r
-./splitmatrix.o : splitmatrix.o\r
+./splitmatrix.o : splitmatrix.cpp\r
$(CC) $(CC_OPTIONS) splitmatrix.cpp -c $(INCLUDE) -o ./splitmatrix.o\r
\r
# Item # 207 -- splitmatrix --\r
else {
//valid paramters for this command
- string Array[] = {"blast", "method", "name", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge","outputdir","inputdir"};
+ string Array[] = {"blast", "method", "name", "hard", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge","outputdir","inputdir"};
vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
OptionParser parser(option);
temp = validParameter.validFile(parameters, "hcluster", false); if (temp == "not found") { temp = "false"; }
hclusterWanted = isTrue(temp);
+
+ temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; }
+ hard = isTrue(temp);
}
}
}
float dist = distMatrix->getSmallDist();
- float rndDist = roundDist(dist, precision);
+ float rndDist;
+ if (hard) {
+ rndDist = ceilDist(dist, precision);
+ }else{
+ rndDist = roundDist(dist, precision);
+ }
+
if(previousDist <= 0.0000 && dist != previousDist){
oldList.setLabel("unique");
return 0;
}
- float rndDist = roundDist(seqs[i].dist, precision);
+ float rndDist;
+ if (hard) {
+ rndDist = ceilDist(seqs[i].dist, precision);
+ }else{
+ rndDist = roundDist(seqs[i].dist, precision);
+ }
if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
oldList.setLabel("unique");
double cutoff;
float penalty;
int precision, length, precisionLength;
- bool abort, minWanted, hclusterWanted, merge;
+ bool abort, minWanted, hclusterWanted, merge, hard;
void printData(ListVector*);
ListVector* mergeOPFs(map<string, int>, float);
return int(dist * precision + 0.5)/float(precision);
+}
+/***********************************************************************/
+
+inline float ceilDist(float dist, int precision){
+
+ return int(ceil(dist * precision))/float(precision);
+
}
/***********************************************************************/
TaxNode get(string seqName);
string getName(int i);
int getIndex(string seqName);
-
string getFullTaxonomy(string); //pass a sequence name return taxonomy
- int getMaxLevel() { return maxLevel; }
- int getNumSeqs() { return numSeqs; }
+
+ int getMaxLevel() { return maxLevel; }
+ int getNumSeqs() { return numSeqs; }
+ int getNumNodes() { return tree.size(); }
+
bool ErrorCheck(vector<string>);
private:
else {
//valid paramters for this command
- string Array[] = {"phylip", "column", "name", "cutoff","hard", "precision", "group","outputdir","inputdir","sim"};
+ string Array[] = {"phylip", "column", "name", "cutoff", "precision", "group","outputdir","inputdir","sim"};
vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
OptionParser parser(option);
sim = isTrue(temp);
globaldata->sim = sim;
- temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; }
- hard = isTrue(temp);
-
temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; }
convert(temp, cutoff);
- if (!hard) { cutoff += (5 / (precision * 10.0)); }
+ cutoff += (5 / (precision * 10.0));
if (abort == false) {
distFileName = globaldata->inputFileName;
string phylipfile, columnfile, namefile, groupfile, outputDir;
NameAssignment* nameMap;
- bool abort, sim, hard;
+ bool abort, sim;
};
*/
#include "splitmatrix.h"
+#include "phylotree.h"
/***********************************************************************/
-SplitMatrix::SplitMatrix(string distfile, string name, float c){
+SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t){
m = MothurOut::getInstance();
distFile = distfile;
cutoff = c;
namefile = name;
+ method = t;
+ taxFile = tax;
}
/***********************************************************************/
int SplitMatrix::split(){
+ try {
+
+ if (method == "distance") {
+ splitDistance();
+ }else if (method == "classify") {
+ splitClassify();
+ }else {
+ m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
+ map<string, string> temp;
+ temp[distFile] = namefile;
+ dists.push_back(temp);
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SplitMatrix", "split");
+ exit(1);
+ }
+}
+/***********************************************************************/
+int SplitMatrix::splitDistance(){
try {
vector<set<string> > groups;
dFile >> seqA >> seqB >> dist;
+ if (m->control_pressed) { outFile.close(); dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ remove((distFile + "." + toString(i) + ".temp").c_str()); } } return 0; }
+
if(dist < cutoff){
//cout << "in cutoff: " << dist << endl;
int groupIDA = -1;
smallNameFile.close();
}
}
-
+
//names of singletons
if (nameMap.size() != 0) {
singleton = namefile + ".extra.temp";
dists.push_back(temp);
}
}
+
+ if (m->control_pressed) {
+ for (int i = 0; i < dists.size(); i++) {
+ remove((dists[i].begin()->first).c_str());
+ remove((dists[i].begin()->second).c_str());
+ }
+ dists.clear();
+ }
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SplitMatrix", "splitDistance");
+ exit(1);
+ }
+}
+
+/***********************************************************************/
+int SplitMatrix::splitClassify(){
+ try {
+ map<string, int> seqGroup;
+ map<string, int>::iterator it;
+ map<string, int>::iterator it2;
+
+ int numGroups = 0;
+
+ //build tree from users taxonomy file
+ PhyloTree* phylo = new PhyloTree();
+
+ ifstream in;
+ openInputFile(taxFile, in);
+
+ //read in users taxonomy file and add sequences to tree
+ string seqname, tax;
+ while(!in.eof()){
+ in >> seqname >> tax; gobble(in);
+
+ phylo->addSeqToTree(seqname, tax);
+ }
+ in.close();
+
+ phylo->assignHeirarchyIDs(0);
+
+ //make sure the cutoff is not greater than maxlevel
+ if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
+
+ //for each node in tree
+ for (int i = 0; i < phylo->getNumNodes(); i++) {
+
+ //is this node within the cutoff
+ TaxNode taxon = phylo->get(i);
+
+ if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
+ if (taxon.children.size() > 1) { //if this taxon just has one seq its a singleton
+ for (it = taxon.children.begin(); it != taxon.children.end(); it++) {
+ seqGroup[it->first] = numGroups;
+ }
+ numGroups++;
+ }
+ }
+ }
+
+ ifstream dFile;
+ openInputFile(distFile, dFile);
+ ofstream outFile;
+
+ for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
+ remove((distFile + "." + toString(i) + ".temp").c_str());
+ }
+
+ //for each distance
+ while(dFile){
+ string seqA, seqB;
+ float dist;
+
+ if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str()); } }
+
+ dFile >> seqA >> seqB >> dist; gobble(dFile);
+
+ //if both sequences are in the same group then they are within the cutoff
+ it = seqGroup.find(seqA);
+ it2 = seqGroup.find(seqB);
+
+ if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons
+ if (it->second == it2->second) { //they are from the same group so add the distance
+ openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
+ outFile << seqA << '\t' << seqB << '\t' << dist << endl;
+ outFile.close();
+ }
+ }
+ }
+ dFile.close();
+
+
+ for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
+ remove((namefile + "." + toString(i) + ".temp").c_str());
+ }
+
+ ifstream bigNameFile;
+ openInputFile(namefile, bigNameFile);
+
+ singleton = namefile + ".extra.temp";
+ ofstream remainingNames;
+ openOutputFile(singleton, remainingNames);
+
+ bool wroteExtra = false;
+
+ string name, nameList;
+ while(!bigNameFile.eof()){
+ bigNameFile >> name >> nameList; gobble(bigNameFile);
+
+ //did this sequence get assigned a group
+ it = seqGroup.find(name);
+
+ if (it != seqGroup.end()) {
+ openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
+ outFile << name << '\t' << nameList << endl;
+ outFile.close();
+ }else{
+ wroteExtra = true;
+ remainingNames << name << '\t' << nameList << endl;
+ }
+ }
+ bigNameFile.close();
+ remainingNames.close();
+
+ if (!wroteExtra) {
+ remove(singleton.c_str());
+ singleton = "none";
+ }
+
+ for(int i=0;i<numGroups;i++){
+ string tempNameFile = namefile + "." + toString(i) + ".temp";
+ string tempDistFile = distFile + "." + toString(i) + ".temp";
+ map<string, string> temp;
+ temp[tempDistFile] = tempNameFile;
+ dists.push_back(temp);
+ }
+
+ if (m->control_pressed) {
+ for (int i = 0; i < dists.size(); i++) {
+ remove((dists[i].begin()->first).c_str());
+ remove((dists[i].begin()->second).c_str());
+ }
+ dists.clear();
+ }
+
return 0;
}
catch(exception& e) {
- m->errorOut(e, "SplitMatrix", "split");
+ m->errorOut(e, "SplitMatrix", "splitClassify");
exit(1);
}
}
public:
- SplitMatrix(string, string, float); //column formatted distance file, namesfile, cutoff
+ SplitMatrix(string, string, string, float, string); //column formatted distance file, namesfile, cutoff, method
~SplitMatrix();
int split();
vector< map<string, string> > getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
string getSingletonNames() { return singleton; } //returns namesfile containing singletons
private:
- string distFile, namefile, singleton;
+ MothurOut* m;
+
+ string distFile, namefile, singleton, method, taxFile;
vector< map< string, string> > dists;
float cutoff;
-
- MothurOut* m;
+
+ int splitDistance();
+ int splitClassify();
};
/******************************************************/