/***********************************************************************/
-AverageLinkage::AverageLinkage(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string s) :
-Cluster(rav, lv, dm, c, s)
+AverageLinkage::AverageLinkage(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string s, float a) :
+Cluster(rav, lv, dm, c, s, a)
{
saveRow = -1;
saveCol = -1;
/***********************************************************************/
-Cluster::Cluster(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string f) :
-rabund(rav), list(lv), dMatrix(dm), method(f)
+Cluster::Cluster(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string f, float cs) :
+rabund(rav), list(lv), dMatrix(dm), method(f), adjust(cs)
{
try {
changed = updateDistance(dMatrix->seqVec[smallCol][j], dMatrix->seqVec[smallRow][i]);
dMatrix->updateCellCompliment(smallCol, j);
break;
- }else if (dMatrix->seqVec[smallCol][j].index < search) { j+=nColCells; } //we don't have a distance for this cell
+ }else if (dMatrix->seqVec[smallCol][j].index < search) { //we don't have a distance for this cell
+ if (adjust != -1.0) { //adjust
+ merged = true;
+ PDistCell value(search, adjust); //create a distance for the missing value
+ int location = dMatrix->addCellSorted(smallCol, value);
+ changed = updateDistance(dMatrix->seqVec[smallCol][location], dMatrix->seqVec[smallRow][i]);
+ dMatrix->updateCellCompliment(smallCol, location);
+ nColCells++;
+ foundCol.push_back(0); //add a new found column
+ //adjust value
+ for (int k = foundCol.size()-1; k > location; k--) { foundCol[k] = foundCol[k-1]; }
+ foundCol[location] = 1;
+ }
+ j+=nColCells;
+ }
}
}
//if not merged it you need it for warning
// Special handling for singlelinkage case, not sure whether this
// could be avoided
for (int i=nColCells-1;i>=0;i--) {
- if (foundCol[i] == 0) {
- if (method == "average" || method == "weighted") {
- if (dMatrix->seqVec[smallCol][i].index != smallRow) { //if you are not hte smallest distance
- if (cutOFF > dMatrix->seqVec[smallCol][i].dist) {
- cutOFF = dMatrix->seqVec[smallCol][i].dist;
- }
- }
- }
+ if (foundCol[i] == 0) {
+ if (adjust != -1.0) { //adjust
+ PDistCell value(smallCol, adjust); //create a distance for the missing value
+ changed = updateDistance(dMatrix->seqVec[smallCol][i], value);
+ dMatrix->updateCellCompliment(smallCol, i);
+ }else {
+ if (method == "average" || method == "weighted") {
+ if (dMatrix->seqVec[smallCol][i].index != smallRow) { //if you are not hte smallest distance
+ if (cutOFF > dMatrix->seqVec[smallCol][i].dist) {
+ cutOFF = dMatrix->seqVec[smallCol][i].dist;
+ }
+ }
+ }
+ }
dMatrix->rmCell(smallCol, i);
}
}
class Cluster {
public:
- Cluster(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string);
+ Cluster(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string, float);
virtual ~Cluster() {}
virtual void update(double&);
virtual string getTag() = 0;
ull smallRow;
ull smallCol;
- float smallDist;
+ float smallDist, adjust;
bool mapWanted;
float cutoff;
map<string, int> seq2Bin;
class CompleteLinkage : public Cluster {
public:
- CompleteLinkage(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string);
+ CompleteLinkage(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string, float);
bool updateDistance(PDistCell& colCell, PDistCell& rowCell);
string getTag();
class SingleLinkage : public Cluster {
public:
- SingleLinkage(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string);
+ SingleLinkage(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string, float);
void update(double&);
bool updateDistance(PDistCell& colCell, PDistCell& rowCell);
string getTag();
class AverageLinkage : public Cluster {
public:
- AverageLinkage(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string);
+ AverageLinkage(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string, float);
bool updateDistance(PDistCell& colCell, PDistCell& rowCell);
string getTag();
class WeightedLinkage : public Cluster {
public:
- WeightedLinkage(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string);
+ WeightedLinkage(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string, float);
bool updateDistance(PDistCell& colCell, PDistCell& rowCell);
string getTag();
CommandParameter psim("sim", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(psim);
CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard);
CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
+ //CommandParameter padjust("adjust", "String", "", "F", "", "", "","",false,false); parameters.push_back(padjust);
CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
vector<string> myArray;
try {
string helpString = "";
helpString += "The cluster command parameter options are phylip, column, name, count, method, cuttoff, hard, precision, sim, showabund and timing. Phylip or column and name are required, unless you have a valid current file.\n";
- helpString += "The cluster command should be in the following format: \n";
+ //helpString += "The adjust parameter is used to handle missing distances. If you set a cutoff, adjust=f by default. If not, adjust=t by default. Adjust=f, means ignore missing distances and adjust cutoff as needed with the average neighbor method. Adjust=t, will treat missing distances as 1.0. You can also set the value the missing distances should be set to, adjust=0.5 would give missing distances a value of 0.5.\n";
+ helpString += "The cluster command should be in the following format: \n";
helpString += "cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n";
helpString += "The acceptable cluster methods are furthest, nearest, average and weighted. If no method is provided then average is assumed.\n";
return helpString;
temp = validParameter.validFile(parameters, "sim", false); if (temp == "not found") { temp = "F"; }
sim = m->isTrue(temp);
+ //bool cutoffSet = false;
temp = validParameter.validFile(parameters, "cutoff", false);
if (temp == "not found") { temp = "10"; }
+ //else { cutoffSet = true; }
m->mothurConvert(temp, cutoff);
- cutoff += (5 / (precision * 10.0));
+ cutoff += (5 / (precision * 10.0));
+
+ //temp = validParameter.validFile(parameters, "adjust", false); if (temp == "not found") { temp = "F"; }
+ //if (m->isNumeric1(temp)) { m->mothurConvert(temp, adjust); }
+ //else if (m->isTrue(temp)) { adjust = 1.0; }
+ //else { adjust = -1.0; }
+ adjust=-1.0;
method = validParameter.validFile(parameters, "method", false);
if (method == "not found") { method = "average"; }
}
//create cluster
- if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
- else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
- else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
- else if(method == "weighted"){ cluster = new WeightedLinkage(rabund, list, matrix, cutoff, method); }
+ if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method, adjust); }
+ else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method, adjust); }
+ else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method, adjust); }
+ else if(method == "weighted"){ cluster = new WeightedLinkage(rabund, list, matrix, cutoff, method, adjust); }
tag = cluster->getTag();
if (outputDir == "") { outputDir += m->hasPath(distfile); }
string method, fileroot, tag, outputDir, phylipfile, columnfile, namefile, format, distfile, countfile;
double cutoff;
+ float adjust;
string showabund, timing;
int precision, length;
ofstream sabundFile, rabundFile, listFile;
m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
//create cluster
- if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
- else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
- else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
+ float adjust = -1.0;
+ if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method, adjust); }
+ else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method, adjust); }
+ else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method, adjust); }
tag = cluster->getTag();
if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
/***********************************************************************/
-CompleteLinkage::CompleteLinkage(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string s) :
- Cluster(rav, lv, dm, c, s)
+CompleteLinkage::CompleteLinkage(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string s, float a) :
+ Cluster(rav, lv, dm, c, s, a)
{}
/***********************************************************************/
CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard);
CommandParameter pmin("min", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pmin);
CommandParameter pmerge("merge", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pmerge);
+ CommandParameter padjust("adjust", "String", "", "F", "", "", "","",false,false); parameters.push_back(padjust);
CommandParameter phcluster("hcluster", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(phcluster);
CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
string MGClusterCommand::getHelpString(){
try {
string helpString = "";
- helpString += "The mgcluster command parameter options are blast, name, cutoff, precision, hard, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n";
+ helpString += "The mgcluster command parameter options are blast, name, cutoff, precision, hard, method, merge, min, length, penalty, adjust and hcluster. The blast parameter is required.\n";
helpString += "The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n";
helpString += "This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n";
helpString += "The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n";
helpString += "The acceptable mgcluster methods are furthest, nearest and average. If no method is provided then average is assumed.\n";
helpString += "The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n";
helpString += "The length parameter is used to specify the minimum overlap required. The default is 5.\n";
+ helpString += "The adjust parameter is used to handle missing distances. If you set a cutoff, adjust=f by default. If not, adjust=t by default. Adjust=f, means ignore missing distances and adjust cutoff as needed with the average neighbor method. Adjust=t, will treat missing distances as 1.0. You can also set the value the missing distances should be set to, adjust=0.5 would give missing distances a value of 0.5.\n";
helpString += "The penalty parameter is used to adjust the error rate. The default is 0.10.\n";
helpString += "The merge parameter allows you to shut off merging based on overlaps and just cluster. By default merge is true, meaning you want to merge.\n";
helpString += "The hcluster parameter allows you to use the hcluster algorithm when clustering. This may be neccessary if your file is too large to fit into RAM. The default is false.\n";
precisionLength = temp.length();
m->mothurConvert(temp, precision);
- temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.70"; }
+ cutoffSet = false;
+ temp = validParameter.validFile(parameters, "cutoff", false);
+ if (temp == "not found") { temp = "0.70"; }
+ else { cutoffSet = true; }
m->mothurConvert(temp, cutoff);
cutoff += (5 / (precision * 10.0));
hclusterWanted = m->isTrue(temp);
temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "T"; }
- hard = m->isTrue(temp);
+ hard = m->isTrue(temp);
+
+ temp = validParameter.validFile(parameters, "adjust", false); if (temp == "not found") { if (cutoffSet) { temp = "F"; }else { temp="T"; } }
+ if (m->isNumeric1(temp)) { m->mothurConvert(temp, adjust); }
+ else if (m->isTrue(temp)) { adjust = 1.0; }
+ else { adjust = -1.0; }
}
}
delete read;
//create cluster
- if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff, method); }
- else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method); }
- else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method); }
+ if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff, method, adjust); }
+ else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method, adjust); }
+ else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method, adjust); }
cluster->setMapWanted(true);
Seq2Bin = cluster->getSeqtoBin();
oldSeq2Bin = Seq2Bin;
string blastfile, method, namefile, countfile, overlapFile, distFile, outputDir;
ofstream sabundFile, rabundFile, listFile;
double cutoff;
- float penalty;
+ float penalty, adjust;
int precision, length, precisionLength;
- bool abort, minWanted, hclusterWanted, merge, hard;
+ bool abort, minWanted, hclusterWanted, merge, hard, cutoffSet;
void printData(ListVector*);
ListVector* mergeOPFs(map<string, int>, float);
RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
- Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest");
+ float adjust = -1.0;
+ Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest", adjust);
string tag = cluster->getTag();
double clusterCutoff = cutoff;
/***********************************************************************/
-SingleLinkage::SingleLinkage(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string s) :
-Cluster(rav, lv, dm, c, s)
+SingleLinkage::SingleLinkage(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string s, float a) :
+Cluster(rav, lv, dm, c, s, a)
{}
exit(1);
}
}
+/***********************************************************************/
+int SparseDistanceMatrix::addCellSorted(ull row, PDistCell cell){
+ try {
+ numNodes+=2;
+ if(cell.dist < smallDist){ smallDist = cell.dist; }
+
+ seqVec[row].push_back(cell);
+ PDistCell temp(row, cell.dist);
+ seqVec[cell.index].push_back(temp);
+
+ sortSeqVec(row);
+ sortSeqVec(cell.index);
+
+ int location = -1; //find location of new cell when sorted
+ for (int i = 0; i < seqVec[row].size(); i++) { if (seqVec[row][i].index == cell.index) { location = i; break; } }
+
+ return location;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SparseDistanceMatrix", "addCellSorted");
+ exit(1);
+ }
+}
+
/***********************************************************************/
ull SparseDistanceMatrix::getSmallestCell(ull& row){
}
/***********************************************************************/
+int SparseDistanceMatrix::sortSeqVec(int index){
+ try {
+
+ //saves time in getSmallestCell, by making it so you dont search the repeats
+ sort(seqVec[index].begin(), seqVec[index].end(), compareIndexes);
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SparseDistanceMatrix", "sortSeqVec");
+ exit(1);
+ }
+}
+/***********************************************************************/
+
void resize(ull n) { seqVec.resize(n); }
void clear();
void addCell(ull, PDistCell);
+ int addCellSorted(ull, PDistCell);
vector<vector<PDistCell> > seqVec;
+
private:
PDistCell smallCell; //The cell with the smallest distance
int numNodes;
bool sorted;
int sortSeqVec();
+ int sortSeqVec(int);
float smallDist, aboveCutoff;
MothurOut* m;
+
};
/***********************************************************************/
/***********************************************************************/
-WeightedLinkage::WeightedLinkage(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string s) :
- Cluster(rav, lv, dm, c, s)
+WeightedLinkage::WeightedLinkage(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string s, float a) :
+ Cluster(rav, lv, dm, c, s, a)
{
saveRow = -1;
saveCol = -1;