/***********************************************************************/
AverageLinkage::AverageLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm) :
-Cluster(rav, lv, dm)
-{}
+ Cluster(rav, lv, dm)
+{
+ saveRow = -1;
+ saveCol = -1;
+}
+
+
+/***********************************************************************/
+//This function returns the tag of the method.
+string AverageLinkage::getTag() {
+ return("an");
+}
+
/***********************************************************************/
-//THis function clusters based on the average method
-void AverageLinkage::update(){
- try{
- getRowColCells();
-
- vector<int> found(nColCells, 0);
-
- int rowBin = rabund->get(smallRow);
- int colBin = rabund->get(smallCol);
- int totalBin = rowBin + colBin;
-
- for(int i=1;i<nRowCells;i++){
-
- int search;
-
- if(rowCells[i]->row == smallRow){
- search = rowCells[i]->column;
- }
- else{
- search = rowCells[i]->row;
- }
-
- for(int j=1;j<nColCells;j++){
-
- if(colCells[j]->row == search || colCells[j]->column == search){
- colCells[j]->dist = (colBin * colCells[j]->dist + rowBin * rowCells[i]->dist) / totalBin;
-
- found[j] = 1;
-
- if(colCells[j]->vectorMap != NULL){
- *(colCells[j]->vectorMap) = NULL;
- colCells[j]->vectorMap = NULL;
- }
-
- break;
- }
-
- }
- dMatrix->rmCell(rowCells[i]);
- }
-
- clusterBins();
- clusterNames();
-
- for(int i=0;i<nColCells;i++){
- if(found[i] == 0){
- dMatrix->rmCell(colCells[i]);
- }
+//This function updates the distance based on the average linkage method.
+bool AverageLinkage::updateDistance(MatData& colCell, MatData& rowCell) {
+ try {
+ if ((saveRow != smallRow) || (saveCol != smallCol)) {
+ rowBin = rabund->get(smallRow);
+ colBin = rabund->get(smallCol);
+ totalBin = rowBin + colBin;
+ saveRow = smallRow;
+ saveCol = smallCol;
}
+
+ colCell->dist = (colBin * colCell->dist + rowBin * rowCell->dist) / totalBin;
+ return(true);
}
catch(exception& e) {
- errorOut(e, "AverageLinkage", "update");
+ errorOut(e, "AverageLinkage", "updateDistance");
exit(1);
}
}
/***********************************************************************/
-#endif
+/***********************************************************************/
+#endif
Cluster::Cluster(RAbundVector* rav, ListVector* lv, SparseMatrix* dm) :
rabund(rav), list(lv), dMatrix(dm)
{
+/*
+ cout << "sizeof(MatData): " << sizeof(MatData) << endl;
+ cout << "sizeof(PCell*): " << sizeof(PCell*) << endl;
+
+ int nCells = dMatrix->getNNodes();
+ time_t start = time(NULL);
+
+ MatVec matvec = MatVec(nCells);
+ int i = 0;
+ for (MatData currentCell = dMatrix->begin(); currentCell != dMatrix->end(); currentCell++) {
+ matvec[i++] = currentCell;
+ }
+ for (i= matvec.size();i>0;i--) {
+ dMatrix->rmCell(matvec[i-1]);
+ }
+ MatData it = dMatrix->begin();
+ while (it != dMatrix->end()) {
+ it = dMatrix->rmCell(it);
+ }
+ cout << "Time to remove " << nCells << " cells: " << time(NULL) - start << " seconds" << endl;
+ exit(0);
+ MatData it = dMatrix->begin();
+ cout << it->row << "/" << it->column << "/" << it->dist << endl;
+ dMatrix->rmCell(dMatrix->begin());
+ cout << it->row << "/" << it->column << "/" << it->dist << endl;
+ exit(0);
+*/
+
+ // Create a data structure to quickly access the PCell information
+ // for a certain sequence. It consists of a vector of lists, where
+ // a list contains pointers (iterators) to the all distances related
+ // to a certain sequence. The Vector is accessed via the index of a
+ // sequence in the distance matrix.
+ seqVec = vector<MatVec>(lv->size());
+ for (MatData currentCell = dMatrix->begin(); currentCell != dMatrix->end(); currentCell++) {
+ seqVec[currentCell->row].push_back(currentCell);
+ seqVec[currentCell->column].push_back(currentCell);
+ }
}
/***********************************************************************/
-void Cluster::getRowColCells(){
+void Cluster::getRowColCells() {
try {
PCell* smallCell = dMatrix->getSmallestCell(); //find the smallest cell - this routine should probably not be in the SpMat class
- smallRow = smallCell->row; //get its row
- smallCol = smallCell->column; //get its column
- smallDist = smallCell->dist; //get the smallest distance
-
- rowCells.clear();
- colCells.clear();
-
- for(MatData currentCell=dMatrix->begin();currentCell!=dMatrix->end();currentCell++){
-
- if(&*currentCell == smallCell){ //put the smallest cell first
- rowCells.insert(rowCells.begin(), currentCell);
- colCells.insert(colCells.begin(), currentCell);
- }
- else if(currentCell->row == smallRow){
- rowCells.push_back(currentCell);
- }
- else if(currentCell->column == smallRow){
- rowCells.push_back(currentCell);
- }
- else if(currentCell->row == smallCol){
- colCells.push_back(currentCell);
- }
- else if(currentCell->column == smallCol){
- colCells.push_back(currentCell);
- }
- }
+ smallRow = smallCell->row; // get its row
+ smallCol = smallCell->column; // get its column
+ smallDist = smallCell->dist; // get the smallest distance
+ rowCells = seqVec[smallRow]; // all distances related to the row index
+ colCells = seqVec[smallCol]; // all distances related to the column index
nRowCells = rowCells.size();
nColCells = colCells.size();
}
errorOut(e, "Cluster", "getRowColCells");
exit(1);
}
+
}
+// Remove the specified cell from the seqVec and from the sparse
+// matrix
+void Cluster::removeCell(const MatData& cell, int vrow, int vcol, bool rmMatrix)
+{
+ ull drow = cell->row;
+ ull dcol = cell->column;
+ if (((vrow >=0) && (drow != smallRow)) ||
+ ((vcol >=0) && (dcol != smallCol))) {
+ ull dtemp = drow;
+ drow = dcol;
+ dcol = dtemp;
+ }
+
+ ull crow;
+ ull ccol;
+ int nCells;
+ if (vrow < 0) {
+ nCells = seqVec[drow].size();
+ for (vrow=0; vrow<nCells;vrow++) {
+ crow = seqVec[drow][vrow]->row;
+ ccol = seqVec[drow][vrow]->column;
+ if (((crow == drow) && (ccol == dcol)) ||
+ ((ccol == drow) && (crow == dcol))) {
+ break;
+ }
+ }
+ }
+ seqVec[drow].erase(seqVec[drow].begin()+vrow);
+ if (vcol < 0) {
+ nCells = seqVec[dcol].size();
+ for (vcol=0; vcol<nCells;vcol++) {
+ crow = seqVec[dcol][vcol]->row;
+ ccol = seqVec[dcol][vcol]->column;
+ if (((crow == drow) && (ccol == dcol)) ||
+ ((ccol == drow) && (crow == dcol))) {
+ break;
+ }
+ }
+ }
+ seqVec[dcol].erase(seqVec[dcol].begin()+vcol);
+ if (rmMatrix) {
+ dMatrix->rmCell(cell);
+ }
+}
+
+
/***********************************************************************/
void Cluster::clusterBins(){
try {
-
+ // cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol);
+
rabund->set(smallCol, rabund->get(smallRow)+rabund->get(smallCol));
rabund->set(smallRow, 0);
rabund->setLabel(toString(smallDist));
+ // cout << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol) << endl;
}
catch(exception& e) {
errorOut(e, "Cluster", "clusterBins");
exit(1);
}
+
+
}
/***********************************************************************/
void Cluster::clusterNames(){
try {
-
+ // cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(smallRow) << '\t' << list->get(smallCol);
+
list->set(smallCol, list->get(smallRow)+','+list->get(smallCol));
list->set(smallRow, "");
list->setLabel(toString(smallDist));
+ // cout << '\t' << list->get(smallRow) << '\t' << list->get(smallCol) << endl;
}
catch(exception& e) {
errorOut(e, "Cluster", "clusterNames");
exit(1);
}
+
}
+/***********************************************************************/
+//This function clusters based on the method of the derived class
+//At the moment only average and complete linkage are covered, because
+//single linkage uses a different approach.
+void Cluster::update(){
+ try {
+ getRowColCells();
+
+ vector<int> found(nColCells, 0);
+ int search;
+ bool changed;
+
+ // The vector has to be traversed in reverse order to preserve the index
+ // for faster removal in removeCell()
+ for (int i=nRowCells-1;i>=0;i--) {
+ if (!((rowCells[i]->row == smallRow) && (rowCells[i]->column == smallCol))) {
+ if (rowCells[i]->row == smallRow) {
+ search = rowCells[i]->column;
+ } else {
+ search = rowCells[i]->row;
+ }
+
+ for (int j=0;j<nColCells;j++) {
+ if (!((colCells[j]->row == smallRow) && (colCells[j]->column == smallCol))) {
+ if (colCells[j]->row == search || colCells[j]->column == search) {
+ found[j] = 1;
+ changed = updateDistance(colCells[j], rowCells[i]);
+ // If the cell's distance changed and it had the same distance as
+ // the smallest distance, invalidate the mins vector in SparseMatrix
+ if (changed) {
+ if (colCells[j]->vectorMap != NULL) {
+ *(colCells[j]->vectorMap) = NULL;
+ colCells[j]->vectorMap = NULL;
+ }
+ }
+ break;
+ }
+ }
+ }
+ removeCell(rowCells[i], i , -1);
+ }
+ }
+ clusterBins();
+ clusterNames();
+
+ // Special handling for singlelinkage case, not sure whether this
+ // could be avoided
+ for (int i=nColCells-1;i>=0;i--) {
+ if (found[i] == 0) {
+ removeCell(colCells[i], -1, i);
+ }
+ }
+ }
+ catch(exception& e) {
+ errorOut(e, "Cluster", "update");
+ exit(1);
+ }
+}
+
+
/***********************************************************************/
class RAbundVector;
class ListVector;
-class SparseMatrix;
-typedef list<PCell>::iterator MatData;
+typedef vector<MatData> MatVec;
class Cluster {
public:
Cluster(RAbundVector*, ListVector*, SparseMatrix*);
- virtual void update() = 0;
-
+ virtual void update();
+ virtual string getTag() = 0;
+
protected:
void getRowColCells();
+ void removeCell(const MatData& cell, int vrow, int vcol, bool rmMatrix=true);
+
+ virtual bool updateDistance(MatData& colCell, MatData& rowCell) = 0;
+
virtual void clusterBins();
virtual void clusterNames();
int smallRow;
int smallCol;
float smallDist;
- vector<MatData> rowCells;
- vector<MatData> colCells;
+
+ vector<MatVec> seqVec; // contains vectors of cells related to a certain sequence\r
+ MatVec rowCells;
+ MatVec colCells;
ull nRowCells;
ull nColCells;
};
class CompleteLinkage : public Cluster {
public:
CompleteLinkage(RAbundVector*, ListVector*, SparseMatrix*);
- void update();
+ bool updateDistance(MatData& colCell, MatData& rowCell);
+ string getTag();
private:
class SingleLinkage : public Cluster {
public:
SingleLinkage(RAbundVector*, ListVector*, SparseMatrix*);
- void update();
+ void update();
+ bool updateDistance(MatData& colCell, MatData& rowCell);
+ string getTag();
private:
class AverageLinkage : public Cluster {
public:
AverageLinkage(RAbundVector*, ListVector*, SparseMatrix*);
- void update();
+ bool updateDistance(MatData& colCell, MatData& rowCell);
+ string getTag();
private:
-
+ int saveRow;
+ int saveCol;
+ int rowBin;
+ int colBin;
+ int totalBin;
+
};
/***********************************************************************/
else {
//valid paramters for this command
- string Array[] = {"cutoff","precision","method"};
+ string Array[] = {"cutoff","precision","method","showabund","timing"};
vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
OptionParser parser(option);
//check to make sure all parameters are valid for command
for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
- if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
+ if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {
+ abort = true;
+ }
}
//error checking to make sure they read a distance file
if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) {
- mothurOut("Before you use the cluster command, you first need to read in a distance matrix."); mothurOutEndLine(); abort = true;
+ mothurOut("Before you use the cluster command, you first need to read in a distance matrix."); mothurOutEndLine();
+ abort = true;
}
//check for optional parameter and set defaults
// ...at some point should added some additional type checking...
//get user cutoff and precision or use defaults
string temp;
- temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
+ temp = validParameter.validFile(parameters, "precision", false);
+ if (temp == "not found") { temp = "100"; }
//saves precision legnth for formatting below
length = temp.length();
convert(temp, precision);
- temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; }
+ temp = validParameter.validFile(parameters, "cutoff", false);
+ if (temp == "not found") { temp = "10"; }
convert(temp, cutoff);
cutoff += (5 / (precision * 10.0));
- method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "furthest"; }
-
+ method = validParameter.validFile(parameters, "method", false);
+ if (method == "not found") { method = "furthest"; }
if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
else { mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); mothurOutEndLine(); abort = true; }
+ showabund = validParameter.validFile(parameters, "showabund", false);
+ if (showabund == "not found") { showabund = "T"; }
+
+ timing = validParameter.validFile(parameters, "timing", false);
+ if (timing == "not found") { timing = "F"; }
if (abort == false) {
}
//create cluster
- if(method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix); tag = "fn"; }
- else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix); tag = "nn"; }
- else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix); tag = "an"; }
- else { mothurOut("error - not recognized method"); mothurOutEndLine(); abort = true; }
-
+ if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix); }
+ else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix); }
+ else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix); }
+ tag = cluster->getTag();
+
fileroot = getRootName(globaldata->inputFileName);
openOutputFile(fileroot+ tag + ".sabund", sabundFile);
openOutputFile(fileroot+ tag + ".rabund", rabundFile);
openOutputFile(fileroot+ tag + ".list", listFile);
-
-
}
-
}
-
}
catch(exception& e) {
- errorOut(e, "ClusterCommand", "ClusterCommand");
+ errorOut(e, "ClusterCommand", "ClusterCommand");
exit(1);
}
}
void ClusterCommand::help(){
try {
- mothurOut("The cluster command can only be executed after a successful read.dist command.\n");
- mothurOut("The cluster command parameter options are method, cuttoff and precision. No parameters are required.\n");
- mothurOut("The cluster command should be in the following format: \n");
- mothurOut("cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n");
- mothurOut("The acceptable cluster methods are furthest, nearest and average. If no method is provided then furthest is assumed.\n\n");
+ mothurOut("The cluster command can only be executed after a successful read.dist command.\n");
+ mothurOut("The cluster command parameter options are method, cuttoff, precision, showabund and timing. No parameters are required.\n");
+ mothurOut("The cluster command should be in the following format: \n");
+ mothurOut("cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n");
+ mothurOut("The acceptable cluster methods are furthest, nearest and average. If no method is provided then furthest is assumed.\n\n");
}
catch(exception& e) {
errorOut(e, "ClusterCommand", "help");
if (abort == true) { return 0; }
+ time_t estart = time(NULL);
+ int ndist = matrix->getNNodes();
float previousDist = 0.00000;
float rndPreviousDist = 0.00000;
oldRAbund = *rabund;
oldList = *list;
+
+ print_start = true;
+ start = time(NULL);
+ loops = 0;
- float x;
- x=0.1;
- toString(x, 2);
-
- while(matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
+ while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
+ if (print_start && isTrue(timing)) {
+ mothurOut("Clustering (" + tag + ") dist " + toString(matrix->getSmallDist()) + "/"
+ + toString(roundDist(matrix->getSmallDist(), precision))
+ + "\t(precision: " + toString(precision) + ", Nodes: " + toString(matrix->getNNodes()) + ")");
+ cout.flush();
+ print_start = false;
+ }
+
+ loops++;
+
cluster->update();
float dist = matrix->getSmallDist();
float rndDist = roundDist(dist, precision);
oldRAbund = *rabund;
oldList = *list;
}
+
+ if (print_start && isTrue(timing)) {
+ mothurOut("Clustering (" + tag + ") for distance " + toString(previousDist) + "/" + toString(rndPreviousDist)
+ + "\t(precision: " + toString(precision) + ", Nodes: " + toString(matrix->getNNodes()) + ")");
+ cout.flush();
+ print_start = false;
+ }
if(previousDist <= 0.0000){
printData("unique");
sabundFile.close();
rabundFile.close();
listFile.close();
-
+ if (isTrue(timing)) {
+ mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster " + toString(ndist) + " distances"); mothurOutEndLine();
+ }
return 0;
}
catch(exception& e) {
void ClusterCommand::printData(string label){
try {
+ if (isTrue(timing)) {
+ mothurOut("\tTime: " + toString(time(NULL) - start) + "\tsecs for " + toString(oldRAbund.getNumBins())
+ + "\tclusters. Updates: " + toString(loops)); mothurOutEndLine();
+ }
+ print_start = true;
+ loops = 0;
+ start = time(NULL);
+
oldRAbund.setLabel(label);
- oldRAbund.getSAbundVector().print(cout);
+ if (isTrue(showabund)) {
+ oldRAbund.getSAbundVector().print(cout);
+ }
oldRAbund.print(rabundFile);
oldRAbund.getSAbundVector().print(sabundFile);
errorOut(e, "ClusterCommand", "printData");
exit(1);
}
+
+
}
//**********************************************************************************************************************
The cluster() command outputs three files *.list, *.rabund, and *.sabund. */
-class GlobalData;
-
class ClusterCommand : public Command {
public:
string method, fileroot, tag;
double cutoff;
+ string showabund, timing;
int precision, length;
ofstream sabundFile, rabundFile, listFile;
+
+ bool print_start;
+ time_t start;
+ unsigned long loops;
void printData(string label);
};
#include "cluster.hpp"
-#include "mothur.h"
/***********************************************************************/
CompleteLinkage::CompleteLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm) :
-Cluster(rav, lv, dm)
+ Cluster(rav, lv, dm)
{}
/***********************************************************************/
-//This function clusters based on the furthest neighbor method.
-void CompleteLinkage::update(){
+//This function returns the tag of the method.
+string CompleteLinkage::getTag() {
+ return("fn");
+}
+
+
+/***********************************************************************/
+//This function updates the distance based on the furthest neighbor method.
+bool CompleteLinkage::updateDistance(MatData& colCell, MatData& rowCell) {
try {
- getRowColCells();
-
- vector<int> found(nColCells, 0);
-
- for(int i=1;i<nRowCells;i++){
-
- int search;
-
- if(rowCells[i]->row == smallRow){
- search = rowCells[i]->column;
- }
- else{
- search = rowCells[i]->row;
- }
-
- for(int j=1;j<nColCells;j++){
-
- if(colCells[j]->row == search || colCells[j]->column == search){
-
- if(colCells[j]->dist < rowCells[i]->dist){
- colCells[j]->dist = rowCells[i]->dist;
-
- if(colCells[j]->vectorMap != NULL){
- *(colCells[j]->vectorMap) = NULL;
- colCells[j]->vectorMap = NULL;
- }
-
- }
-
- found[j] = 1;
- break;
- }
- }
- dMatrix->rmCell(rowCells[i]);
-
+ bool changed = false;
+ if (colCell->dist < rowCell->dist) {
+ colCell->dist = rowCell->dist;
+ changed = true;
}
- clusterBins();
- clusterNames();
-
- for(int i=0;i<nColCells;i++){
- if(found[i] == 0){
- dMatrix->rmCell(colCells[i]);
- }
- }
+ return(changed);
}
catch(exception& e) {
- errorOut(e, "CompleteLinkage", "update");
+ errorOut(e, "CompleteLinkage", "updateDistance");
exit(1);
}
}
int front = 0;
for (int i = 0; i < q.length(); i++) {
+//cout << "query = " << q[i] << " subject = " << s[i] << endl;
if (isalpha(q[i]) && isalpha(s[i])) { front = i; break; }
}
-
+//cout << endl << endl;
int back = 0;
for (int i = q.length(); i >= 0; i--) {
+//cout << "query = " << q[i] << " subject = " << s[i] << endl;
if (isalpha(q[i]) && isalpha(s[i])) { back = i; break; }
}
//count bases
int numBases = 0;
for (int l = 0; l < seq.length(); l++) { if (isalpha(seq[l])) { numBases++; } }
-
+//cout << "num Bases = " << numBases << endl;
//save start of seq
win.push_back(front);
-
+//cout << front << '\t';
//move ahead increment bases at a time until all bases are in a window
int countBases = 0;
int totalBases = 0; //used to eliminate window of blanks at end of sequence
for (int m = front; m < (back - size) ; m++) {
//count number of bases you see
- if (isalpha(seq[m])) { countBases++; totalBases++; }
+ if (isalpha(seq[m])) { countBases++; }
//if you have seen enough bases to make a new window
if (countBases >= increment) {
+ //total bases is the number of bases in a window already.
+ totalBases += countBases;
+//cout << "total bases = " << totalBases << endl;
win.push_back(m); //save spot in alignment
+//cout << m << '\t';
countBases = 0; //reset bases you've seen in this window
}
//no need to continue if all your bases are in a window
if (totalBases == numBases) { break; }
}
-
+
+
+ //get last window if needed
+ if (totalBases < numBases) { win.push_back(back-size); cout << back-size << endl;}
+//cout << endl << endl;
+
return win;
}
openOutputFile(freqfile, outFreq);
+ string length = toString(seqs.size()); //if there are 5000 seqs in the template then set precision to 3
+ int precision = length.length() - 1;
+
+ //format output
+ outFreq.setf(ios::fixed, ios::floatfield); outFreq.setf(ios::showpoint);
+
//at each position in the sequence
for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
int highest = 0;
for (int m = 0; m < freq.size(); m++) { if (freq[m] > highest) { highest = freq[m]; } }
- float highFreq;
- //subtract gaps to "ignore them"
- if ( (seqs.size() - gaps) == 0 ) { highFreq = 1.0; }
- else { highFreq = highest / (float) (seqs.size() - gaps); }
-
+ float highFreq = highest / (float) (seqs.size());
+
float Pi;
Pi = (highFreq - 0.25) / 0.75;
if (Pi < 0) { Pi = 0.0; }
//saves this for later
- outFreq << i+1 << '\t' << highFreq << endl;
-
+ outFreq << setprecision(precision) << i << '\t' << highFreq << endl;
+
if (h.count(i) > 0) {
- cout << i+1 << '\t' << highFreq << endl;
prob.push_back(Pi);
}
}
exit(1);
}
}
+//***************************************************************************************************************
+void DeCalculator::removeObviousOutliers(vector< vector<float> >& quantiles) {
+ try {
+
+
+ for (int i = 0; i < quantiles.size(); i++) {
+
+ //find mean of this quantile score
+ float average = findAverage(quantiles[i]);
+
+ vector<float> newQuanI;
+ //look at each value in quantiles to see if it is an outlier
+ for (int j = 0; j < quantiles[i].size(); j++) {
+
+ float highCutoff, lowCutOff;
+
+
+
+ }
+
+ }
+
+ }
+ catch(exception& e) {
+ errorOut(e, "DeCalculator", "removeObviousOutliers");
+ exit(1);
+ }
+}
+
+//***************************************************************************************************************
+float DeCalculator::findAverage(vector<float> myVector) {
+ try{
+
+ float total = 0.0;
+ for (int i = 0; i < myVector.size(); i++) { total += myVector[i]; }
+
+ float average = total / (float) myVector.size();
+
+ return average;
+
+ }
+ catch(exception& e) {
+ errorOut(e, "DeCalculator", "findAverage");
+ exit(1);
+ }
+}
//***************************************************************************************************************
float DeCalculator::getCoef(vector<float> obs, vector<float> qav) {
for (int j = 0; j < obs.size(); j++) { obsAverage += obs[j]; }
obsAverage = obsAverage / (float) obs.size();
//cout << "sum ai / m = " << probAverage << endl;
-//cout << "sum oi / m = " << obsAverage << endl;
+//cout << "sum oi / m = " << obsAverage << endl;
+
float coef = obsAverage / probAverage;
return coef;
void setMask(string m);
void runMask(Sequence*);
void trimSeqs(Sequence*, Sequence*, map<int, int>&);
+ void removeObviousOutliers(vector< vector<float> >&);
vector<float> calcFreq(vector<Sequence*>, string);
vector<int> findWindows(Sequence*, int, int, int&, int);
vector<float> calcObserved(Sequence*, Sequence*, vector<int>, int);
vector< vector<float> > getQuantiles(vector<Sequence*>, vector<int>, int, vector<float>, int, int, int);
private:
+ float findAverage(vector<float> myVector);
string seqMask;
set<int> h;
for (int i = 0; i < querySeqs.size(); i++) {
int index = ceil(deviation[i]);
+float quan = 2.64 * log10(deviation[i]) + 1.46;
+cout << "dist = " << index << endl;
+cout << "de = " << DE[i] << endl;
+cout << "mallard quantile = " << quan << endl;
+cout << "my quantile = " << quantiles[index][4] << endl;
//is your DE value higher than the 95%
string chimera;
}else { probabilityProfile = readFreq(); }
//make P into Q
- for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; cout << i << '\t' << probabilityProfile[i] << endl; }
+ for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } //cout << i << '\t' << probabilityProfile[i] << endl;
mothurOut("Done."); mothurOutEndLine();
//mask querys
quantiles = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
}else { createProcessesQuan(); }
+
+ decalc->removeObviousOutliers(quantiles);
+
ofstream out4;
string o = getRootName(templateFile) + "quan";
in >> pos >> num;
- if (h.count(pos-1) > 0) {
+ if (h.count(pos) > 0) {
float Pi;
Pi = (num - 0.25) / 0.75;
else if ((phylipfile != "") && (columnfile != "")) { mothurOut("When executing a read.dist command you must enter ONLY ONE of the following: phylip or column."); mothurOutEndLine(); abort = true; }
if (columnfile != "") {
- if (namefile == "") { mothurOut("You need to provide a namefile if you are going to use the column format."); mothurOutEndLine(); abort = true; }
+ if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }
}
//check for optional parameter and set defaults
mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n");
mothurOut("The second way to use the read.dist command is to read a phylip or column and a group, so you can use the libshuff command.\n");
mothurOut("For this use the read.dist command should be in the following format: \n");
- mothurOut("read.dist(phylip=yourPhylipfile, group=yourGroupFile). The cutoff and precision parameters are not valid with this use. \n");
+ mothurOut("read.dist(phylip=yourPhylipfile, group=yourGroupFile). The cutoff and precision parameters are not valid with this use. \n");
mothurOut("Note: No spaces between parameter labels (i.e. phylip), '=' and parameters (i.e.yourPhylipfile).\n\n");
}
catch(exception& e) {
//**********************************************************************************************************************
ReadDistCommand::~ReadDistCommand(){
+ if (abort == false) {
+ if (format != "matrix") { delete read; delete nameMap; }
+ }
}
//**********************************************************************************************************************
try {
if (abort == true) { return 0; }
+
+ time_t start = time(NULL);
+ size_t numDists = 0;
if (format == "matrix") {
ifstream in;
//memory leak prevention
if (globaldata->gMatrix != NULL) { delete globaldata->gMatrix; }
globaldata->gMatrix = matrix; //save matrix for coverage commands
- }else {
+ numDists = matrix->getSizes()[1];
+ } else {
read->read(nameMap);
//to prevent memory leak
if (globaldata->gSparseMatrix != NULL) { delete globaldata->gSparseMatrix; }
globaldata->gSparseMatrix = read->getMatrix();
- delete read; delete nameMap;
+ numDists = globaldata->gSparseMatrix->getNNodes();
+
+ int lines = cutoff / (1.0/precision);
+ vector<float> dist_cutoff(lines+1,0);
+ for (int i = 0; i <= lines;i++) {
+ dist_cutoff[i] = (i + 0.5) / precision;
+ }
+ vector<int> dist_count(lines+1,0);
+ list<PCell>::iterator currentCell;
+ SparseMatrix* smatrix = globaldata->gSparseMatrix;
+ for (currentCell = smatrix->begin(); currentCell != smatrix->end(); currentCell++) {
+ for (int i = 0; i <= lines;i++) {
+ if (currentCell->dist < dist_cutoff[i]) {
+ dist_count[i]++;
+ break;
+ }
+ }
+ }
+ string dist_string = "Dist:";
+ string count_string = "Count: ";
+ for (int i = 0; i <= lines;i++) {
+ dist_string = dist_string.append("\t").append(toString(dist_cutoff[i]));
+ count_string = count_string.append("\t").append(toString(dist_count[i]));
+ }
+ mothurOut(dist_string); mothurOutEndLine(); mothurOut(count_string); mothurOutEndLine();
}
+ mothurOut("It took " + toString(time(NULL) - start) + " secs to read " + toString(numDists) + " distances (cutoff: " + toString(cutoff) + ")"); mothurOutEndLine();
return 0;
+
}
catch(exception& e) {
errorOut(e, "ReadDistCommand", "execute");
Cluster(rav, lv, dm)
{}
+
/***********************************************************************/
-//This function clusters based on nearest method.
-void SingleLinkage::update(){
+//This function returns the tag of the method.
+string SingleLinkage::getTag() {
+ return("nn");
+}
+
+/***********************************************************************/
+//This function clusters based on the single linkage method.
+void SingleLinkage::update(){
try {
- getRowColCells();
+ getRowColCells();
- for(int i=1;i<nRowCells;i++){
-
- int search;
-
- if(rowCells[i]->row == smallRow){
- search = rowCells[i]->column;
- }
- else{
- search = rowCells[i]->row;
- }
+ vector<bool> deleted(nRowCells, false);
+ int rowInd;
+ int search;
+ bool changed;
+
+ // The vector has to be traversed in reverse order to preserve the index
+ // for faster removal in removeCell()
+ for (int i=nRowCells-1;i>=0;i--) {
+ if ((rowCells[i]->row == smallRow) && (rowCells[i]->column == smallCol)) {
+ rowInd = i; // The index of the smallest distance cell in rowCells
+ } else {
+ if (rowCells[i]->row == smallRow) {
+ search = rowCells[i]->column;
+ } else {
+ search = rowCells[i]->row;
+ }
- for(int j=1;j<nColCells;j++){
-
- if(colCells[j]->row == search || colCells[j]->column == search){
-
- if(colCells[j]->dist > rowCells[i]->dist){
- colCells[j]->dist = rowCells[i]->dist;
-
- if(colCells[j]->vectorMap != NULL){
- *(colCells[j]->vectorMap) = NULL;
- colCells[j]->vectorMap = NULL;
+ for (int j=0;j<nColCells;j++) {
+ if (!((colCells[j]->row == smallRow) && (colCells[j]->column == smallCol))) {
+ if (colCells[j]->row == search || colCells[j]->column == search) {
+ changed = updateDistance(colCells[j], rowCells[i]);
+ // If the cell's distance changed and it had the same distance as
+ // the smallest distance, invalidate the mins vector in SparseMatrix
+ if (changed) {
+ if (colCells[j]->vectorMap != NULL) {
+ *(colCells[j]->vectorMap) = NULL;
+ colCells[j]->vectorMap = NULL;
+ }
+ }
+ removeCell(rowCells[i], i , -1);
+ deleted[i] = true;
+ break;
}
-
}
- dMatrix->rmCell(rowCells[i]);
- break;
}
- }
-
- if(search < smallCol){
- rowCells[i]->row = smallCol;
- rowCells[i]->column = search;
- }
- else{
- rowCells[i]->row = search;
- rowCells[i]->column = smallCol;
- }
-
- }
+ if (!deleted[i]) {
+ // Assign the cell to the new cluster
+ // remove the old cell from seqVec and add the cell
+ // with the new row and column assignment again
+ removeCell(rowCells[i], i , -1, false);
+ if (search < smallCol){
+ rowCells[i]->row = smallCol;
+ rowCells[i]->column = search;
+ } else {
+ rowCells[i]->row = search;
+ rowCells[i]->column = smallCol;
+ }
+ seqVec[rowCells[i]->row].push_back(rowCells[i]);
+ seqVec[rowCells[i]->column].push_back(rowCells[i]);
+ }
+ }
+ }
clusterBins();
clusterNames();
- dMatrix->rmCell(rowCells[0]);
+ // remove also the cell with the smallest distance
+ removeCell(rowCells[rowInd], -1 , -1);
}
catch(exception& e) {
errorOut(e, "SingleLinkage", "update");
}
}
+
+/***********************************************************************/
+//This function updates the distance based on the nearest neighbor method.
+bool SingleLinkage::updateDistance(MatData& colCell, MatData& rowCell) {
+ try {
+ bool changed = false;
+ if (colCell->dist > rowCell->dist) {
+ colCell->dist = rowCell->dist;
+ }
+ return(changed);
+ }
+ catch(exception& e) {
+ errorOut(e, "SingleLinkage", "updateDistance");
+ exit(1);
+ }
+}
/***********************************************************************/
#include "sparsematrix.hpp"
#include "listvector.hpp"
-typedef list<PCell>::iterator MatData;
/***********************************************************************/
/***********************************************************************/
-void SparseMatrix::rmCell(MatData data){
+MatData SparseMatrix::rmCell(MatData data){
try {
if(data->vectorMap != NULL ){
*(data->vectorMap) = NULL;
data->vectorMap = NULL;
}
- matrix.erase(data);
+ data = matrix.erase(data);
numNodes--;
-
+ return(data);
// seems like i should be updating smallDist here, but the only time we remove cells is when
// clustering and the clustering algorithm updates smallDist
}
void SparseMatrix::print(){
try {
int index = 0;
-
- mothurOutEndLine();
- mothurOut("Index\tRow\tColumn\tDistance");
- mothurOutEndLine();
+
+ cout << endl << "Index\tRow\tColumn\tDistance" << endl;
for(MatData currentCell=matrix.begin();currentCell!=matrix.end();currentCell++){
- mothurOut(toString(index) + "\t" + toString(currentCell->row) + "\t" + toString(currentCell->column) + "\t" + toString(currentCell->dist)); mothurOutEndLine();
+ cout << index << '\t' << currentCell->row << '\t' << currentCell->column << '\t' << currentCell->dist << endl;
index++;
}
}
try {
int index = 0;
- mothurOutEndLine();
- mothurOut("Index\tRow\tColumn\tDistance");
- mothurOutEndLine();
-
+ mothurOutEndLine(); mothurOut("Index\tRow\tColumn\tDistance"); mothurOutEndLine();
for(MatData currentCell=matrix.begin();currentCell!=matrix.end();currentCell++){
mothurOut(toString(index) + "\t" + toString(list->get(currentCell->row)) + "\t" + toString(list->get(currentCell->column)) + "\t" + toString(currentCell->dist)); mothurOutEndLine();
}
}
- random_shuffle(mins.begin(), mins.end()); //randomize the order of the iterators in the mins vector
+// random_shuffle(mins.begin(), mins.end()); //randomize the order of the iterators in the mins vector
for(int i=0;i<mins.size();i++){
mins[i]->vectorMap = &mins[i]; //assign vectorMap to the address for the container
/***********************************************************************/
+typedef list<PCell>::iterator MatData;
+
class SparseMatrix {
public:
SparseMatrix();
int getNNodes();
void print(); //Print the contents of the matrix
- void print(ListVector*); //Print the contents of the matrix
+ void print(ListVector*); //Print the contents of the matrix
PCell* getSmallestCell(); //Return the cell with the smallest distance
float getSmallDist();
- void rmCell(list<PCell>::iterator);
+ MatData rmCell(MatData);
void addCell(PCell);
void clear();
- list<PCell>::iterator begin();
- list<PCell>::iterator end();
+ MatData begin();
+ MatData end();
private:
PCell* smallCell; //The cell with the smallest distance