A7C3DC0B14FE457500FE1924 /* cooccurrencecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7C3DC0914FE457500FE1924 /* cooccurrencecommand.cpp */; };
A7C3DC0F14FE469500FE1924 /* trialSwap2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7C3DC0D14FE469500FE1924 /* trialSwap2.cpp */; };
A7D755DA1535F679009BF21A /* treereader.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7D755D91535F679009BF21A /* treereader.cpp */; };
+ A7E0243D15B4520A00A5F046 /* sparsedistancematrix.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E0243C15B4520A00A5F046 /* sparsedistancematrix.cpp */; };
A7E9B88112D37EC400DA6239 /* ace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B64F12D37EC300DA6239 /* ace.cpp */; };
A7E9B88212D37EC400DA6239 /* aligncommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B65112D37EC300DA6239 /* aligncommand.cpp */; };
A7E9B88312D37EC400DA6239 /* alignment.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B65312D37EC300DA6239 /* alignment.cpp */; };
A7D755D71535F665009BF21A /* treereader.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = treereader.h; sourceTree = "<group>"; };
A7D755D91535F679009BF21A /* treereader.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = treereader.cpp; sourceTree = "<group>"; };
A7DAAFA3133A254E003956EB /* commandparameter.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = commandparameter.h; sourceTree = "<group>"; };
+ A7E0243C15B4520A00A5F046 /* sparsedistancematrix.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sparsedistancematrix.cpp; sourceTree = "<group>"; };
+ A7E0243F15B4522000A5F046 /* sparsedistancematrix.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sparsedistancematrix.h; sourceTree = "<group>"; };
A7E9B64F12D37EC300DA6239 /* ace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ace.cpp; sourceTree = "<group>"; };
A7E9B65012D37EC300DA6239 /* ace.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ace.h; sourceTree = "<group>"; };
A7E9B65112D37EC300DA6239 /* aligncommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = aligncommand.cpp; sourceTree = "<group>"; };
A7E9B81412D37EC400DA6239 /* sharedsabundvector.h */,
A7E9B83912D37EC400DA6239 /* sparsematrix.cpp */,
A7E9B83A12D37EC400DA6239 /* sparsematrix.hpp */,
+ A7E0243F15B4522000A5F046 /* sparsedistancematrix.h */,
+ A7E0243C15B4520A00A5F046 /* sparsedistancematrix.cpp */,
A7E9B85112D37EC400DA6239 /* suffixdb.cpp */,
A7E9B85212D37EC400DA6239 /* suffixdb.hpp */,
A7E9B85312D37EC400DA6239 /* suffixnodes.cpp */,
A70056EB156AB6E500924A2D /* removeotulabelscommand.cpp in Sources */,
A73901081588C40900ED2ED6 /* loadlogfilecommand.cpp in Sources */,
A74D59A4159A1E2000043046 /* counttable.cpp in Sources */,
+ A7E0243D15B4520A00A5F046 /* sparsedistancematrix.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
#include "mothur.h"
#include "cluster.hpp"
#include "rabundvector.hpp"
-#include "sparsematrix.hpp"
+#include "sparsedistancematrix.h"
/* This class implements the average UPGMA, average neighbor clustering algorithm */
/***********************************************************************/
-AverageLinkage::AverageLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm, float c, string s) :
- Cluster(rav, lv, dm, c, s)
+AverageLinkage::AverageLinkage(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string s) :
+Cluster(rav, lv, dm, c, s)
{
saveRow = -1;
saveCol = -1;
/***********************************************************************/
//This function updates the distance based on the average linkage method.
-bool AverageLinkage::updateDistance(MatData& colCell, MatData& rowCell) {
+bool AverageLinkage::updateDistance(PDistCell& colCell, PDistCell& rowCell) {
try {
if ((saveRow != smallRow) || (saveCol != smallCol)) {
rowBin = rabund->get(smallRow);
saveRow = smallRow;
saveCol = smallCol;
}
-
- colCell->dist = (colBin * colCell->dist + rowBin * rowCell->dist) / totalBin;
-
+ //cout << "colcell.dist = " << colCell.dist << '\t' << smallRow << '\t' << smallCol << '\t' << rowCell.dist << endl;
+ colCell.dist = (colBin * colCell.dist + rowBin * rowCell.dist) / totalBin;
+
return(true);
}
catch(exception& e) {
//double findMax(vector<double>); //This returns the maximum value in the vector.
int numNZ(vector<int>); //This returns the number of non-zero values in the vector.
double numNZ(vector<double>); //This returns the number of non-zero values in the vector.
- //double numPos(vector<double>); //This returns the number of positive values in the vector.
- //double findMaxDiff(vector<double>, vector<double>); //This returns the absolute value of the maximum difference between the two vectors.
- //double findDStat(vector<double>, vector<double>, double); //This returns the D-Statistic of the two vectors with the given total number of species.
- //vector<int> findQuartiles(vector<double>); //This returns a vector with the first element being the index of the lower quartile of the vector and the second element being the index of the upper quartile of the vector.
- //vector<double> add(vector<double>, double); //This adds the given number to every element in the given vector and returns the new vector.
- //vector<double> multiply(vector<double>, double); //This multiplies every element in the given vector by the given number and returns the new vector.
- //vector<double> power(vector<double>, double); //This raises every element in the given vector to the given number and returns the new vector.
- //vector<double> addVecs(vector<double>,vector<double>); //The given vectors must be the same size. This adds the ith element of the first given vector to the ith element of the second given vector and returns the new vector.
- //vector<double> multVecs(vector<double>,vector<double>); //The given vectors must be the same size. This multiplies the ith element of the first given vector to the ith element of the second given vector and returns the new vector.
- //vector<double> remDup(vector<double>); //This returns a vector that contains 1 of each unique element in the given vector. The order of the elements is not changed.
- //vector<double> genCVec(vector<double>); //This returns a cumilative vector of the given vector. The ith element of the returned vector is the sum of all the elements in the given vector up to i.
- //vector<double> genRelVec(vector<double>); //This finds the sum of all the elements in the given vector and then divides the ith element in the given vector by that sum and then puts the result into a new vector, which is returned after all of the elements in the given vector have been used.
- ///vector<double> genDiffVec(vector<double>, vector<double>);//This subtracts the ith element of the second given vector from the ith element of the first given vector and returns the new vector.
- //vector<double> genCSVec(vector<double>);//This calculates the number of species that have the same number of individuals as the ith element of the given vector and then returns a cumulative vector.
- //vector<double> genTotVec(vector<vector<double> >); //This adds up the ith element of all the columns and puts that value into a new vector. It those this for all the rows and then returns the new vector.
- //vector<double> quicksort(vector<double>); //This sorts the given vector from highest to lowest and returns the sorted vector.
- //vector<vector<double> > gen2DVec(vector<double>, int, int); //(vector, #rows/columns, 0 if the second parameter was rows, 1 if the second parameter was columns) Transforms a single vector that was formatted like a table into a 2D vector.
- //vector<string> getSData(char[]);//This takes a file name as a parameter and reads all of the data in the file into a <string> vector.
};
-/**************************************************************************************************/
-
-/*This Class is similar to the GeometricSeries.h class. It calculates
-the broken stick distribution of the table and prints the D-Statistic
-and the confidence limits for the Kolmogorov-Smirnov 1-Sample test
-with a 95% confidence level.
-
-class BrokenStick
-{
- public:
- void doBStick(vector<double>);
-};
-
-//**************************************************************************************************/
-/*This Class calculates the geometric series distribution for the data.
-It prints the D-Statistic and the critical values for the Kolmogorov-Smirnov
-1-sample test at the 95% confidence interval.*/
-
-/*class GeometricSeries
-{
- public:
- void doGeomTest(vector<double>);
-};*/
-
-/**************************************************************************************************
-//This Class calculates the jackknifed estimate of the data and
-//prints it and the confidence limits at a chosen confidence level.
-
-class Jackknifing
-{
- public:
- void doJK(vector<double>, double);
-};
-/**************************************************************************************************
-/*This Class stores calculates the Kolmogorov-Smirnov 2-Sample test between two samples.
-It prints the D-Statistic and the critical value for the test at
-the 90% and 95% confidence interval.
-
-class KS2SampleTest
-{
- public:
- void doKSTest(vector<double>, vector<double>);
-};
-
-/**************************************************************************************************
-//This Class calculates and prints the Q-Statistic for the data.
-class QStatistic
-{
- public:
- void doQStat(vector<double>);
-};
-/**************************************************************************************************
-class SSBPDiversityIndices
-{
- public:
- void doSSBP(vector<double>);
- double getShan(vector<double> vec);//The Shannon Index
- double getSimp(vector<double> vec);//The Simpson Index
- double getBP(vector<double> vec);//The Berger-Parker Index
-};
/**************************************************************************************************/
//This Class stores the table of the confidence limits of the Student-T distribution.
class TDTable
public:
double getConfLimit(int,int);
};
-
-/**************************************************************************************************
-//This Class stores the table of the confidence limits of the One-Sample Kolmogorov-Smirnov Test.
-class KOSTable
-{
- public:
- double getConfLimit(int);
-};
-
/**************************************************************************************************/
-/*This Class calculates the truncated lognormal for the data.
-It then prints the D-Statistic and the critical values for the
-Kolmogorov-Smirnov 1-Sample test.*
-
-class TrunLN
-{
- public:
- void doTrunLN(vector<double>, vector<double>);
-};
-/**************************************************************************************************/
-
#endif
#include "cluster.hpp"
#include "rabundvector.hpp"
#include "listvector.hpp"
-#include "sparsematrix.hpp"
/***********************************************************************/
-Cluster::Cluster(RAbundVector* rav, ListVector* lv, SparseMatrix* dm, float c, string f) :
+Cluster::Cluster(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string f) :
rabund(rav), list(lv), dMatrix(dm), method(f)
{
try {
-/*
- cout << "sizeof(MatData): " << sizeof(MatData) << endl;
- cout << "sizeof(PCell*): " << sizeof(PCell*) << endl;
-
- int nCells = dMatrix->getNNodes();
- time_t start = time(NULL);
-
- MatVec matvec = MatVec(nCells);
- int i = 0;
- for (MatData currentCell = dMatrix->begin(); currentCell != dMatrix->end(); currentCell++) {
- matvec[i++] = currentCell;
- }
- for (i= matvec.size();i>0;i--) {
- dMatrix->rmCell(matvec[i-1]);
- }
- MatData it = dMatrix->begin();
- while (it != dMatrix->end()) {
- it = dMatrix->rmCell(it);
- }
- cout << "Time to remove " << nCells << " cells: " << time(NULL) - start << " seconds" << endl;
- exit(0);
- MatData it = dMatrix->begin();
- cout << it->row << "/" << it->column << "/" << it->dist << endl;
- dMatrix->rmCell(dMatrix->begin());
- cout << it->row << "/" << it->column << "/" << it->dist << endl;
- exit(0);
-*/
-
- // Create a data structure to quickly access the PCell information
- // for a certain sequence. It consists of a vector of lists, where
- // a list contains pointers (iterators) to the all distances related
- // to a certain sequence. The Vector is accessed via the index of a
- // sequence in the distance matrix.
-//ofstream outtemp;
-//string temp = "temp";
-//m->openOutputFile(temp, outtemp);
-//cout << lv->size() << endl;
- seqVec = vector<MatVec>(lv->size());
- for (MatData currentCell = dMatrix->begin(); currentCell != dMatrix->end(); currentCell++) {
-//outtemp << currentCell->row << '\t' << currentCell->column << '\t' << currentCell->dist << endl;
- seqVec[currentCell->row].push_back(currentCell);
- seqVec[currentCell->column].push_back(currentCell);
- }
-//outtemp.close();
- mapWanted = false; //set to true by mgcluster to speed up overlap merge
-
- //save so you can modify as it changes in average neighbor
- cutoff = c;
- m = MothurOut::getInstance();
-
+
+ mapWanted = false; //set to true by mgcluster to speed up overlap merge
+
+ //save so you can modify as it changes in average neighbor
+ cutoff = c;
+ m = MothurOut::getInstance();
}
catch(exception& e) {
m->errorOut(e, "Cluster", "Cluster");
exit(1);
}
}
-
-/***********************************************************************/
-
-void Cluster::getRowColCells() {
- try {
- PCell* smallCell = dMatrix->getSmallestCell(); //find the smallest cell - this routine should probably not be in the SpMat class
-
- smallRow = smallCell->row; // get its row
- smallCol = smallCell->column; // get its column
- smallDist = smallCell->dist; // get the smallest distance
- //cout << "small row = " << smallRow << "small col = " << smallCol << "small dist = " << smallDist << endl;
-
- rowCells = seqVec[smallRow]; // all distances related to the row index
- colCells = seqVec[smallCol]; // all distances related to the column index
- nRowCells = rowCells.size();
- nColCells = colCells.size();
-//cout << "num rows = " << nRowCells << "num col = " << nColCells << endl;
-
- //for (int i = 0; i < nColCells; i++) { cout << colCells[i]->row << '\t' << colCells[i]->column << endl; }
- //for (int i = 0; i < nRowCells; i++) { cout << rowCells[i]->row << '\t' << rowCells[i]->column << endl; }
- }
- catch(exception& e) {
- m->errorOut(e, "Cluster", "getRowColCells");
- exit(1);
- }
-
-}
-/***********************************************************************/
-// Remove the specified cell from the seqVec and from the sparse
-// matrix
-void Cluster::removeCell(const MatData& cell, int vrow, int vcol, bool rmMatrix){
- try {
-
- ull drow = cell->row;
- ull dcol = cell->column;
- if (((vrow >=0) && (drow != smallRow)) ||
- ((vcol >=0) && (dcol != smallCol))) {
- ull dtemp = drow;
- drow = dcol;
- dcol = dtemp;
- }
-
- ull crow;
- ull ccol;
- int nCells;
- if (vrow < 0) {
- nCells = seqVec[drow].size();
- for (vrow=0; vrow<nCells;vrow++) {
- crow = seqVec[drow][vrow]->row;
- ccol = seqVec[drow][vrow]->column;
- if (((crow == drow) && (ccol == dcol)) ||
- ((ccol == drow) && (crow == dcol))) {
- break;
- }
- }
- }
-
- seqVec[drow].erase(seqVec[drow].begin()+vrow);
- if (vcol < 0) {
- nCells = seqVec[dcol].size();
- for (vcol=0; vcol<nCells;vcol++) {
- crow = seqVec[dcol][vcol]->row;
- ccol = seqVec[dcol][vcol]->column;
- if (((crow == drow) && (ccol == dcol)) ||
- ((ccol == drow) && (crow == dcol))) {
- break;
- }
- }
- }
-
- seqVec[dcol].erase(seqVec[dcol].begin()+vcol);
-
- if (rmMatrix) {
- //cout << " removing = " << cell->row << '\t' << cell->column << '\t' << cell->dist << endl;
- dMatrix->rmCell(cell);
- // cout << "done" << endl;
- }
-
- }
- catch(exception& e) {
- m->errorOut(e, "Cluster", "removeCell");
- exit(1);
- }
-}
/***********************************************************************/
-
void Cluster::clusterBins(){
try {
- // cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol);
-
- rabund->set(smallCol, rabund->get(smallRow)+rabund->get(smallCol));
+ rabund->set(smallCol, rabund->get(smallRow)+rabund->get(smallCol));
rabund->set(smallRow, 0);
rabund->setLabel(toString(smallDist));
-
- // cout << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol) << endl;
}
catch(exception& e) {
m->errorOut(e, "Cluster", "clusterBins");
exit(1);
}
-
-
}
-
/***********************************************************************/
void Cluster::clusterNames(){
try {
- // cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(smallRow) << '\t' << list->get(smallCol);
if (mapWanted) { updateMap(); }
list->set(smallCol, list->get(smallRow)+','+list->get(smallCol));
list->set(smallRow, "");
list->setLabel(toString(smallDist));
-
- // cout << '\t' << list->get(smallRow) << '\t' << list->get(smallCol) << endl;
}
catch(exception& e) {
m->errorOut(e, "Cluster", "clusterNames");
exit(1);
}
-
}
-
/***********************************************************************/
-//This function clusters based on the method of the derived class
-//At the moment only average and complete linkage are covered, because
-//single linkage uses a different approach.
void Cluster::update(double& cutOFF){
try {
- getRowColCells();
-
+ smallCol = dMatrix->getSmallestCell(smallRow);
+ nColCells = dMatrix->seqVec[smallCol].size();
+ nRowCells = dMatrix->seqVec[smallRow].size();
+
vector<int> foundCol(nColCells, 0);
-
+ //cout << "small cell: " << smallRow << '\t' << smallCol << endl;
int search;
bool changed;
-
- // The vector has to be traversed in reverse order to preserve the index
- // for faster removal in removeCell()
+
for (int i=nRowCells-1;i>=0;i--) {
+ if (m->control_pressed) { break; }
+
//if you are not the smallCell
- if (!((rowCells[i]->row == smallRow) && (rowCells[i]->column == smallCol))) {
- if (rowCells[i]->row == smallRow) {
- search = rowCells[i]->column;
- } else {
- search = rowCells[i]->row;
- }
-
+ if (dMatrix->seqVec[smallRow][i].index != smallCol) {
+ search = dMatrix->seqVec[smallRow][i].index;
+
bool merged = false;
for (int j=0;j<nColCells;j++) {
- if (!((colCells[j]->row == smallRow) && (colCells[j]->column == smallCol))) { //if you are not hte smallest distance
- if (colCells[j]->row == search || colCells[j]->column == search) {
+
+ if (dMatrix->seqVec[smallCol][j].index != smallRow) { //if you are not the smallest distance
+ if (dMatrix->seqVec[smallCol][j].index == search) {
foundCol[j] = 1;
merged = true;
- changed = updateDistance(colCells[j], rowCells[i]);
- // If the cell's distance changed and it had the same distance as
- // the smallest distance, invalidate the mins vector in SparseMatrix
- if (changed) {
- if (colCells[j]->vectorMap != NULL) {
- *(colCells[j]->vectorMap) = NULL;
- colCells[j]->vectorMap = NULL;
- }
- }
+ changed = updateDistance(dMatrix->seqVec[smallCol][j], dMatrix->seqVec[smallRow][i]);
+ dMatrix->updateCellCompliment(smallCol, j);
break;
- }
- }
+ }else if (dMatrix->seqVec[smallCol][j].index < search) { j+=nColCells; } //we don't have a distance for this cell
+ }
}
//if not merged it you need it for warning
if ((!merged) && (method == "average" || method == "weighted")) {
- //m->mothurOut("Warning: trying to merge cell " + toString(rowCells[i]->row+1) + " " + toString(rowCells[i]->column+1) + " distance " + toString(rowCells[i]->dist) + " with value above cutoff. Results may vary from using cutoff at cluster command instead of read.dist."); m->mothurOutEndLine();
- if (cutOFF > rowCells[i]->dist) {
- cutOFF = rowCells[i]->dist;
- //m->mothurOut("changing cutoff to " + toString(cutOFF)); m->mothurOutEndLine();
+ if (cutOFF > dMatrix->seqVec[smallRow][i].dist) {
+ cutOFF = dMatrix->seqVec[smallRow][i].dist;
}
-
+
}
- removeCell(rowCells[i], i , -1);
-
+ dMatrix->rmCell(smallRow, i);
}
}
clusterBins();
clusterNames();
-
+
// Special handling for singlelinkage case, not sure whether this
// could be avoided
for (int i=nColCells-1;i>=0;i--) {
- if (foundCol[i] == 0) {
+ if (foundCol[i] == 0) {
if (method == "average" || method == "weighted") {
- if (!((colCells[i]->row == smallRow) && (colCells[i]->column == smallCol))) {
- //m->mothurOut("Warning: merging cell " + toString(colCells[i]->row+1) + " " + toString(colCells[i]->column+1) + " distance " + toString(colCells[i]->dist) + " value above cutoff. Results may vary from using cutoff at cluster command instead of read.dist."); m->mothurOutEndLine();
- if (cutOFF > colCells[i]->dist) {
- cutOFF = colCells[i]->dist;
- //m->mothurOut("changing cutoff to " + toString(cutOFF)); m->mothurOutEndLine();
+ if (dMatrix->seqVec[smallCol][i].index != smallRow) { //if you are not hte smallest distance
+ if (cutOFF > dMatrix->seqVec[smallCol][i].dist) {
+ cutOFF = dMatrix->seqVec[smallCol][i].dist;
}
}
}
- removeCell(colCells[i], -1, i);
+ dMatrix->rmCell(smallCol, i);
}
}
+
}
catch(exception& e) {
m->errorOut(e, "Cluster", "update");
try {
mapWanted = f;
- //initialize map
- for (int i = 0; i < list->getNumBins(); i++) {
-
- //parse bin
- string names = list->get(i);
- while (names.find_first_of(',') != -1) {
- //get name from bin
- string name = names.substr(0,names.find_first_of(','));
- //save name and bin number
- seq2Bin[name] = i;
- names = names.substr(names.find_first_of(',')+1, names.length());
- }
-
- //get last name
- seq2Bin[names] = i;
+ //initialize map
+ for (int k = 0; k < list->getNumBins(); k++) {
+
+ string names = list->get(k);
+
+ //parse bin
+ string individual = "";
+ int binNameslength = names.size();
+ for(int j=0;j<binNameslength;j++){
+ if(names[j] == ','){
+ seq2Bin[individual] = k;
+ individual = "";
+ }
+ else{ individual += names[j]; }
+ }
+ //get last name
+ seq2Bin[individual] = k;
}
}
}
/***********************************************************************/
void Cluster::updateMap() {
-try {
+ try {
//update location of seqs in smallRow since they move to smallCol now
string names = list->get(smallRow);
- while (names.find_first_of(',') != -1) {
- //get name from bin
- string name = names.substr(0,names.find_first_of(','));
- //save name and bin number
- seq2Bin[name] = smallCol;
- names = names.substr(names.find_first_of(',')+1, names.length());
- }
-
- //get last name
- seq2Bin[names] = smallCol;
+ string individual = "";
+ int binNameslength = names.size();
+ for(int j=0;j<binNameslength;j++){
+ if(names[j] == ','){
+ seq2Bin[individual] = smallCol;
+ individual = "";
+ }
+ else{ individual += names[j]; }
+ }
+ //get last name
+ seq2Bin[individual] = smallCol;
+
}
catch(exception& e) {
m->errorOut(e, "Cluster", "updateMap");
#define CLUSTER_H
+
#include "mothur.h"
-#include "sparsematrix.hpp"
+#include "sparsedistancematrix.h"
#include "mothurout.h"
class RAbundVector;
class ListVector;
-typedef vector<MatData> MatVec;
-
class Cluster {
public:
- Cluster(RAbundVector*, ListVector*, SparseMatrix*, float, string);
+ Cluster(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string);
virtual void update(double&);
virtual string getTag() = 0;
virtual void setMapWanted(bool m);
virtual map<string, int> getSeqtoBin() { return seq2Bin; }
-
-protected:
- void getRowColCells();
- void removeCell(const MatData& cell, int vrow, int vcol, bool rmMatrix=true);
-
- virtual bool updateDistance(MatData& colCell, MatData& rowCell) = 0;
-
+
+protected:
+ virtual bool updateDistance(PDistCell& colCell, PDistCell& rowCell) = 0;
+
virtual void clusterBins();
virtual void clusterNames();
virtual void updateMap();
RAbundVector* rabund;
ListVector* list;
- SparseMatrix* dMatrix;
+ SparseDistanceMatrix* dMatrix;
- int smallRow;
- int smallCol;
+ ull smallRow;
+ ull smallCol;
float smallDist;
bool mapWanted;
float cutoff;
map<string, int> seq2Bin;
string method;
- vector<MatVec> seqVec; // contains vectors of cells related to a certain sequence
- MatVec rowCells;
- MatVec colCells;
ull nRowCells;
ull nColCells;
MothurOut* m;
class CompleteLinkage : public Cluster {
public:
- CompleteLinkage(RAbundVector*, ListVector*, SparseMatrix*, float, string);
- bool updateDistance(MatData& colCell, MatData& rowCell);
+ CompleteLinkage(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string);
+ bool updateDistance(PDistCell& colCell, PDistCell& rowCell);
string getTag();
private:
-
+
};
/***********************************************************************/
class SingleLinkage : public Cluster {
public:
- SingleLinkage(RAbundVector*, ListVector*, SparseMatrix*, float, string);
+ SingleLinkage(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string);
void update(double&);
- bool updateDistance(MatData& colCell, MatData& rowCell);
+ bool updateDistance(PDistCell& colCell, PDistCell& rowCell);
string getTag();
private:
-
+
};
/***********************************************************************/
class AverageLinkage : public Cluster {
public:
- AverageLinkage(RAbundVector*, ListVector*, SparseMatrix*, float, string);
- bool updateDistance(MatData& colCell, MatData& rowCell);
+ AverageLinkage(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string);
+ bool updateDistance(PDistCell& colCell, PDistCell& rowCell);
string getTag();
private:
int rowBin;
int colBin;
int totalBin;
-
+
};
/***********************************************************************/
class WeightedLinkage : public Cluster {
public:
- WeightedLinkage(RAbundVector*, ListVector*, SparseMatrix*, float, string);
- bool updateDistance(MatData& colCell, MatData& rowCell);
+ WeightedLinkage(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string);
+ bool updateDistance(PDistCell& colCell, PDistCell& rowCell);
string getTag();
private:
/***********************************************************************/
-#endif
+
+
+#endif
\ No newline at end of file
struct colDist {
int col;
int row;
- double dist;
+ float dist;
colDist(int r, int c, double d) : row(r), col(c), dist(d) {}
};
RAbundVector* rabund;
ListVector* list;
- vector< vector<double> > dMatrix;
+ vector< vector<float> > dMatrix;
//vector<colDist> rowSmallDists;
int smallRow;
read->read(nameMap);
list = read->getListVector();
- matrix = read->getMatrix();
+ matrix = read->getDMatrix();
rabund = new RAbundVector(list->getRAbundVector());
delete read;
loops = 0;
double saveCutoff = cutoff;
- while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
+ while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
if (m->control_pressed) { //clean up
delete list; delete matrix; delete rabund; delete cluster;
loops++;
cluster->update(cutoff);
-
- float dist = matrix->getSmallDist();
+
+ float dist = matrix->getSmallDist();
float rndDist;
if (hard) {
rndDist = m->ceilDist(dist, precision);
#include "sabundvector.hpp"
#include "listvector.hpp"
#include "cluster.hpp"
-#include "sparsematrix.hpp"
+#include "sparsedistancematrix.h"
/* The cluster() command:
The cluster command outputs a .list , .rabund and .sabund files.
private:
Cluster* cluster;
- SparseMatrix* matrix;
+ SparseDistanceMatrix* matrix;
ListVector* list;
RAbundVector* rabund;
RAbundVector oldRAbund;
string listFileName = "";
Cluster* cluster = NULL;
- SparseMatrix* matrix = NULL;
+ SparseDistanceMatrix* matrix = NULL;
ListVector* list = NULL;
ListVector oldList;
RAbundVector* rabund = NULL;
list = read->getListVector();
oldList = *list;
- matrix = read->getMatrix();
+ matrix = read->getDMatrix();
delete read; read = NULL;
delete nameMap; nameMap = NULL;
#include "sabundvector.hpp"
#include "listvector.hpp"
#include "cluster.hpp"
-#include "sparsematrix.hpp"
+#include "sparsedistancematrix.h"
#include "readcluster.h"
#include "splitmatrix.h"
#include "readphylip.h"
/***********************************************************************/
-CompleteLinkage::CompleteLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm, float c, string s) :
+CompleteLinkage::CompleteLinkage(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string s) :
Cluster(rav, lv, dm, c, s)
{}
/***********************************************************************/
//This function updates the distance based on the furthest neighbor method.
-bool CompleteLinkage::updateDistance(MatData& colCell, MatData& rowCell) {
+bool CompleteLinkage::updateDistance(PDistCell& colCell, PDistCell& rowCell) {
try {
bool changed = false;
- if (colCell->dist < rowCell->dist) {
- colCell->dist = rowCell->dist;
+ if (colCell.dist < rowCell.dist) {
+ colCell.dist = rowCell.dist;
changed = true;
}
return(changed);
list = readMatrix->getListVector();
- SparseMatrix* matrix = readMatrix->getMatrix();
+ SparseDistanceMatrix* matrix = readMatrix->getDMatrix();
// Create a data structure to quickly access the distance information.
// It consists of a vector of distance maps, where each map contains
// all distances of a certain sequence. Vector and maps are accessed
// via the index of a sequence in the distance matrix
seqVec = vector<SeqMap>(list->size());
- for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
- if (m->control_pressed) { delete readMatrix; return 0; }
- seqVec[currentCell->row][currentCell->column] = currentCell->dist;
+ for (int i = 0; i < matrix->seqVec.size(); i++) {
+ for (int j = 0; j < matrix->seqVec[i].size(); j++) {
+ if (m->control_pressed) { delete readMatrix; return 0; }
+ //already added everyone else in row
+ if (i < matrix->seqVec[i][j].index) { seqVec[i][matrix->seqVec[i][j].index] = matrix->seqVec[i][j].dist; }
+ }
}
//add dummy map for unweighted calc
SeqMap dummy;
#include "readmatrix.hpp"
#include "formatmatrix.h"
-typedef list<PCell>::iterator MatData;
typedef map<int, float> SeqMap;
struct repStruct {
vector<string> namesOfSeqsInThisBin;
- string names = shared->get(i);
- while ((names.find_first_of(',') != -1)) {
- string name = names.substr(0,names.find_first_of(','));
- names = names.substr(names.find_first_of(',')+1, names.length());
+ string names = shared->get(i);
+ vector<string> binNames;
+ m->splitAtComma(names, binNames);
+ for(int j = 0; j < binNames.size(); j++) {
+ string name = binNames[j];
//find group
string seqGroup = groupMap->getGroup(name);
else { atLeastOne[seqGroup]++; }
}
- //get last name
- string seqGroup = groupMap->getGroup(names);
- if (output != "accnos") {
- namesOfSeqsInThisBin.push_back((names + "|" + seqGroup + "|" + toString(i+1)));
- }else { namesOfSeqsInThisBin.push_back(names); }
-
- if (seqGroup == "not found") { m->mothurOut(names + " is not in your groupfile. Please correct."); m->mothurOutEndLine(); exit(1); }
-
- //is this seq in one of hte groups we care about
- it = groupFinder.find(seqGroup);
- if (it == groupFinder.end()) { uniqueOTU = false; } //you have a sequence from a group you don't want
- else { atLeastOne[seqGroup]++; }
-
-
//make sure you have at least one seq from each group you want
bool sharedByAll = true;
map<string, int>::iterator it2;
if (!hclusterWanted) {
//get distmatrix and overlap
- SparseMatrix* distMatrix = read->getDistMatrix();
+ SparseDistanceMatrix* distMatrix = read->getDistMatrix();
overlapMatrix = read->getOverlapMatrix(); //already sorted by read
delete read;
typedef unsigned long ull;
+typedef unsigned short intDist;
struct IntNode {
int lvalue;
reverseProb = rp;
}
};
-
+/***********************************************************************/
+struct PDistCell{
+ ull index;
+ float dist;
+ PDistCell() : index(0), dist(0) {};
+ PDistCell(ull c, float d) : index(c), dist(d) {}
+};
/************************************************************/
struct clusterNode {
int numSeq;
spearmanRank(string n, float s) : name(n), score(s) {}
};
+//***********************************************************************
+inline bool compareIndexes(PDistCell left, PDistCell right){
+ return (left.index > right.index);
+}
//********************************************************************************************************************
//sorts highest to lowest
inline bool compareSpearman(spearmanRank left, spearmanRank right){
- return (left.score > right.score);
+ return (left.score < right.score);
}
//********************************************************************************************************************
//sorts highest to lowest
exit(1);
}
}
+/***********************************************************************/
+bool MothurOut::mothurConvert(string item, intDist& num){
+ try {
+ bool error = false;
+
+ if (isNumeric1(item)) {
+ convert(item, num);
+ }else {
+ num = 0;
+ error = true;
+ mothurOut("[ERROR]: cannot convert " + item + " to an integer."); mothurOutEndLine();
+ commandInputsConvertError = true;
+ }
+
+ return error;
+ }
+ catch(exception& e) {
+ errorOut(e, "MothurOut", "mothurConvert");
+ exit(1);
+ }
+}
+
/***********************************************************************/
bool MothurOut::isNumeric1(string stringToCheck){
try {
int readNames(string, vector<seqPriorityNode>&, map<string, string>&);
int mothurRemove(string);
bool mothurConvert(string, int&); //use for converting user inputs. Sets commandInputsConvertError to true if error occurs. Engines check this.
+ bool mothurConvert(string, intDist&); //use for converting user inputs. Sets commandInputsConvertError to true if error occurs. Engines check this.
bool mothurConvert(string, float&); //use for converting user inputs. Sets commandInputsConvertError to true if error occurs. Engines check this.
bool mothurConvert(string, double&); //use for converting user inputs. Sets commandInputsConvertError to true if error occurs. Engines check this.
if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
- m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run unifrac.unweighted."); m->mothurOutEndLine();
+ m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run phylo.diversity."); m->mothurOutEndLine();
m->mothurOutEndLine();
for (int l = 0; l < numIters; l++) {
random_shuffle(randomLeaf.begin(), randomLeaf.end());
- cout << l << endl;
+
//initialize counts
map<string, int> counts;
vector< map<string, bool> > countedBranch;
//create objects needed for read
if (!hclusterWanted) {
- matrix = new SparseMatrix();
+ matrix = new SparseDistanceMatrix();
+ matrix->resize(nseqs);
}else{
overlapFile = m->getRootName(blastfile) + "overlap.dist";
distFile = m->getRootName(blastfile) + "hclusterDists.dist";
//is this distance below cutoff
if (distance < cutoff) {
if (!hclusterWanted) {
- PCell value(itA->second, it->first, distance);
- matrix->addCell(value);
+ if (itA->second < it->first) {
+ PDistCell value(it->first, distance);
+ matrix->addCell(itA->second, value);
+ }else {
+ PDistCell value(itA->second, distance);
+ matrix->addCell(it->first, value);
+ }
}else{
outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
}
//is this distance below cutoff
if (distance < cutoff) {
if (!hclusterWanted) {
- PCell value(itA->second, it->first, distance);
- matrix->addCell(value);
+ if (itA->second < it->first) {
+ PDistCell value(it->first, distance);
+ matrix->addCell(itA->second, value);
+ }else {
+ PDistCell value(itA->second, distance);
+ matrix->addCell(it->first, value);
+ }
}else{
outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
}
*/
#include "mothur.h"
-#include "sparsematrix.hpp"
+#include "sparsedistancematrix.h"
#include "nameassignment.hpp"
/****************************************************************************************/
~ReadBlast() {}
int read(NameAssignment*);
- SparseMatrix* getDistMatrix() { return matrix; }
+ SparseDistanceMatrix* getDistMatrix() { return matrix; }
vector<seqDist> getOverlapMatrix() { return overlap; }
string getOverlapFile() { return overlapFile; }
string getDistFile() { return distFile; }
bool minWanted; //if true choose min bsr, if false choose max bsr
bool hclusterWanted;
- SparseMatrix* matrix;
+ SparseDistanceMatrix* matrix;
vector<seqDist> overlap;
MothurOut* m;
string firstName, secondName;
float distance;
int nseqs = nameMap->size();
-
+ DMatrix->resize(nseqs);
list = new ListVector(nameMap->getListVector());
Progress* reading = new Progress("Reading matrix: ", nseqs * nseqs);
if(distance < cutoff && itA != itB){
if(itA->second > itB->second){
- PCell value(itA->second, itB->second, distance);
-
+ PDistCell value(itA->second, distance);
+
+
if(refRow == refCol){ // in other words, if we haven't loaded refRow and refCol...
refRow = itA->second;
refCol = itB->second;
- D->addCell(value);
+ DMatrix->addCell(itB->second, value);
}
else if(refRow == itA->second && refCol == itB->second){
lt = 0;
}
else{
- D->addCell(value);
+ DMatrix->addCell(itB->second, value);
}
}
else if(itA->second < itB->second){
- PCell value(itB->second, itA->second, distance);
+ PDistCell value(itB->second, distance);
if(refRow == refCol){ // in other words, if we haven't loaded refRow and refCol...
refRow = itA->second;
refCol = itB->second;
- D->addCell(value);
+ DMatrix->addCell(itA->second, value);
}
else if(refRow == itB->second && refCol == itA->second){
lt = 0;
}
else{
- D->addCell(value);
+ DMatrix->addCell(itA->second, value);
}
}
reading->update(itA->second * nseqs);
if(lt == 0){ // oops, it was square
fileHandle.close(); //let's start over
- D->clear(); //let's start over
+ DMatrix->clear(); //let's start over
m->openInputFile(distFile, fileHandle); //let's start over
else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert.
if(distance < cutoff && itA->second > itB->second){
- PCell value(itA->second, itB->second, distance);
- D->addCell(value);
+ PDistCell value(itA->second, distance);
+ DMatrix->addCell(itB->second, value);
reading->update(itA->second * nseqs);
}
}
/***********************************************************************/
-
-ReadColumnMatrix::~ReadColumnMatrix(){
- //delete D;
- //delete list;
-}
-
+ReadColumnMatrix::~ReadColumnMatrix(){}
+/***********************************************************************/
#include "mothur.h"
#include "listvector.hpp"
-#include "sparsematrix.hpp"
#include "nameassignment.hpp"
+#include "sparsedistancematrix.h"
class SparseMatrix;
class ReadMatrix {
public:
- ReadMatrix(){ D = new SparseMatrix(); m = MothurOut::getInstance(); }
+ ReadMatrix(){ DMatrix = new SparseDistanceMatrix(); m = MothurOut::getInstance(); }
virtual ~ReadMatrix() {}
virtual int read(NameAssignment*){ return 1; }
void setCutoff(float c) { cutoff = c; }
- SparseMatrix* getMatrix() { return D; }
+ SparseDistanceMatrix* getDMatrix() { return DMatrix; }
ListVector* getListVector() { return list; }
-// OrderVector* getOrderVector() { return order; }
int successOpen;
protected:
- SparseMatrix* D;
+ SparseDistanceMatrix* DMatrix;
ListVector* list;
float cutoff;
MothurOut* m;
try {
float distance;
- int square, nseqs;
+ int square, nseqs;
string name;
vector<string> matrixNames;
}
Progress* reading;
-
+ DMatrix->resize(nseqs);
+
if(square == 0){
reading = new Progress("Reading matrix: ", nseqs * (nseqs - 1) / 2);
if (m->control_pressed) { delete reading; fileHandle.close(); return 0; }
fileHandle >> distance;
-
if (distance == -1) { distance = 1000000; }
else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert.
if(distance < cutoff){
- PCell value(i, j, distance);
- D->addCell(value);
+ PDistCell value(i, distance);
+ DMatrix->addCell(j, value);
}
index++;
reading->update(index);
else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert.
if(distance < cutoff){
- PCell value(nameMap->get(matrixNames[i]), nameMap->get(matrixNames[j]), distance);
- D->addCell(value);
+ PDistCell value(nameMap->get(matrixNames[i]), distance);
+ DMatrix->addCell(nameMap->get(matrixNames[j]), value);
}
index++;
reading->update(index);
else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert.
if(distance < cutoff && j < i){
- PCell value(i, j, distance);
- D->addCell(value);
+ PDistCell value(i, distance);
+ DMatrix->addCell(j, value);
}
index++;
reading->update(index);
else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert.
if(distance < cutoff && j < i){
- PCell value(nameMap->get(matrixNames[i]), nameMap->get(matrixNames[j]), distance);
- D->addCell(value);
+ PDistCell value(nameMap->get(matrixNames[i]), distance);
+ DMatrix->addCell(nameMap->get(matrixNames[j]), value);
}
index++;
reading->update(index);
reading->finish();
delete reading;
-
+
list->setLabel("0");
fileHandle.close();
- /* if(nameMap != NULL){
- for(int i=0;i<matrixNames.size();i++){
- nameMap->erase(matrixNames[i]);
- }
- if(nameMap->size() > 0){
- //should probably tell them what is missing if we missed something
- m->mothurOut("missed something\t" + toString(nameMap->size())); m->mothurOutEndLine();
- }
- } */
-
+
return 1;
}
}
/***********************************************************************/
+ReadPhylipMatrix::~ReadPhylipMatrix(){}
+/***********************************************************************/
-ReadPhylipMatrix::~ReadPhylipMatrix(){
- // delete D;
- // delete list;
-}
//
// int endPos = rdb->referenceSeqs[i].getEndPos();
// if(endPos < minEndPos) { minEndPos = endPos; }
+ if (rdb->referenceSeqs[i].getNumBases() == 0) {
+ m->mothurOut("[WARNING]: " + rdb->referenceSeqs[i].getName() + " is blank, ignoring.");m->mothurOutEndLine();
+ }else {
+ referenceSeqs.push_back(rdb->referenceSeqs[i]);
+ }
- referenceSeqs.push_back(rdb->referenceSeqs[i]);
}
referenceFileName = rdb->getSavedReference();
- m->mothurOut("It took " + toString(time(NULL) - start) + " to load " + toString(rdb->referenceSeqs.size()) + " sequences.");m->mothurOutEndLine();
+ m->mothurOut("It took " + toString(time(NULL) - start) + " to load " + toString(referenceSeqs.size()) + " sequences.");m->mothurOutEndLine();
}else {
int start = time(NULL);
//
// int endPos = currentSeq.getEndPos();
// if(endPos < minEndPos) { minEndPos = endPos; }
- referenceSeqs.push_back(currentSeq);
-
- if (rdb->save) { rdb->referenceSeqs.push_back(currentSeq); }
+ if (currentSeq.getNumBases() == 0) {
+ m->mothurOut("[WARNING]: " + currentSeq.getName() + " is blank, ignoring.");m->mothurOutEndLine();
+ }else {
+ referenceSeqs.push_back(currentSeq);
+ if (rdb->save) { rdb->referenceSeqs.push_back(currentSeq); }
+ }
m->gobble(referenceFile);
}
for(int i=0;i<numRefs;i++){
referenceSeqs[i].padToPos(maxStartPos);
referenceSeqs[i].padFromPos(minEndPos);
- }
+ }
if(numAmbigSeqs != 0){
m->mothurOut("Warning: " + toString(numAmbigSeqs) + " reference sequences have ambiguous bases, these bases will be ignored\n");
errors.errorRate = (double)(errors.total-errors.matches) / (double)errors.total;
errors.queryName = query.getName();
errors.refName = reference.getName();
-
//return errors;
return 0;
}
//********************************************************************************************************************
void Sequence::padFromPos(int end){
- cout << end << '\t' << endPos << endl;
+ //cout << end << '\t' << endPos << endl;
for(int j = end; j < endPos; j++) {
aligned[j] = '.';
}
duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
}
- string nameFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName) + getOutputFileNameTag("name");
+ string nameFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + getOutputFileNameTag("name");
ofstream nameFile;
m->openOutputFile(nameFileName, nameFile);
if (m->control_pressed) { break; }
vector<int> otuCounts(numOTUs, 0);
- for(int i=0;i<numSeqs;i++) { otuCounts[otuData[i]]++; }
+ for(int j=0;j<numSeqs;j++) { otuCounts[otuData[j]]++; }
calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
read->read(clusterNameMap);
ListVector* list = read->getListVector();
- SparseMatrix* matrix = read->getMatrix();
+ SparseDistanceMatrix* matrix = read->getDMatrix();
delete read;
delete clusterNameMap;
/***********************************************************************/
-SingleLinkage::SingleLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm, float c, string s) :
+SingleLinkage::SingleLinkage(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string s) :
Cluster(rav, lv, dm, c, s)
{}
/***********************************************************************/
//This function clusters based on the single linkage method.
-void SingleLinkage::update(double& cutOFF){
+void SingleLinkage::update(double& cutOFF){
try {
- getRowColCells();
+ smallCol = dMatrix->getSmallestCell(smallRow);
+ nColCells = dMatrix->seqVec[smallCol].size();
+ nRowCells = dMatrix->seqVec[smallRow].size();
vector<bool> deleted(nRowCells, false);
int rowInd;
// The vector has to be traversed in reverse order to preserve the index
// for faster removal in removeCell()
for (int i=nRowCells-1;i>=0;i--) {
- if ((rowCells[i]->row == smallRow) && (rowCells[i]->column == smallCol)) {
- rowInd = i; // The index of the smallest distance cell in rowCells
- } else {
- if (rowCells[i]->row == smallRow) {
- search = rowCells[i]->column;
- } else {
- search = rowCells[i]->row;
- }
-
- for (int j=0;j<nColCells;j++) {
- if (!((colCells[j]->row == smallRow) && (colCells[j]->column == smallCol))) {
- if (colCells[j]->row == search || colCells[j]->column == search) {
- changed = updateDistance(colCells[j], rowCells[i]);
- // If the cell's distance changed and it had the same distance as
- // the smallest distance, invalidate the mins vector in SparseMatrix
- if (changed) {
- if (colCells[j]->vectorMap != NULL) {
- *(colCells[j]->vectorMap) = NULL;
- colCells[j]->vectorMap = NULL;
- }
- }
- removeCell(rowCells[i], i , -1);
- deleted[i] = true;
- break;
- }
- }
- }
- if (!deleted[i]) {
- // Assign the cell to the new cluster
- // remove the old cell from seqVec and add the cell
- // with the new row and column assignment again
- removeCell(rowCells[i], i , -1, false);
- if (search < smallCol){
- rowCells[i]->row = smallCol;
- rowCells[i]->column = search;
- } else {
- rowCells[i]->row = search;
- rowCells[i]->column = smallCol;
- }
- seqVec[rowCells[i]->row].push_back(rowCells[i]);
- seqVec[rowCells[i]->column].push_back(rowCells[i]);
- }
- }
+ if (dMatrix->seqVec[smallRow][i].index == smallCol) {
+ rowInd = i; // The index of the smallest distance cell in rowCells
+ } else {
+ search = dMatrix->seqVec[smallRow][i].index;
+
+ for (int j=0;j<nColCells;j++) {
+ if (dMatrix->seqVec[smallCol][j].index != smallRow) { //if you are not the small cell
+ if (dMatrix->seqVec[smallCol][j].index == search) {
+ changed = updateDistance(dMatrix->seqVec[smallCol][j], dMatrix->seqVec[smallRow][i]);
+ dMatrix->updateCellCompliment(smallCol, j);
+ dMatrix->rmCell(smallRow, i);
+ deleted[i] = true;
+ break;
+ }
+ }
+ }
+ if (!deleted[i]) {
+ // Assign the cell to the new cluster
+ // remove the old cell from seqVec and add the cell
+ // with the new row and column assignment again
+ float distance = dMatrix->seqVec[smallRow][i].dist;
+ dMatrix->rmCell(smallRow, i);
+ if (search < smallCol){
+ PDistCell value(smallCol, distance);
+ dMatrix->addCell(search, value);
+ } else {
+ PDistCell value(search, distance);
+ dMatrix->addCell(smallCol, value);
+ }
+ sort(dMatrix->seqVec[smallCol].begin(), dMatrix->seqVec[smallCol].end(), compareIndexes);
+ sort(dMatrix->seqVec[search].begin(), dMatrix->seqVec[search].end(), compareIndexes);
+ }
+ }
}
clusterBins();
clusterNames();
// remove also the cell with the smallest distance
- removeCell(rowCells[rowInd], -1 , -1);
+ dMatrix->rmCell(smallRow, rowInd);
}
catch(exception& e) {
m->errorOut(e, "SingleLinkage", "update");
/***********************************************************************/
//This function updates the distance based on the nearest neighbor method.
-bool SingleLinkage::updateDistance(MatData& colCell, MatData& rowCell) {
+bool SingleLinkage::updateDistance(PDistCell& colCell, PDistCell& rowCell) {
try {
bool changed = false;
- if (colCell->dist > rowCell->dist) {
- colCell->dist = rowCell->dist;
+ if (colCell.dist > rowCell.dist) {
+ colCell.dist = rowCell.dist;
}
return(changed);
}
--- /dev/null
+//
+// sparsedistancematrix.cpp
+// Mothur
+//
+// Created by Sarah Westcott on 7/16/12.
+// Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "sparsedistancematrix.h"
+
+
+/***********************************************************************/
+
+SparseDistanceMatrix::SparseDistanceMatrix() : numNodes(0), smallDist(1e6){ m = MothurOut::getInstance(); sorted=false; aboveCutoff = 1e6; }
+
+/***********************************************************************/
+
+int SparseDistanceMatrix::getNNodes(){
+ return numNodes;
+}
+/***********************************************************************/
+
+void SparseDistanceMatrix::clear(){
+ for (int i = 0; i < seqVec.size(); i++) { seqVec[i].clear(); }
+ seqVec.clear();
+}
+
+/***********************************************************************/
+
+float SparseDistanceMatrix::getSmallDist(){
+ return smallDist;
+}
+/***********************************************************************/
+
+int SparseDistanceMatrix::updateCellCompliment(ull row, ull col){
+ try {
+
+ ull vrow = seqVec[row][col].index;
+ ull vcol = 0;
+
+ //find the columns entry for this cell as well
+ for (int i = 0; i < seqVec[vrow].size(); i++) { if (seqVec[vrow][i].index == row) { vcol = i; break; } }
+
+ seqVec[vrow][vcol].dist = seqVec[row][col].dist;
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SparseDistanceMatrix", "updateCellCompliment");
+ exit(1);
+ }
+}
+/***********************************************************************/
+
+int SparseDistanceMatrix::rmCell(ull row, ull col){
+ try {
+ numNodes-=2;
+
+ ull vrow = seqVec[row][col].index;
+ ull vcol = 0;
+
+ //find the columns entry for this cell as well
+ for (int i = 0; i < seqVec[vrow].size(); i++) { if (seqVec[vrow][i].index == row) { vcol = i; break; } }
+
+ seqVec[vrow].erase(seqVec[vrow].begin()+vcol);
+ seqVec[row].erase(seqVec[row].begin()+col);
+
+ return(0);
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SparseDistanceMatrix", "rmCell");
+ exit(1);
+ }
+}
+/***********************************************************************/
+void SparseDistanceMatrix::addCell(ull row, PDistCell cell){
+ try {
+ numNodes+=2;
+ if(cell.dist < smallDist){ smallDist = cell.dist; }
+
+ seqVec[row].push_back(cell);
+ PDistCell temp(row, cell.dist);
+ seqVec[cell.index].push_back(temp);
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SparseDistanceMatrix", "addCell");
+ exit(1);
+ }
+}
+/***********************************************************************/
+
+ull SparseDistanceMatrix::getSmallestCell(ull& row){
+ try {
+ if (!sorted) { sortSeqVec(); sorted = true; }
+
+ vector<PDistCellMin> mins;
+ smallDist = 1e6;
+
+ for (int i = 0; i < seqVec.size(); i++) {
+ for (int j = 0; j < seqVec[i].size(); j++) {
+
+ //already checked everyone else in row
+ if (i < seqVec[i][j].index) {
+
+ float dist = seqVec[i][j].dist;
+
+ if(dist < smallDist){ //found a new smallest distance
+ mins.clear();
+ smallDist = dist;
+ PDistCellMin temp(i, seqVec[i][j].index);
+ mins.push_back(temp);
+ }
+ else if(dist == smallDist){ //if a subsequent distance is the same as mins distance add the new iterator to the mins vector
+ PDistCellMin temp(i, seqVec[i][j].index);
+ mins.push_back(temp);
+ }
+ }else { j+=seqVec[i].size(); } //stop looking
+ }
+ }
+
+ random_shuffle(mins.begin(), mins.end()); //randomize the order of the iterators in the mins vector
+
+ row = mins[0].row;
+ ull col = mins[0].col;
+
+ return col;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SparseMatrix", "getSmallestCell");
+ exit(1);
+ }
+}
+/***********************************************************************/
+
+int SparseDistanceMatrix::sortSeqVec(){
+ try {
+
+ //saves time in getSmallestCell, by making it so you dont search the repeats
+ for (int i = 0; i < seqVec.size(); i++) { sort(seqVec[i].begin(), seqVec[i].end(), compareIndexes); }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SparseMatrix", "getSmallestCell");
+ exit(1);
+ }
+}
+/***********************************************************************/
+
--- /dev/null
+#ifndef Mothur_sparsedistancematrix_h
+#define Mothur_sparsedistancematrix_h
+
+//
+// sparsedistancematrix.h
+// Mothur
+//
+// Created by Sarah Westcott on 7/16/12.
+// Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "mothur.h"
+#include "mothurout.h"
+
+
+class ListVector;
+
+
+/* For each distance in a sparse matrix we have a row, column and distance.
+ The PDistCell consists of the column and distance.
+ We know the row by the distances row in the seqVec matrix.
+ SeqVec is square and each row is sorted so the column values are ascending to save time in the search for the smallest distance. */
+
+/***********************************************************************/
+struct PDistCellMin{
+ ull row;
+ ull col;
+ //PDistCell* cell;
+ PDistCellMin(ull r, ull c) : col(c), row(r) {}
+};
+/***********************************************************************/
+
+
+
+class SparseDistanceMatrix {
+
+public:
+ SparseDistanceMatrix();
+ ~SparseDistanceMatrix(){ clear(); }
+ int getNNodes();
+ ull getSmallestCell(ull& index); //Return the cell with the smallest distance
+ float getSmallDist();
+
+ int rmCell(ull, ull);
+ int updateCellCompliment(ull, ull);
+ void resize(ull n) { seqVec.resize(n); }
+ void clear();
+ void addCell(ull, PDistCell);
+ vector<vector<PDistCell> > seqVec;
+
+private:
+ PDistCell smallCell; //The cell with the smallest distance
+ int numNodes;
+
+ bool sorted;
+ int sortSeqVec();
+ float smallDist, aboveCutoff;
+
+ MothurOut* m;
+};
+
+/***********************************************************************/
+
+
+
+#endif
TreeGroupCommand::~TreeGroupCommand(){
if (abort == false) {
if (format == "sharedfile") { delete input; }
- else { delete readMatrix; delete matrix; delete list; }
+ else { delete list; }
delete tmap;
}
}else{
//read in dist file
filename = inputfile;
-
+
+ ReadMatrix* readMatrix;
if (format == "column") { readMatrix = new ReadColumnMatrix(filename); }
else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(filename); }
readMatrix->read(nameMap);
list = readMatrix->getListVector();
- matrix = readMatrix->getMatrix();
+ SparseDistanceMatrix* dMatrix = readMatrix->getDMatrix();
//make treemap
tmap = new TreeMap();
if (m->control_pressed) { return 0; }
- vector< vector<double> > matrix = makeSimsDist();
+ vector< vector<double> > matrix = makeSimsDist(dMatrix);
+ delete readMatrix;
+ delete dMatrix;
if (m->control_pressed) { return 0; }
}
}
/***********************************************************/
-vector< vector<double> > TreeGroupCommand::makeSimsDist() {
+vector< vector<double> > TreeGroupCommand::makeSimsDist(SparseDistanceMatrix* matrix) {
try {
numGroups = list->size();
//go through sparse matrix and fill sims
//go through each cell in the sparsematrix
- for(MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++){
- //similairity = -(distance-1)
- simMatrix[currentCell->row][currentCell->column] = -(currentCell->dist -1.0);
- simMatrix[currentCell->column][currentCell->row] = -(currentCell->dist -1.0);
-
- if (m->control_pressed) { return simMatrix; }
+ for (int i = 0; i < matrix->seqVec.size(); i++) {
+ for (int j = 0; j < matrix->seqVec[i].size(); j++) {
+
+ //already checked everyone else in row
+ if (i < matrix->seqVec[i][j].index) {
+ simMatrix[i][matrix->seqVec[i][j].index] = -(matrix->seqVec[i][j].dist -1.0);
+ simMatrix[matrix->seqVec[i][j].index][i] = -(matrix->seqVec[i][j].dist -1.0);
+ if (m->control_pressed) { return simMatrix; }
+ }
+ }
}
return simMatrix;
Tree* createTree(vector< vector<double> >&);
void printSims(ostream&, vector< vector<double> >&);
int makeSimsShared();
- vector< vector<double> > makeSimsDist();
+ vector< vector<double> > makeSimsDist(SparseDistanceMatrix*);
int writeTree(string, Tree*);
int driver(vector<SharedRAbundVector*>, int, int, vector< vector<seqDist> >&);
- ReadMatrix* readMatrix;
- SparseMatrix* matrix;
NameAssignment* nameMap;
ListVector* list;
TreeMap* tmap;
Sequence currSeq(inFASTA); m->gobble(inFASTA);
//cout << currSeq.getName() << '\t' << currSeq.getUnaligned().length() << endl;
- if (m->debug) { m->mothurOut("[DEBUG]: " + toString(count) + " fasta = " + currSeq.getName() + '\n'); }
+
QualityScores currQual;
if(qFileName != ""){
currQual = QualityScores(qFile); m->gobble(qFile);
- if (m->debug) { m->mothurOut("[DEBUG]: qual = " + currQual.getName() + '\n'); }
+ if ((m->debug)&&(count>15800)) { m->mothurOut("[DEBUG]: " + toString(count) + " fasta = " + currSeq.getName() + '\n'); m->mothurOut("[DEBUG]: " + toString(getpid()) + '\n'); }
}
string origSeq = currSeq.getUnaligned();
tempNameFileNames,
lines[process],
qLines[process]);
+
+ if (m->debug) { m->mothurOut("[DEBUG]: " + toString(lines[process].start) + '\t' + toString(qLines[process].start) + '\t' + toString(getpid()) + '\n'); }
//pass groupCounts to parent
if(createGroup){
/***********************************************************************/
-WeightedLinkage::WeightedLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm, float c, string s) :
+WeightedLinkage::WeightedLinkage(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string s) :
Cluster(rav, lv, dm, c, s)
{
saveRow = -1;
/***********************************************************************/
//This function updates the distance based on the average linkage method.
-bool WeightedLinkage::updateDistance(MatData& colCell, MatData& rowCell) {
+bool WeightedLinkage::updateDistance(PDistCell& colCell, PDistCell& rowCell) {
try {
if ((saveRow != smallRow) || (saveCol != smallCol)) {
// rowBin = rabund->get(smallRow);
saveCol = smallCol;
}
- colCell->dist = (colCell->dist + rowCell->dist) / 2.0;
+ colCell.dist = (colCell.dist + rowCell.dist) / 2.0;
return(true);
}