From: Sarah Westcott Date: Fri, 27 Jul 2012 13:42:37 +0000 (-0400) Subject: added sparseDistanceMatrix class. Modified cluster commands to use the new sparse... X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=0cefb55a2616975bd4a144fc345693695ffc9bb6 added sparseDistanceMatrix class. Modified cluster commands to use the new sparse distance matrix class. cut cluster time by 55% and reduced memory usage by 50%. --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 13f12e0..ff0e35b 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -51,6 +51,7 @@ A7C3DC0B14FE457500FE1924 /* cooccurrencecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7C3DC0914FE457500FE1924 /* cooccurrencecommand.cpp */; }; A7C3DC0F14FE469500FE1924 /* trialSwap2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7C3DC0D14FE469500FE1924 /* trialSwap2.cpp */; }; A7D755DA1535F679009BF21A /* treereader.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7D755D91535F679009BF21A /* treereader.cpp */; }; + A7E0243D15B4520A00A5F046 /* sparsedistancematrix.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E0243C15B4520A00A5F046 /* sparsedistancematrix.cpp */; }; A7E9B88112D37EC400DA6239 /* ace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B64F12D37EC300DA6239 /* ace.cpp */; }; A7E9B88212D37EC400DA6239 /* aligncommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B65112D37EC300DA6239 /* aligncommand.cpp */; }; A7E9B88312D37EC400DA6239 /* alignment.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B65312D37EC300DA6239 /* alignment.cpp */; }; @@ -453,6 +454,8 @@ A7D755D71535F665009BF21A /* treereader.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = treereader.h; sourceTree = ""; }; A7D755D91535F679009BF21A /* treereader.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = treereader.cpp; sourceTree = ""; }; A7DAAFA3133A254E003956EB /* commandparameter.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = commandparameter.h; sourceTree = ""; }; + A7E0243C15B4520A00A5F046 /* sparsedistancematrix.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sparsedistancematrix.cpp; sourceTree = ""; }; + A7E0243F15B4522000A5F046 /* sparsedistancematrix.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sparsedistancematrix.h; sourceTree = ""; }; A7E9B64F12D37EC300DA6239 /* ace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ace.cpp; sourceTree = ""; }; A7E9B65012D37EC300DA6239 /* ace.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ace.h; sourceTree = ""; }; A7E9B65112D37EC300DA6239 /* aligncommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = aligncommand.cpp; sourceTree = ""; }; @@ -1672,6 +1675,8 @@ A7E9B81412D37EC400DA6239 /* sharedsabundvector.h */, A7E9B83912D37EC400DA6239 /* sparsematrix.cpp */, A7E9B83A12D37EC400DA6239 /* sparsematrix.hpp */, + A7E0243F15B4522000A5F046 /* sparsedistancematrix.h */, + A7E0243C15B4520A00A5F046 /* sparsedistancematrix.cpp */, A7E9B85112D37EC400DA6239 /* suffixdb.cpp */, A7E9B85212D37EC400DA6239 /* suffixdb.hpp */, A7E9B85312D37EC400DA6239 /* suffixnodes.cpp */, @@ -2186,6 +2191,7 @@ A70056EB156AB6E500924A2D /* removeotulabelscommand.cpp in Sources */, A73901081588C40900ED2ED6 /* loadlogfilecommand.cpp in Sources */, A74D59A4159A1E2000043046 /* counttable.cpp in Sources */, + A7E0243D15B4520A00A5F046 /* sparsedistancematrix.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/averagelinkage.cpp b/averagelinkage.cpp index c430c88..e9ff3b3 100644 --- a/averagelinkage.cpp +++ b/averagelinkage.cpp @@ -5,14 +5,14 @@ #include "mothur.h" #include "cluster.hpp" #include "rabundvector.hpp" -#include "sparsematrix.hpp" +#include "sparsedistancematrix.h" /* This class implements the average UPGMA, average neighbor clustering algorithm */ /***********************************************************************/ -AverageLinkage::AverageLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm, float c, string s) : - Cluster(rav, lv, dm, c, s) +AverageLinkage::AverageLinkage(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string s) : +Cluster(rav, lv, dm, c, s) { saveRow = -1; saveCol = -1; @@ -28,7 +28,7 @@ string AverageLinkage::getTag() { /***********************************************************************/ //This function updates the distance based on the average linkage method. -bool AverageLinkage::updateDistance(MatData& colCell, MatData& rowCell) { +bool AverageLinkage::updateDistance(PDistCell& colCell, PDistCell& rowCell) { try { if ((saveRow != smallRow) || (saveCol != smallCol)) { rowBin = rabund->get(smallRow); @@ -37,9 +37,9 @@ bool AverageLinkage::updateDistance(MatData& colCell, MatData& rowCell) { saveRow = smallRow; saveCol = smallCol; } - - colCell->dist = (colBin * colCell->dist + rowBin * rowCell->dist) / totalBin; - + //cout << "colcell.dist = " << colCell.dist << '\t' << smallRow << '\t' << smallCol << '\t' << rowCell.dist << endl; + colCell.dist = (colBin * colCell.dist + rowBin * rowCell.dist) / totalBin; + return(true); } catch(exception& e) { diff --git a/calculator.h b/calculator.h index 6791d83..218f8ea 100644 --- a/calculator.h +++ b/calculator.h @@ -67,86 +67,8 @@ class VecCalc //double findMax(vector); //This returns the maximum value in the vector. int numNZ(vector); //This returns the number of non-zero values in the vector. double numNZ(vector); //This returns the number of non-zero values in the vector. - //double numPos(vector); //This returns the number of positive values in the vector. - //double findMaxDiff(vector, vector); //This returns the absolute value of the maximum difference between the two vectors. - //double findDStat(vector, vector, double); //This returns the D-Statistic of the two vectors with the given total number of species. - //vector findQuartiles(vector); //This returns a vector with the first element being the index of the lower quartile of the vector and the second element being the index of the upper quartile of the vector. - //vector add(vector, double); //This adds the given number to every element in the given vector and returns the new vector. - //vector multiply(vector, double); //This multiplies every element in the given vector by the given number and returns the new vector. - //vector power(vector, double); //This raises every element in the given vector to the given number and returns the new vector. - //vector addVecs(vector,vector); //The given vectors must be the same size. This adds the ith element of the first given vector to the ith element of the second given vector and returns the new vector. - //vector multVecs(vector,vector); //The given vectors must be the same size. This multiplies the ith element of the first given vector to the ith element of the second given vector and returns the new vector. - //vector remDup(vector); //This returns a vector that contains 1 of each unique element in the given vector. The order of the elements is not changed. - //vector genCVec(vector); //This returns a cumilative vector of the given vector. The ith element of the returned vector is the sum of all the elements in the given vector up to i. - //vector genRelVec(vector); //This finds the sum of all the elements in the given vector and then divides the ith element in the given vector by that sum and then puts the result into a new vector, which is returned after all of the elements in the given vector have been used. - ///vector genDiffVec(vector, vector);//This subtracts the ith element of the second given vector from the ith element of the first given vector and returns the new vector. - //vector genCSVec(vector);//This calculates the number of species that have the same number of individuals as the ith element of the given vector and then returns a cumulative vector. - //vector genTotVec(vector >); //This adds up the ith element of all the columns and puts that value into a new vector. It those this for all the rows and then returns the new vector. - //vector quicksort(vector); //This sorts the given vector from highest to lowest and returns the sorted vector. - //vector > gen2DVec(vector, int, int); //(vector, #rows/columns, 0 if the second parameter was rows, 1 if the second parameter was columns) Transforms a single vector that was formatted like a table into a 2D vector. - //vector getSData(char[]);//This takes a file name as a parameter and reads all of the data in the file into a vector. }; -/**************************************************************************************************/ - -/*This Class is similar to the GeometricSeries.h class. It calculates -the broken stick distribution of the table and prints the D-Statistic -and the confidence limits for the Kolmogorov-Smirnov 1-Sample test -with a 95% confidence level. - -class BrokenStick -{ - public: - void doBStick(vector); -}; - -//**************************************************************************************************/ -/*This Class calculates the geometric series distribution for the data. -It prints the D-Statistic and the critical values for the Kolmogorov-Smirnov -1-sample test at the 95% confidence interval.*/ - -/*class GeometricSeries -{ - public: - void doGeomTest(vector); -};*/ - -/************************************************************************************************** -//This Class calculates the jackknifed estimate of the data and -//prints it and the confidence limits at a chosen confidence level. - -class Jackknifing -{ - public: - void doJK(vector, double); -}; -/************************************************************************************************** -/*This Class stores calculates the Kolmogorov-Smirnov 2-Sample test between two samples. -It prints the D-Statistic and the critical value for the test at -the 90% and 95% confidence interval. - -class KS2SampleTest -{ - public: - void doKSTest(vector, vector); -}; - -/************************************************************************************************** -//This Class calculates and prints the Q-Statistic for the data. -class QStatistic -{ - public: - void doQStat(vector); -}; -/************************************************************************************************** -class SSBPDiversityIndices -{ - public: - void doSSBP(vector); - double getShan(vector vec);//The Shannon Index - double getSimp(vector vec);//The Simpson Index - double getBP(vector vec);//The Berger-Parker Index -}; /**************************************************************************************************/ //This Class stores the table of the confidence limits of the Student-T distribution. class TDTable @@ -154,26 +76,6 @@ class TDTable public: double getConfLimit(int,int); }; - -/************************************************************************************************** -//This Class stores the table of the confidence limits of the One-Sample Kolmogorov-Smirnov Test. -class KOSTable -{ - public: - double getConfLimit(int); -}; - /**************************************************************************************************/ -/*This Class calculates the truncated lognormal for the data. -It then prints the D-Statistic and the critical values for the -Kolmogorov-Smirnov 1-Sample test.* - -class TrunLN -{ - public: - void doTrunLN(vector, vector); -}; -/**************************************************************************************************/ - #endif diff --git a/cluster.cpp b/cluster.cpp index ac9f448..2a3e751 100644 --- a/cluster.cpp +++ b/cluster.cpp @@ -10,269 +10,112 @@ #include "cluster.hpp" #include "rabundvector.hpp" #include "listvector.hpp" -#include "sparsematrix.hpp" /***********************************************************************/ -Cluster::Cluster(RAbundVector* rav, ListVector* lv, SparseMatrix* dm, float c, string f) : +Cluster::Cluster(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string f) : rabund(rav), list(lv), dMatrix(dm), method(f) { try { -/* - cout << "sizeof(MatData): " << sizeof(MatData) << endl; - cout << "sizeof(PCell*): " << sizeof(PCell*) << endl; - - int nCells = dMatrix->getNNodes(); - time_t start = time(NULL); - - MatVec matvec = MatVec(nCells); - int i = 0; - for (MatData currentCell = dMatrix->begin(); currentCell != dMatrix->end(); currentCell++) { - matvec[i++] = currentCell; - } - for (i= matvec.size();i>0;i--) { - dMatrix->rmCell(matvec[i-1]); - } - MatData it = dMatrix->begin(); - while (it != dMatrix->end()) { - it = dMatrix->rmCell(it); - } - cout << "Time to remove " << nCells << " cells: " << time(NULL) - start << " seconds" << endl; - exit(0); - MatData it = dMatrix->begin(); - cout << it->row << "/" << it->column << "/" << it->dist << endl; - dMatrix->rmCell(dMatrix->begin()); - cout << it->row << "/" << it->column << "/" << it->dist << endl; - exit(0); -*/ - - // Create a data structure to quickly access the PCell information - // for a certain sequence. It consists of a vector of lists, where - // a list contains pointers (iterators) to the all distances related - // to a certain sequence. The Vector is accessed via the index of a - // sequence in the distance matrix. -//ofstream outtemp; -//string temp = "temp"; -//m->openOutputFile(temp, outtemp); -//cout << lv->size() << endl; - seqVec = vector(lv->size()); - for (MatData currentCell = dMatrix->begin(); currentCell != dMatrix->end(); currentCell++) { -//outtemp << currentCell->row << '\t' << currentCell->column << '\t' << currentCell->dist << endl; - seqVec[currentCell->row].push_back(currentCell); - seqVec[currentCell->column].push_back(currentCell); - } -//outtemp.close(); - mapWanted = false; //set to true by mgcluster to speed up overlap merge - - //save so you can modify as it changes in average neighbor - cutoff = c; - m = MothurOut::getInstance(); - + + mapWanted = false; //set to true by mgcluster to speed up overlap merge + + //save so you can modify as it changes in average neighbor + cutoff = c; + m = MothurOut::getInstance(); } catch(exception& e) { m->errorOut(e, "Cluster", "Cluster"); exit(1); } } - -/***********************************************************************/ - -void Cluster::getRowColCells() { - try { - PCell* smallCell = dMatrix->getSmallestCell(); //find the smallest cell - this routine should probably not be in the SpMat class - - smallRow = smallCell->row; // get its row - smallCol = smallCell->column; // get its column - smallDist = smallCell->dist; // get the smallest distance - //cout << "small row = " << smallRow << "small col = " << smallCol << "small dist = " << smallDist << endl; - - rowCells = seqVec[smallRow]; // all distances related to the row index - colCells = seqVec[smallCol]; // all distances related to the column index - nRowCells = rowCells.size(); - nColCells = colCells.size(); -//cout << "num rows = " << nRowCells << "num col = " << nColCells << endl; - - //for (int i = 0; i < nColCells; i++) { cout << colCells[i]->row << '\t' << colCells[i]->column << endl; } - //for (int i = 0; i < nRowCells; i++) { cout << rowCells[i]->row << '\t' << rowCells[i]->column << endl; } - } - catch(exception& e) { - m->errorOut(e, "Cluster", "getRowColCells"); - exit(1); - } - -} -/***********************************************************************/ -// Remove the specified cell from the seqVec and from the sparse -// matrix -void Cluster::removeCell(const MatData& cell, int vrow, int vcol, bool rmMatrix){ - try { - - ull drow = cell->row; - ull dcol = cell->column; - if (((vrow >=0) && (drow != smallRow)) || - ((vcol >=0) && (dcol != smallCol))) { - ull dtemp = drow; - drow = dcol; - dcol = dtemp; - } - - ull crow; - ull ccol; - int nCells; - if (vrow < 0) { - nCells = seqVec[drow].size(); - for (vrow=0; vrowrow; - ccol = seqVec[drow][vrow]->column; - if (((crow == drow) && (ccol == dcol)) || - ((ccol == drow) && (crow == dcol))) { - break; - } - } - } - - seqVec[drow].erase(seqVec[drow].begin()+vrow); - if (vcol < 0) { - nCells = seqVec[dcol].size(); - for (vcol=0; vcolrow; - ccol = seqVec[dcol][vcol]->column; - if (((crow == drow) && (ccol == dcol)) || - ((ccol == drow) && (crow == dcol))) { - break; - } - } - } - - seqVec[dcol].erase(seqVec[dcol].begin()+vcol); - - if (rmMatrix) { - //cout << " removing = " << cell->row << '\t' << cell->column << '\t' << cell->dist << endl; - dMatrix->rmCell(cell); - // cout << "done" << endl; - } - - } - catch(exception& e) { - m->errorOut(e, "Cluster", "removeCell"); - exit(1); - } -} /***********************************************************************/ - void Cluster::clusterBins(){ try { - // cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol); - - rabund->set(smallCol, rabund->get(smallRow)+rabund->get(smallCol)); + rabund->set(smallCol, rabund->get(smallRow)+rabund->get(smallCol)); rabund->set(smallRow, 0); rabund->setLabel(toString(smallDist)); - - // cout << '\t' << rabund->get(smallRow) << '\t' << rabund->get(smallCol) << endl; } catch(exception& e) { m->errorOut(e, "Cluster", "clusterBins"); exit(1); } - - } - /***********************************************************************/ void Cluster::clusterNames(){ try { - // cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(smallRow) << '\t' << list->get(smallCol); if (mapWanted) { updateMap(); } list->set(smallCol, list->get(smallRow)+','+list->get(smallCol)); list->set(smallRow, ""); list->setLabel(toString(smallDist)); - - // cout << '\t' << list->get(smallRow) << '\t' << list->get(smallCol) << endl; } catch(exception& e) { m->errorOut(e, "Cluster", "clusterNames"); exit(1); } - } - /***********************************************************************/ -//This function clusters based on the method of the derived class -//At the moment only average and complete linkage are covered, because -//single linkage uses a different approach. void Cluster::update(double& cutOFF){ try { - getRowColCells(); - + smallCol = dMatrix->getSmallestCell(smallRow); + nColCells = dMatrix->seqVec[smallCol].size(); + nRowCells = dMatrix->seqVec[smallRow].size(); + vector foundCol(nColCells, 0); - + //cout << "small cell: " << smallRow << '\t' << smallCol << endl; int search; bool changed; - - // The vector has to be traversed in reverse order to preserve the index - // for faster removal in removeCell() + for (int i=nRowCells-1;i>=0;i--) { + if (m->control_pressed) { break; } + //if you are not the smallCell - if (!((rowCells[i]->row == smallRow) && (rowCells[i]->column == smallCol))) { - if (rowCells[i]->row == smallRow) { - search = rowCells[i]->column; - } else { - search = rowCells[i]->row; - } - + if (dMatrix->seqVec[smallRow][i].index != smallCol) { + search = dMatrix->seqVec[smallRow][i].index; + bool merged = false; for (int j=0;jrow == smallRow) && (colCells[j]->column == smallCol))) { //if you are not hte smallest distance - if (colCells[j]->row == search || colCells[j]->column == search) { + + if (dMatrix->seqVec[smallCol][j].index != smallRow) { //if you are not the smallest distance + if (dMatrix->seqVec[smallCol][j].index == search) { foundCol[j] = 1; merged = true; - changed = updateDistance(colCells[j], rowCells[i]); - // If the cell's distance changed and it had the same distance as - // the smallest distance, invalidate the mins vector in SparseMatrix - if (changed) { - if (colCells[j]->vectorMap != NULL) { - *(colCells[j]->vectorMap) = NULL; - colCells[j]->vectorMap = NULL; - } - } + changed = updateDistance(dMatrix->seqVec[smallCol][j], dMatrix->seqVec[smallRow][i]); + dMatrix->updateCellCompliment(smallCol, j); break; - } - } + }else if (dMatrix->seqVec[smallCol][j].index < search) { j+=nColCells; } //we don't have a distance for this cell + } } //if not merged it you need it for warning if ((!merged) && (method == "average" || method == "weighted")) { - //m->mothurOut("Warning: trying to merge cell " + toString(rowCells[i]->row+1) + " " + toString(rowCells[i]->column+1) + " distance " + toString(rowCells[i]->dist) + " with value above cutoff. Results may vary from using cutoff at cluster command instead of read.dist."); m->mothurOutEndLine(); - if (cutOFF > rowCells[i]->dist) { - cutOFF = rowCells[i]->dist; - //m->mothurOut("changing cutoff to " + toString(cutOFF)); m->mothurOutEndLine(); + if (cutOFF > dMatrix->seqVec[smallRow][i].dist) { + cutOFF = dMatrix->seqVec[smallRow][i].dist; } - + } - removeCell(rowCells[i], i , -1); - + dMatrix->rmCell(smallRow, i); } } clusterBins(); clusterNames(); - + // Special handling for singlelinkage case, not sure whether this // could be avoided for (int i=nColCells-1;i>=0;i--) { - if (foundCol[i] == 0) { + if (foundCol[i] == 0) { if (method == "average" || method == "weighted") { - if (!((colCells[i]->row == smallRow) && (colCells[i]->column == smallCol))) { - //m->mothurOut("Warning: merging cell " + toString(colCells[i]->row+1) + " " + toString(colCells[i]->column+1) + " distance " + toString(colCells[i]->dist) + " value above cutoff. Results may vary from using cutoff at cluster command instead of read.dist."); m->mothurOutEndLine(); - if (cutOFF > colCells[i]->dist) { - cutOFF = colCells[i]->dist; - //m->mothurOut("changing cutoff to " + toString(cutOFF)); m->mothurOutEndLine(); + if (dMatrix->seqVec[smallCol][i].index != smallRow) { //if you are not hte smallest distance + if (cutOFF > dMatrix->seqVec[smallCol][i].dist) { + cutOFF = dMatrix->seqVec[smallCol][i].dist; } } } - removeCell(colCells[i], -1, i); + dMatrix->rmCell(smallCol, i); } } + } catch(exception& e) { m->errorOut(e, "Cluster", "update"); @@ -284,21 +127,23 @@ void Cluster::setMapWanted(bool f) { try { mapWanted = f; - //initialize map - for (int i = 0; i < list->getNumBins(); i++) { - - //parse bin - string names = list->get(i); - while (names.find_first_of(',') != -1) { - //get name from bin - string name = names.substr(0,names.find_first_of(',')); - //save name and bin number - seq2Bin[name] = i; - names = names.substr(names.find_first_of(',')+1, names.length()); - } - - //get last name - seq2Bin[names] = i; + //initialize map + for (int k = 0; k < list->getNumBins(); k++) { + + string names = list->get(k); + + //parse bin + string individual = ""; + int binNameslength = names.size(); + for(int j=0;jget(smallRow); - while (names.find_first_of(',') != -1) { - //get name from bin - string name = names.substr(0,names.find_first_of(',')); - //save name and bin number - seq2Bin[name] = smallCol; - names = names.substr(names.find_first_of(',')+1, names.length()); - } - - //get last name - seq2Bin[names] = smallCol; + string individual = ""; + int binNameslength = names.size(); + for(int j=0;jerrorOut(e, "Cluster", "updateMap"); diff --git a/cluster.hpp b/cluster.hpp index d7c2737..ff5d41d 100644 --- a/cluster.hpp +++ b/cluster.hpp @@ -2,49 +2,42 @@ #define CLUSTER_H + #include "mothur.h" -#include "sparsematrix.hpp" +#include "sparsedistancematrix.h" #include "mothurout.h" class RAbundVector; class ListVector; -typedef vector MatVec; - class Cluster { public: - Cluster(RAbundVector*, ListVector*, SparseMatrix*, float, string); + Cluster(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string); virtual void update(double&); virtual string getTag() = 0; virtual void setMapWanted(bool m); virtual map getSeqtoBin() { return seq2Bin; } - -protected: - void getRowColCells(); - void removeCell(const MatData& cell, int vrow, int vcol, bool rmMatrix=true); - - virtual bool updateDistance(MatData& colCell, MatData& rowCell) = 0; - + +protected: + virtual bool updateDistance(PDistCell& colCell, PDistCell& rowCell) = 0; + virtual void clusterBins(); virtual void clusterNames(); virtual void updateMap(); RAbundVector* rabund; ListVector* list; - SparseMatrix* dMatrix; + SparseDistanceMatrix* dMatrix; - int smallRow; - int smallCol; + ull smallRow; + ull smallCol; float smallDist; bool mapWanted; float cutoff; map seq2Bin; string method; - vector seqVec; // contains vectors of cells related to a certain sequence - MatVec rowCells; - MatVec colCells; ull nRowCells; ull nColCells; MothurOut* m; @@ -54,33 +47,33 @@ protected: class CompleteLinkage : public Cluster { public: - CompleteLinkage(RAbundVector*, ListVector*, SparseMatrix*, float, string); - bool updateDistance(MatData& colCell, MatData& rowCell); + CompleteLinkage(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string); + bool updateDistance(PDistCell& colCell, PDistCell& rowCell); string getTag(); private: - + }; /***********************************************************************/ class SingleLinkage : public Cluster { public: - SingleLinkage(RAbundVector*, ListVector*, SparseMatrix*, float, string); + SingleLinkage(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string); void update(double&); - bool updateDistance(MatData& colCell, MatData& rowCell); + bool updateDistance(PDistCell& colCell, PDistCell& rowCell); string getTag(); private: - + }; /***********************************************************************/ class AverageLinkage : public Cluster { public: - AverageLinkage(RAbundVector*, ListVector*, SparseMatrix*, float, string); - bool updateDistance(MatData& colCell, MatData& rowCell); + AverageLinkage(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string); + bool updateDistance(PDistCell& colCell, PDistCell& rowCell); string getTag(); private: @@ -89,15 +82,15 @@ private: int rowBin; int colBin; int totalBin; - + }; /***********************************************************************/ class WeightedLinkage : public Cluster { public: - WeightedLinkage(RAbundVector*, ListVector*, SparseMatrix*, float, string); - bool updateDistance(MatData& colCell, MatData& rowCell); + WeightedLinkage(RAbundVector*, ListVector*, SparseDistanceMatrix*, float, string); + bool updateDistance(PDistCell& colCell, PDistCell& rowCell); string getTag(); private: @@ -107,4 +100,6 @@ private: /***********************************************************************/ -#endif + + +#endif \ No newline at end of file diff --git a/clusterclassic.h b/clusterclassic.h index 932c806..a650bbf 100644 --- a/clusterclassic.h +++ b/clusterclassic.h @@ -41,13 +41,13 @@ private: struct colDist { int col; int row; - double dist; + float dist; colDist(int r, int c, double d) : row(r), col(c), dist(d) {} }; RAbundVector* rabund; ListVector* list; - vector< vector > dMatrix; + vector< vector > dMatrix; //vector rowSmallDists; int smallRow; diff --git a/clustercommand.cpp b/clustercommand.cpp index 8019def..56ce8bb 100644 --- a/clustercommand.cpp +++ b/clustercommand.cpp @@ -288,7 +288,7 @@ int ClusterCommand::execute(){ read->read(nameMap); list = read->getListVector(); - matrix = read->getMatrix(); + matrix = read->getDMatrix(); rabund = new RAbundVector(list->getRAbundVector()); delete read; @@ -333,7 +333,7 @@ int ClusterCommand::execute(){ loops = 0; double saveCutoff = cutoff; - while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){ + while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){ if (m->control_pressed) { //clean up delete list; delete matrix; delete rabund; delete cluster; @@ -353,8 +353,8 @@ int ClusterCommand::execute(){ loops++; cluster->update(cutoff); - - float dist = matrix->getSmallDist(); + + float dist = matrix->getSmallDist(); float rndDist; if (hard) { rndDist = m->ceilDist(dist, precision); diff --git a/clustercommand.h b/clustercommand.h index 7349961..e3dd214 100644 --- a/clustercommand.h +++ b/clustercommand.h @@ -14,7 +14,7 @@ #include "sabundvector.hpp" #include "listvector.hpp" #include "cluster.hpp" -#include "sparsematrix.hpp" +#include "sparsedistancematrix.h" /* The cluster() command: The cluster command outputs a .list , .rabund and .sabund files. @@ -44,7 +44,7 @@ public: private: Cluster* cluster; - SparseMatrix* matrix; + SparseDistanceMatrix* matrix; ListVector* list; RAbundVector* rabund; RAbundVector oldRAbund; diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp index fc0211e..b097f38 100644 --- a/clustersplitcommand.cpp +++ b/clustersplitcommand.cpp @@ -1197,7 +1197,7 @@ string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile string listFileName = ""; Cluster* cluster = NULL; - SparseMatrix* matrix = NULL; + SparseDistanceMatrix* matrix = NULL; ListVector* list = NULL; ListVector oldList; RAbundVector* rabund = NULL; @@ -1227,7 +1227,7 @@ string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile list = read->getListVector(); oldList = *list; - matrix = read->getMatrix(); + matrix = read->getDMatrix(); delete read; read = NULL; delete nameMap; nameMap = NULL; diff --git a/clustersplitcommand.h b/clustersplitcommand.h index 28de948..d46bb8e 100644 --- a/clustersplitcommand.h +++ b/clustersplitcommand.h @@ -15,7 +15,7 @@ #include "sabundvector.hpp" #include "listvector.hpp" #include "cluster.hpp" -#include "sparsematrix.hpp" +#include "sparsedistancematrix.h" #include "readcluster.h" #include "splitmatrix.h" #include "readphylip.h" diff --git a/completelinkage.cpp b/completelinkage.cpp index 86e9054..06ed2db 100644 --- a/completelinkage.cpp +++ b/completelinkage.cpp @@ -3,7 +3,7 @@ /***********************************************************************/ -CompleteLinkage::CompleteLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm, float c, string s) : +CompleteLinkage::CompleteLinkage(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string s) : Cluster(rav, lv, dm, c, s) {} @@ -16,11 +16,11 @@ string CompleteLinkage::getTag() { /***********************************************************************/ //This function updates the distance based on the furthest neighbor method. -bool CompleteLinkage::updateDistance(MatData& colCell, MatData& rowCell) { +bool CompleteLinkage::updateDistance(PDistCell& colCell, PDistCell& rowCell) { try { bool changed = false; - if (colCell->dist < rowCell->dist) { - colCell->dist = rowCell->dist; + if (colCell.dist < rowCell.dist) { + colCell.dist = rowCell.dist; changed = true; } return(changed); diff --git a/getoturepcommand.cpp b/getoturepcommand.cpp index 4ebde29..4967f24 100644 --- a/getoturepcommand.cpp +++ b/getoturepcommand.cpp @@ -359,16 +359,19 @@ int GetOTURepCommand::execute(){ list = readMatrix->getListVector(); - SparseMatrix* matrix = readMatrix->getMatrix(); + SparseDistanceMatrix* matrix = readMatrix->getDMatrix(); // Create a data structure to quickly access the distance information. // It consists of a vector of distance maps, where each map contains // all distances of a certain sequence. Vector and maps are accessed // via the index of a sequence in the distance matrix seqVec = vector(list->size()); - for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) { - if (m->control_pressed) { delete readMatrix; return 0; } - seqVec[currentCell->row][currentCell->column] = currentCell->dist; + for (int i = 0; i < matrix->seqVec.size(); i++) { + for (int j = 0; j < matrix->seqVec[i].size(); j++) { + if (m->control_pressed) { delete readMatrix; return 0; } + //already added everyone else in row + if (i < matrix->seqVec[i][j].index) { seqVec[i][matrix->seqVec[i][j].index] = matrix->seqVec[i][j].dist; } + } } //add dummy map for unweighted calc SeqMap dummy; diff --git a/getoturepcommand.h b/getoturepcommand.h index 5af0340..d19a396 100644 --- a/getoturepcommand.h +++ b/getoturepcommand.h @@ -19,7 +19,6 @@ #include "readmatrix.hpp" #include "formatmatrix.h" -typedef list::iterator MatData; typedef map SeqMap; struct repStruct { diff --git a/getsharedotucommand.cpp b/getsharedotucommand.cpp index 9beca6a..1b69a25 100644 --- a/getsharedotucommand.cpp +++ b/getsharedotucommand.cpp @@ -417,10 +417,11 @@ int GetSharedOTUCommand::process(ListVector* shared) { vector namesOfSeqsInThisBin; - string names = shared->get(i); - while ((names.find_first_of(',') != -1)) { - string name = names.substr(0,names.find_first_of(',')); - names = names.substr(names.find_first_of(',')+1, names.length()); + string names = shared->get(i); + vector binNames; + m->splitAtComma(names, binNames); + for(int j = 0; j < binNames.size(); j++) { + string name = binNames[j]; //find group string seqGroup = groupMap->getGroup(name); @@ -436,20 +437,6 @@ int GetSharedOTUCommand::process(ListVector* shared) { else { atLeastOne[seqGroup]++; } } - //get last name - string seqGroup = groupMap->getGroup(names); - if (output != "accnos") { - namesOfSeqsInThisBin.push_back((names + "|" + seqGroup + "|" + toString(i+1))); - }else { namesOfSeqsInThisBin.push_back(names); } - - if (seqGroup == "not found") { m->mothurOut(names + " is not in your groupfile. Please correct."); m->mothurOutEndLine(); exit(1); } - - //is this seq in one of hte groups we care about - it = groupFinder.find(seqGroup); - if (it == groupFinder.end()) { uniqueOTU = false; } //you have a sequence from a group you don't want - else { atLeastOne[seqGroup]++; } - - //make sure you have at least one seq from each group you want bool sharedByAll = true; map::iterator it2; diff --git a/mgclustercommand.cpp b/mgclustercommand.cpp index a845e0b..dd98b37 100644 --- a/mgclustercommand.cpp +++ b/mgclustercommand.cpp @@ -287,7 +287,7 @@ int MGClusterCommand::execute(){ if (!hclusterWanted) { //get distmatrix and overlap - SparseMatrix* distMatrix = read->getDistMatrix(); + SparseDistanceMatrix* distMatrix = read->getDistMatrix(); overlapMatrix = read->getOverlapMatrix(); //already sorted by read delete read; diff --git a/mothur.h b/mothur.h index 2c143e8..25b803f 100644 --- a/mothur.h +++ b/mothur.h @@ -88,6 +88,7 @@ using namespace std; typedef unsigned long ull; +typedef unsigned short intDist; struct IntNode { int lvalue; @@ -119,7 +120,13 @@ struct diffPair { reverseProb = rp; } }; - +/***********************************************************************/ +struct PDistCell{ + ull index; + float dist; + PDistCell() : index(0), dist(0) {}; + PDistCell(ull c, float d) : index(c), dist(d) {} +}; /************************************************************/ struct clusterNode { int numSeq; @@ -158,10 +165,14 @@ struct spearmanRank { spearmanRank(string n, float s) : name(n), score(s) {} }; +//*********************************************************************** +inline bool compareIndexes(PDistCell left, PDistCell right){ + return (left.index > right.index); +} //******************************************************************************************************************** //sorts highest to lowest inline bool compareSpearman(spearmanRank left, spearmanRank right){ - return (left.score > right.score); + return (left.score < right.score); } //******************************************************************************************************************** //sorts highest to lowest diff --git a/mothurout.cpp b/mothurout.cpp index 14840c1..dfcf25b 100644 --- a/mothurout.cpp +++ b/mothurout.cpp @@ -2017,6 +2017,28 @@ bool MothurOut::mothurConvert(string item, int& num){ exit(1); } } +/***********************************************************************/ +bool MothurOut::mothurConvert(string item, intDist& num){ + try { + bool error = false; + + if (isNumeric1(item)) { + convert(item, num); + }else { + num = 0; + error = true; + mothurOut("[ERROR]: cannot convert " + item + " to an integer."); mothurOutEndLine(); + commandInputsConvertError = true; + } + + return error; + } + catch(exception& e) { + errorOut(e, "MothurOut", "mothurConvert"); + exit(1); + } +} + /***********************************************************************/ bool MothurOut::isNumeric1(string stringToCheck){ try { diff --git a/mothurout.h b/mothurout.h index ac47d79..77c5a80 100644 --- a/mothurout.h +++ b/mothurout.h @@ -111,6 +111,7 @@ class MothurOut { int readNames(string, vector&, map&); int mothurRemove(string); bool mothurConvert(string, int&); //use for converting user inputs. Sets commandInputsConvertError to true if error occurs. Engines check this. + bool mothurConvert(string, intDist&); //use for converting user inputs. Sets commandInputsConvertError to true if error occurs. Engines check this. bool mothurConvert(string, float&); //use for converting user inputs. Sets commandInputsConvertError to true if error occurs. Engines check this. bool mothurConvert(string, double&); //use for converting user inputs. Sets commandInputsConvertError to true if error occurs. Engines check this. diff --git a/phylodiversitycommand.cpp b/phylodiversitycommand.cpp index 9d68cb5..ddd2b31 100644 --- a/phylodiversitycommand.cpp +++ b/phylodiversitycommand.cpp @@ -346,7 +346,7 @@ int PhyloDiversityCommand::execute(){ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } - m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run unifrac.unweighted."); m->mothurOutEndLine(); + m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run phylo.diversity."); m->mothurOutEndLine(); m->mothurOutEndLine(); @@ -460,7 +460,7 @@ int PhyloDiversityCommand::driver(Tree* t, map< string, vector >& div, ma for (int l = 0; l < numIters; l++) { random_shuffle(randomLeaf.begin(), randomLeaf.end()); - cout << l << endl; + //initialize counts map counts; vector< map > countedBranch; diff --git a/readblast.cpp b/readblast.cpp index 2d49477..5f24415 100644 --- a/readblast.cpp +++ b/readblast.cpp @@ -54,7 +54,8 @@ int ReadBlast::read(NameAssignment* nameMap) { //create objects needed for read if (!hclusterWanted) { - matrix = new SparseMatrix(); + matrix = new SparseDistanceMatrix(); + matrix->resize(nseqs); }else{ overlapFile = m->getRootName(blastfile) + "overlap.dist"; distFile = m->getRootName(blastfile) + "hclusterDists.dist"; @@ -185,8 +186,13 @@ int ReadBlast::read(NameAssignment* nameMap) { //is this distance below cutoff if (distance < cutoff) { if (!hclusterWanted) { - PCell value(itA->second, it->first, distance); - matrix->addCell(value); + if (itA->second < it->first) { + PDistCell value(it->first, distance); + matrix->addCell(itA->second, value); + }else { + PDistCell value(itA->second, distance); + matrix->addCell(it->first, value); + } }else{ outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl; } @@ -252,8 +258,13 @@ int ReadBlast::read(NameAssignment* nameMap) { //is this distance below cutoff if (distance < cutoff) { if (!hclusterWanted) { - PCell value(itA->second, it->first, distance); - matrix->addCell(value); + if (itA->second < it->first) { + PDistCell value(it->first, distance); + matrix->addCell(itA->second, value); + }else { + PDistCell value(itA->second, distance); + matrix->addCell(it->first, value); + } }else{ outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl; } diff --git a/readblast.h b/readblast.h index fd32380..ef5ff9a 100644 --- a/readblast.h +++ b/readblast.h @@ -10,7 +10,7 @@ */ #include "mothur.h" -#include "sparsematrix.hpp" +#include "sparsedistancematrix.h" #include "nameassignment.hpp" /****************************************************************************************/ @@ -26,7 +26,7 @@ public: ~ReadBlast() {} int read(NameAssignment*); - SparseMatrix* getDistMatrix() { return matrix; } + SparseDistanceMatrix* getDistMatrix() { return matrix; } vector getOverlapMatrix() { return overlap; } string getOverlapFile() { return overlapFile; } string getDistFile() { return distFile; } @@ -38,7 +38,7 @@ private: bool minWanted; //if true choose min bsr, if false choose max bsr bool hclusterWanted; - SparseMatrix* matrix; + SparseDistanceMatrix* matrix; vector overlap; MothurOut* m; diff --git a/readcolumn.cpp b/readcolumn.cpp index 53a8c42..665dcab 100644 --- a/readcolumn.cpp +++ b/readcolumn.cpp @@ -34,7 +34,7 @@ int ReadColumnMatrix::read(NameAssignment* nameMap){ string firstName, secondName; float distance; int nseqs = nameMap->size(); - + DMatrix->resize(nseqs); list = new ListVector(nameMap->getListVector()); Progress* reading = new Progress("Reading matrix: ", nseqs * nseqs); @@ -63,33 +63,34 @@ int ReadColumnMatrix::read(NameAssignment* nameMap){ if(distance < cutoff && itA != itB){ if(itA->second > itB->second){ - PCell value(itA->second, itB->second, distance); - + PDistCell value(itA->second, distance); + + if(refRow == refCol){ // in other words, if we haven't loaded refRow and refCol... refRow = itA->second; refCol = itB->second; - D->addCell(value); + DMatrix->addCell(itB->second, value); } else if(refRow == itA->second && refCol == itB->second){ lt = 0; } else{ - D->addCell(value); + DMatrix->addCell(itB->second, value); } } else if(itA->second < itB->second){ - PCell value(itB->second, itA->second, distance); + PDistCell value(itB->second, distance); if(refRow == refCol){ // in other words, if we haven't loaded refRow and refCol... refRow = itA->second; refCol = itB->second; - D->addCell(value); + DMatrix->addCell(itA->second, value); } else if(refRow == itB->second && refCol == itA->second){ lt = 0; } else{ - D->addCell(value); + DMatrix->addCell(itA->second, value); } } reading->update(itA->second * nseqs); @@ -100,7 +101,7 @@ int ReadColumnMatrix::read(NameAssignment* nameMap){ if(lt == 0){ // oops, it was square fileHandle.close(); //let's start over - D->clear(); //let's start over + DMatrix->clear(); //let's start over m->openInputFile(distFile, fileHandle); //let's start over @@ -119,8 +120,8 @@ int ReadColumnMatrix::read(NameAssignment* nameMap){ else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert. if(distance < cutoff && itA->second > itB->second){ - PCell value(itA->second, itB->second, distance); - D->addCell(value); + PDistCell value(itA->second, distance); + DMatrix->addCell(itB->second, value); reading->update(itA->second * nseqs); } @@ -145,10 +146,6 @@ int ReadColumnMatrix::read(NameAssignment* nameMap){ } /***********************************************************************/ - -ReadColumnMatrix::~ReadColumnMatrix(){ - //delete D; - //delete list; -} - +ReadColumnMatrix::~ReadColumnMatrix(){} +/***********************************************************************/ diff --git a/readmatrix.hpp b/readmatrix.hpp index 49ae294..5ef8afa 100644 --- a/readmatrix.hpp +++ b/readmatrix.hpp @@ -12,27 +12,26 @@ #include "mothur.h" #include "listvector.hpp" -#include "sparsematrix.hpp" #include "nameassignment.hpp" +#include "sparsedistancematrix.h" class SparseMatrix; class ReadMatrix { public: - ReadMatrix(){ D = new SparseMatrix(); m = MothurOut::getInstance(); } + ReadMatrix(){ DMatrix = new SparseDistanceMatrix(); m = MothurOut::getInstance(); } virtual ~ReadMatrix() {} virtual int read(NameAssignment*){ return 1; } void setCutoff(float c) { cutoff = c; } - SparseMatrix* getMatrix() { return D; } + SparseDistanceMatrix* getDMatrix() { return DMatrix; } ListVector* getListVector() { return list; } -// OrderVector* getOrderVector() { return order; } int successOpen; protected: - SparseMatrix* D; + SparseDistanceMatrix* DMatrix; ListVector* list; float cutoff; MothurOut* m; diff --git a/readphylip.cpp b/readphylip.cpp index 1c529b2..d256f92 100644 --- a/readphylip.cpp +++ b/readphylip.cpp @@ -33,7 +33,7 @@ int ReadPhylipMatrix::read(NameAssignment* nameMap){ try { float distance; - int square, nseqs; + int square, nseqs; string name; vector matrixNames; @@ -72,7 +72,8 @@ int ReadPhylipMatrix::read(NameAssignment* nameMap){ } Progress* reading; - + DMatrix->resize(nseqs); + if(square == 0){ reading = new Progress("Reading matrix: ", nseqs * (nseqs - 1) / 2); @@ -95,14 +96,13 @@ int ReadPhylipMatrix::read(NameAssignment* nameMap){ if (m->control_pressed) { delete reading; fileHandle.close(); return 0; } fileHandle >> distance; - if (distance == -1) { distance = 1000000; } else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert. if(distance < cutoff){ - PCell value(i, j, distance); - D->addCell(value); + PDistCell value(i, distance); + DMatrix->addCell(j, value); } index++; reading->update(index); @@ -121,8 +121,8 @@ int ReadPhylipMatrix::read(NameAssignment* nameMap){ else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert. if(distance < cutoff){ - PCell value(nameMap->get(matrixNames[i]), nameMap->get(matrixNames[j]), distance); - D->addCell(value); + PDistCell value(nameMap->get(matrixNames[i]), distance); + DMatrix->addCell(nameMap->get(matrixNames[j]), value); } index++; reading->update(index); @@ -153,8 +153,8 @@ int ReadPhylipMatrix::read(NameAssignment* nameMap){ else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert. if(distance < cutoff && j < i){ - PCell value(i, j, distance); - D->addCell(value); + PDistCell value(i, distance); + DMatrix->addCell(j, value); } index++; reading->update(index); @@ -173,8 +173,8 @@ int ReadPhylipMatrix::read(NameAssignment* nameMap){ else if (sim) { distance = 1.0 - distance; } //user has entered a sim matrix that we need to convert. if(distance < cutoff && j < i){ - PCell value(nameMap->get(matrixNames[i]), nameMap->get(matrixNames[j]), distance); - D->addCell(value); + PDistCell value(nameMap->get(matrixNames[i]), distance); + DMatrix->addCell(nameMap->get(matrixNames[j]), value); } index++; reading->update(index); @@ -187,20 +187,11 @@ int ReadPhylipMatrix::read(NameAssignment* nameMap){ reading->finish(); delete reading; - + list->setLabel("0"); fileHandle.close(); - /* if(nameMap != NULL){ - for(int i=0;ierase(matrixNames[i]); - } - if(nameMap->size() > 0){ - //should probably tell them what is missing if we missed something - m->mothurOut("missed something\t" + toString(nameMap->size())); m->mothurOutEndLine(); - } - } */ - + return 1; } @@ -211,8 +202,6 @@ int ReadPhylipMatrix::read(NameAssignment* nameMap){ } /***********************************************************************/ +ReadPhylipMatrix::~ReadPhylipMatrix(){} +/***********************************************************************/ -ReadPhylipMatrix::~ReadPhylipMatrix(){ - // delete D; - // delete list; -} diff --git a/seqerrorcommand.cpp b/seqerrorcommand.cpp index 9a7b814..5ec6cf2 100644 --- a/seqerrorcommand.cpp +++ b/seqerrorcommand.cpp @@ -821,12 +821,16 @@ void SeqErrorCommand::getReferences(){ // // int endPos = rdb->referenceSeqs[i].getEndPos(); // if(endPos < minEndPos) { minEndPos = endPos; } + if (rdb->referenceSeqs[i].getNumBases() == 0) { + m->mothurOut("[WARNING]: " + rdb->referenceSeqs[i].getName() + " is blank, ignoring.");m->mothurOutEndLine(); + }else { + referenceSeqs.push_back(rdb->referenceSeqs[i]); + } - referenceSeqs.push_back(rdb->referenceSeqs[i]); } referenceFileName = rdb->getSavedReference(); - m->mothurOut("It took " + toString(time(NULL) - start) + " to load " + toString(rdb->referenceSeqs.size()) + " sequences.");m->mothurOutEndLine(); + m->mothurOut("It took " + toString(time(NULL) - start) + " to load " + toString(referenceSeqs.size()) + " sequences.");m->mothurOutEndLine(); }else { int start = time(NULL); @@ -844,9 +848,12 @@ void SeqErrorCommand::getReferences(){ // // int endPos = currentSeq.getEndPos(); // if(endPos < minEndPos) { minEndPos = endPos; } - referenceSeqs.push_back(currentSeq); - - if (rdb->save) { rdb->referenceSeqs.push_back(currentSeq); } + if (currentSeq.getNumBases() == 0) { + m->mothurOut("[WARNING]: " + currentSeq.getName() + " is blank, ignoring.");m->mothurOutEndLine(); + }else { + referenceSeqs.push_back(currentSeq); + if (rdb->save) { rdb->referenceSeqs.push_back(currentSeq); } + } m->gobble(referenceFile); } @@ -860,7 +867,7 @@ void SeqErrorCommand::getReferences(){ for(int i=0;imothurOut("Warning: " + toString(numAmbigSeqs) + " reference sequences have ambiguous bases, these bases will be ignored\n"); @@ -964,7 +971,6 @@ int SeqErrorCommand::getErrors(Sequence query, Sequence reference, Compare& erro errors.errorRate = (double)(errors.total-errors.matches) / (double)errors.total; errors.queryName = query.getName(); errors.refName = reference.getName(); - //return errors; return 0; } diff --git a/sequence.cpp b/sequence.cpp index a1e787b..a359607 100644 --- a/sequence.cpp +++ b/sequence.cpp @@ -682,7 +682,7 @@ int Sequence::getEndPos(){ //******************************************************************************************************************** void Sequence::padFromPos(int end){ - cout << end << '\t' << endPos << endl; + //cout << end << '\t' << endPos << endl; for(int j = end; j < endPos; j++) { aligned[j] = '.'; } diff --git a/shhhercommand.cpp b/shhhercommand.cpp index 115e930..c34f25d 100644 --- a/shhhercommand.cpp +++ b/shhhercommand.cpp @@ -802,7 +802,7 @@ string ShhherCommand::createNamesFile(){ duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ','; } - string nameFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName) + getOutputFileNameTag("name"); + string nameFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + getOutputFileNameTag("name"); ofstream nameFile; m->openOutputFile(nameFileName, nameFile); @@ -2309,7 +2309,7 @@ int ShhherCommand::driver(vector filenames, string thisCompositeFASTAFil if (m->control_pressed) { break; } vector otuCounts(numOTUs, 0); - for(int i=0;iread(clusterNameMap); ListVector* list = read->getListVector(); - SparseMatrix* matrix = read->getMatrix(); + SparseDistanceMatrix* matrix = read->getDMatrix(); delete read; delete clusterNameMap; diff --git a/singlelinkage.cpp b/singlelinkage.cpp index 1a1b2ae..3af1ea0 100644 --- a/singlelinkage.cpp +++ b/singlelinkage.cpp @@ -5,7 +5,7 @@ /***********************************************************************/ -SingleLinkage::SingleLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm, float c, string s) : +SingleLinkage::SingleLinkage(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string s) : Cluster(rav, lv, dm, c, s) {} @@ -18,9 +18,11 @@ string SingleLinkage::getTag() { /***********************************************************************/ //This function clusters based on the single linkage method. -void SingleLinkage::update(double& cutOFF){ +void SingleLinkage::update(double& cutOFF){ try { - getRowColCells(); + smallCol = dMatrix->getSmallestCell(smallRow); + nColCells = dMatrix->seqVec[smallCol].size(); + nRowCells = dMatrix->seqVec[smallRow].size(); vector deleted(nRowCells, false); int rowInd; @@ -30,55 +32,45 @@ void SingleLinkage::update(double& cutOFF){ // The vector has to be traversed in reverse order to preserve the index // for faster removal in removeCell() for (int i=nRowCells-1;i>=0;i--) { - if ((rowCells[i]->row == smallRow) && (rowCells[i]->column == smallCol)) { - rowInd = i; // The index of the smallest distance cell in rowCells - } else { - if (rowCells[i]->row == smallRow) { - search = rowCells[i]->column; - } else { - search = rowCells[i]->row; - } - - for (int j=0;jrow == smallRow) && (colCells[j]->column == smallCol))) { - if (colCells[j]->row == search || colCells[j]->column == search) { - changed = updateDistance(colCells[j], rowCells[i]); - // If the cell's distance changed and it had the same distance as - // the smallest distance, invalidate the mins vector in SparseMatrix - if (changed) { - if (colCells[j]->vectorMap != NULL) { - *(colCells[j]->vectorMap) = NULL; - colCells[j]->vectorMap = NULL; - } - } - removeCell(rowCells[i], i , -1); - deleted[i] = true; - break; - } - } - } - if (!deleted[i]) { - // Assign the cell to the new cluster - // remove the old cell from seqVec and add the cell - // with the new row and column assignment again - removeCell(rowCells[i], i , -1, false); - if (search < smallCol){ - rowCells[i]->row = smallCol; - rowCells[i]->column = search; - } else { - rowCells[i]->row = search; - rowCells[i]->column = smallCol; - } - seqVec[rowCells[i]->row].push_back(rowCells[i]); - seqVec[rowCells[i]->column].push_back(rowCells[i]); - } - } + if (dMatrix->seqVec[smallRow][i].index == smallCol) { + rowInd = i; // The index of the smallest distance cell in rowCells + } else { + search = dMatrix->seqVec[smallRow][i].index; + + for (int j=0;jseqVec[smallCol][j].index != smallRow) { //if you are not the small cell + if (dMatrix->seqVec[smallCol][j].index == search) { + changed = updateDistance(dMatrix->seqVec[smallCol][j], dMatrix->seqVec[smallRow][i]); + dMatrix->updateCellCompliment(smallCol, j); + dMatrix->rmCell(smallRow, i); + deleted[i] = true; + break; + } + } + } + if (!deleted[i]) { + // Assign the cell to the new cluster + // remove the old cell from seqVec and add the cell + // with the new row and column assignment again + float distance = dMatrix->seqVec[smallRow][i].dist; + dMatrix->rmCell(smallRow, i); + if (search < smallCol){ + PDistCell value(smallCol, distance); + dMatrix->addCell(search, value); + } else { + PDistCell value(search, distance); + dMatrix->addCell(smallCol, value); + } + sort(dMatrix->seqVec[smallCol].begin(), dMatrix->seqVec[smallCol].end(), compareIndexes); + sort(dMatrix->seqVec[search].begin(), dMatrix->seqVec[search].end(), compareIndexes); + } + } } clusterBins(); clusterNames(); // remove also the cell with the smallest distance - removeCell(rowCells[rowInd], -1 , -1); + dMatrix->rmCell(smallRow, rowInd); } catch(exception& e) { m->errorOut(e, "SingleLinkage", "update"); @@ -89,11 +81,11 @@ void SingleLinkage::update(double& cutOFF){ /***********************************************************************/ //This function updates the distance based on the nearest neighbor method. -bool SingleLinkage::updateDistance(MatData& colCell, MatData& rowCell) { +bool SingleLinkage::updateDistance(PDistCell& colCell, PDistCell& rowCell) { try { bool changed = false; - if (colCell->dist > rowCell->dist) { - colCell->dist = rowCell->dist; + if (colCell.dist > rowCell.dist) { + colCell.dist = rowCell.dist; } return(changed); } diff --git a/sparsedistancematrix.cpp b/sparsedistancematrix.cpp new file mode 100644 index 0000000..8035e13 --- /dev/null +++ b/sparsedistancematrix.cpp @@ -0,0 +1,149 @@ +// +// sparsedistancematrix.cpp +// Mothur +// +// Created by Sarah Westcott on 7/16/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. +// + +#include "sparsedistancematrix.h" + + +/***********************************************************************/ + +SparseDistanceMatrix::SparseDistanceMatrix() : numNodes(0), smallDist(1e6){ m = MothurOut::getInstance(); sorted=false; aboveCutoff = 1e6; } + +/***********************************************************************/ + +int SparseDistanceMatrix::getNNodes(){ + return numNodes; +} +/***********************************************************************/ + +void SparseDistanceMatrix::clear(){ + for (int i = 0; i < seqVec.size(); i++) { seqVec[i].clear(); } + seqVec.clear(); +} + +/***********************************************************************/ + +float SparseDistanceMatrix::getSmallDist(){ + return smallDist; +} +/***********************************************************************/ + +int SparseDistanceMatrix::updateCellCompliment(ull row, ull col){ + try { + + ull vrow = seqVec[row][col].index; + ull vcol = 0; + + //find the columns entry for this cell as well + for (int i = 0; i < seqVec[vrow].size(); i++) { if (seqVec[vrow][i].index == row) { vcol = i; break; } } + + seqVec[vrow][vcol].dist = seqVec[row][col].dist; + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SparseDistanceMatrix", "updateCellCompliment"); + exit(1); + } +} +/***********************************************************************/ + +int SparseDistanceMatrix::rmCell(ull row, ull col){ + try { + numNodes-=2; + + ull vrow = seqVec[row][col].index; + ull vcol = 0; + + //find the columns entry for this cell as well + for (int i = 0; i < seqVec[vrow].size(); i++) { if (seqVec[vrow][i].index == row) { vcol = i; break; } } + + seqVec[vrow].erase(seqVec[vrow].begin()+vcol); + seqVec[row].erase(seqVec[row].begin()+col); + + return(0); + } + catch(exception& e) { + m->errorOut(e, "SparseDistanceMatrix", "rmCell"); + exit(1); + } +} +/***********************************************************************/ +void SparseDistanceMatrix::addCell(ull row, PDistCell cell){ + try { + numNodes+=2; + if(cell.dist < smallDist){ smallDist = cell.dist; } + + seqVec[row].push_back(cell); + PDistCell temp(row, cell.dist); + seqVec[cell.index].push_back(temp); + } + catch(exception& e) { + m->errorOut(e, "SparseDistanceMatrix", "addCell"); + exit(1); + } +} +/***********************************************************************/ + +ull SparseDistanceMatrix::getSmallestCell(ull& row){ + try { + if (!sorted) { sortSeqVec(); sorted = true; } + + vector mins; + smallDist = 1e6; + + for (int i = 0; i < seqVec.size(); i++) { + for (int j = 0; j < seqVec[i].size(); j++) { + + //already checked everyone else in row + if (i < seqVec[i][j].index) { + + float dist = seqVec[i][j].dist; + + if(dist < smallDist){ //found a new smallest distance + mins.clear(); + smallDist = dist; + PDistCellMin temp(i, seqVec[i][j].index); + mins.push_back(temp); + } + else if(dist == smallDist){ //if a subsequent distance is the same as mins distance add the new iterator to the mins vector + PDistCellMin temp(i, seqVec[i][j].index); + mins.push_back(temp); + } + }else { j+=seqVec[i].size(); } //stop looking + } + } + + random_shuffle(mins.begin(), mins.end()); //randomize the order of the iterators in the mins vector + + row = mins[0].row; + ull col = mins[0].col; + + return col; + } + catch(exception& e) { + m->errorOut(e, "SparseMatrix", "getSmallestCell"); + exit(1); + } +} +/***********************************************************************/ + +int SparseDistanceMatrix::sortSeqVec(){ + try { + + //saves time in getSmallestCell, by making it so you dont search the repeats + for (int i = 0; i < seqVec.size(); i++) { sort(seqVec[i].begin(), seqVec[i].end(), compareIndexes); } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SparseMatrix", "getSmallestCell"); + exit(1); + } +} +/***********************************************************************/ + diff --git a/sparsedistancematrix.h b/sparsedistancematrix.h new file mode 100644 index 0000000..f18fdd7 --- /dev/null +++ b/sparsedistancematrix.h @@ -0,0 +1,66 @@ +#ifndef Mothur_sparsedistancematrix_h +#define Mothur_sparsedistancematrix_h + +// +// sparsedistancematrix.h +// Mothur +// +// Created by Sarah Westcott on 7/16/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. +// + +#include "mothur.h" +#include "mothurout.h" + + +class ListVector; + + +/* For each distance in a sparse matrix we have a row, column and distance. + The PDistCell consists of the column and distance. + We know the row by the distances row in the seqVec matrix. + SeqVec is square and each row is sorted so the column values are ascending to save time in the search for the smallest distance. */ + +/***********************************************************************/ +struct PDistCellMin{ + ull row; + ull col; + //PDistCell* cell; + PDistCellMin(ull r, ull c) : col(c), row(r) {} +}; +/***********************************************************************/ + + + +class SparseDistanceMatrix { + +public: + SparseDistanceMatrix(); + ~SparseDistanceMatrix(){ clear(); } + int getNNodes(); + ull getSmallestCell(ull& index); //Return the cell with the smallest distance + float getSmallDist(); + + int rmCell(ull, ull); + int updateCellCompliment(ull, ull); + void resize(ull n) { seqVec.resize(n); } + void clear(); + void addCell(ull, PDistCell); + vector > seqVec; + +private: + PDistCell smallCell; //The cell with the smallest distance + int numNodes; + + bool sorted; + int sortSeqVec(); + float smallDist, aboveCutoff; + + MothurOut* m; +}; + +/***********************************************************************/ + + + +#endif diff --git a/treegroupscommand.cpp b/treegroupscommand.cpp index 888f502..6633e51 100644 --- a/treegroupscommand.cpp +++ b/treegroupscommand.cpp @@ -286,7 +286,7 @@ TreeGroupCommand::TreeGroupCommand(string option) { TreeGroupCommand::~TreeGroupCommand(){ if (abort == false) { if (format == "sharedfile") { delete input; } - else { delete readMatrix; delete matrix; delete list; } + else { delete list; } delete tmap; } @@ -418,7 +418,8 @@ int TreeGroupCommand::execute(){ }else{ //read in dist file filename = inputfile; - + + ReadMatrix* readMatrix; if (format == "column") { readMatrix = new ReadColumnMatrix(filename); } else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(filename); } @@ -434,7 +435,7 @@ int TreeGroupCommand::execute(){ readMatrix->read(nameMap); list = readMatrix->getListVector(); - matrix = readMatrix->getMatrix(); + SparseDistanceMatrix* dMatrix = readMatrix->getDMatrix(); //make treemap tmap = new TreeMap(); @@ -457,7 +458,9 @@ int TreeGroupCommand::execute(){ if (m->control_pressed) { return 0; } - vector< vector > matrix = makeSimsDist(); + vector< vector > matrix = makeSimsDist(dMatrix); + delete readMatrix; + delete dMatrix; if (m->control_pressed) { return 0; } @@ -554,7 +557,7 @@ void TreeGroupCommand::printSims(ostream& out, vector< vector >& simMatr } } /***********************************************************/ -vector< vector > TreeGroupCommand::makeSimsDist() { +vector< vector > TreeGroupCommand::makeSimsDist(SparseDistanceMatrix* matrix) { try { numGroups = list->size(); @@ -569,13 +572,17 @@ vector< vector > TreeGroupCommand::makeSimsDist() { //go through sparse matrix and fill sims //go through each cell in the sparsematrix - for(MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++){ - //similairity = -(distance-1) - simMatrix[currentCell->row][currentCell->column] = -(currentCell->dist -1.0); - simMatrix[currentCell->column][currentCell->row] = -(currentCell->dist -1.0); - - if (m->control_pressed) { return simMatrix; } + for (int i = 0; i < matrix->seqVec.size(); i++) { + for (int j = 0; j < matrix->seqVec[i].size(); j++) { + + //already checked everyone else in row + if (i < matrix->seqVec[i][j].index) { + simMatrix[i][matrix->seqVec[i][j].index] = -(matrix->seqVec[i][j].dist -1.0); + simMatrix[matrix->seqVec[i][j].index][i] = -(matrix->seqVec[i][j].dist -1.0); + if (m->control_pressed) { return simMatrix; } + } + } } return simMatrix; diff --git a/treegroupscommand.h b/treegroupscommand.h index a0e0d42..b0ae730 100644 --- a/treegroupscommand.h +++ b/treegroupscommand.h @@ -101,12 +101,10 @@ private: Tree* createTree(vector< vector >&); void printSims(ostream&, vector< vector >&); int makeSimsShared(); - vector< vector > makeSimsDist(); + vector< vector > makeSimsDist(SparseDistanceMatrix*); int writeTree(string, Tree*); int driver(vector, int, int, vector< vector >&); - ReadMatrix* readMatrix; - SparseMatrix* matrix; NameAssignment* nameMap; ListVector* list; TreeMap* tmap; diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index 3b9d195..0036769 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -607,11 +607,11 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string Sequence currSeq(inFASTA); m->gobble(inFASTA); //cout << currSeq.getName() << '\t' << currSeq.getUnaligned().length() << endl; - if (m->debug) { m->mothurOut("[DEBUG]: " + toString(count) + " fasta = " + currSeq.getName() + '\n'); } + QualityScores currQual; if(qFileName != ""){ currQual = QualityScores(qFile); m->gobble(qFile); - if (m->debug) { m->mothurOut("[DEBUG]: qual = " + currQual.getName() + '\n'); } + if ((m->debug)&&(count>15800)) { m->mothurOut("[DEBUG]: " + toString(count) + " fasta = " + currSeq.getName() + '\n'); m->mothurOut("[DEBUG]: " + toString(getpid()) + '\n'); } } string origSeq = currSeq.getUnaligned(); @@ -883,6 +883,8 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName tempNameFileNames, lines[process], qLines[process]); + + if (m->debug) { m->mothurOut("[DEBUG]: " + toString(lines[process].start) + '\t' + toString(qLines[process].start) + '\t' + toString(getpid()) + '\n'); } //pass groupCounts to parent if(createGroup){ diff --git a/weightedlinkage.cpp b/weightedlinkage.cpp index 0851bd7..19c41ce 100644 --- a/weightedlinkage.cpp +++ b/weightedlinkage.cpp @@ -11,7 +11,7 @@ /***********************************************************************/ -WeightedLinkage::WeightedLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm, float c, string s) : +WeightedLinkage::WeightedLinkage(RAbundVector* rav, ListVector* lv, SparseDistanceMatrix* dm, float c, string s) : Cluster(rav, lv, dm, c, s) { saveRow = -1; @@ -28,7 +28,7 @@ string WeightedLinkage::getTag() { /***********************************************************************/ //This function updates the distance based on the average linkage method. -bool WeightedLinkage::updateDistance(MatData& colCell, MatData& rowCell) { +bool WeightedLinkage::updateDistance(PDistCell& colCell, PDistCell& rowCell) { try { if ((saveRow != smallRow) || (saveCol != smallCol)) { // rowBin = rabund->get(smallRow); @@ -38,7 +38,7 @@ bool WeightedLinkage::updateDistance(MatData& colCell, MatData& rowCell) { saveCol = smallCol; } - colCell->dist = (colCell->dist + rowCell->dist) / 2.0; + colCell.dist = (colCell.dist + rowCell.dist) / 2.0; return(true); }