]> git.donarmstrong.com Git - mothur.git/commitdiff
added warning about average neighbor merges around cutoff. fixed little bugs.
authorwestcott <westcott>
Tue, 19 Jan 2010 13:48:32 +0000 (13:48 +0000)
committerwestcott <westcott>
Tue, 19 Jan 2010 13:48:32 +0000 (13:48 +0000)
19 files changed:
averagelinkage.cpp
classifyseqscommand.cpp
cluster.cpp
cluster.hpp
clustercommand.cpp
completelinkage.cpp
formatcolumn.cpp
formatmatrix.h
getoturepcommand.cpp
hcluster.cpp
hcluster.h
hclustercommand.cpp
mgclustercommand.cpp
mothur.cpp
mothur.h
readdistcommand.cpp
readphylip.cpp
sharedcommand.cpp
singlelinkage.cpp

index 7a4cb88d366d4f9765d906ce0fc7ea27600a6fdd..db2c51edc5b182b52dcf540456baad7b5782c61e 100644 (file)
@@ -11,8 +11,8 @@
 
 /***********************************************************************/
 
-AverageLinkage::AverageLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm) :
-       Cluster(rav, lv, dm)
+AverageLinkage::AverageLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm, float c) :
+       Cluster(rav, lv, dm, c)
 {
        saveRow = -1;
        saveCol = -1;
@@ -37,8 +37,16 @@ bool AverageLinkage::updateDistance(MatData& colCell, MatData& rowCell) {
                        saveRow = smallRow;
                        saveCol = smallCol;
                }
-
+               
+               float oldColCell = colCell->dist;
+               
                colCell->dist = (colBin * colCell->dist + rowBin * rowCell->dist) / totalBin;
+               
+               //warn user if merge with value above cutoff produces a value below cutoff
+               if ((colCell->dist < cutoff) && ((oldColCell > cutoff) || (rowCell->dist > cutoff)) ) {
+                       mothurOut("Warning: merging " + toString(oldColCell) + " with " + toString(rowCell->dist) + ", new value = " + toString(colCell->dist) + ". Results will differ from those if cutoff was used in the read.dist command."); mothurOutEndLine();
+               }
+
                return(true);
        }
        catch(exception& e) {
index 52248979080ce4a8b3676fb3fa96baa3e7ddd24a..39dda9ffd5fb29bb872ce4ccb205882d69926c66 100644 (file)
@@ -81,7 +81,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option){
                        
                        
                        namefile = validParameter.validFile(parameters, "name", false);
-                       if (fastaFileName == "not found") { namefile = "";  }
+                       if (namefile == "not found") { namefile = "";  }
                        else { 
                                splitAtDash(namefile, namefileNames);
                                
@@ -297,7 +297,7 @@ int ClassifySeqsCommand::execute(){
 #endif 
                        //make taxonomy tree from new taxonomy file 
                        PhyloTree taxaBrowser;
-                       
+               
                        ifstream in;
                        openInputFile(tempTaxonomyFile, in);
                
index 429e19aaeacce39b1a85943b31cd32da70739239..00d8083275779399748d690b3a9fe84910d91104 100644 (file)
@@ -14,8 +14,8 @@
 
 /***********************************************************************/
 
-Cluster::Cluster(RAbundVector* rav, ListVector* lv, SparseMatrix* dm) :
-rabund(rav), list(lv), dMatrix(dm)
+Cluster::Cluster(RAbundVector* rav, ListVector* lv, SparseMatrix* dm, float c) :
+rabund(rav), list(lv), dMatrix(dm), cutoff(c)
 {
 /*
        cout << "sizeof(MatData): " << sizeof(MatData) << endl;
index a6024e79d45a52274040a53cca4f8b32d9d2e998..38897c9cf0a2a2c0c95933eb7cabc8732f95839e 100644 (file)
@@ -13,7 +13,7 @@ typedef vector<MatData> MatVec;
 class Cluster {
        
 public:
-       Cluster(RAbundVector*, ListVector*, SparseMatrix*);
+       Cluster(RAbundVector*, ListVector*, SparseMatrix*, float);
     virtual void update();                             
        virtual string getTag() = 0;
        virtual void setMapWanted(bool m);  
@@ -37,6 +37,7 @@ protected:
        int smallCol;
        float smallDist;
        bool mapWanted;
+       float cutoff;
        map<string, int> seq2Bin;
        
        vector<MatVec> seqVec;          // contains vectors of cells related to a certain sequence
@@ -50,7 +51,7 @@ protected:
 
 class CompleteLinkage : public Cluster {
 public:
-       CompleteLinkage(RAbundVector*, ListVector*, SparseMatrix*);
+       CompleteLinkage(RAbundVector*, ListVector*, SparseMatrix*, float);
        bool updateDistance(MatData& colCell, MatData& rowCell);
        string getTag();
        
@@ -62,7 +63,7 @@ private:
 
 class SingleLinkage : public Cluster {
 public:
-       SingleLinkage(RAbundVector*, ListVector*, SparseMatrix*);
+       SingleLinkage(RAbundVector*, ListVector*, SparseMatrix*, float);
     void update();
        bool updateDistance(MatData& colCell, MatData& rowCell);
        string getTag();
@@ -75,7 +76,7 @@ private:
 
 class AverageLinkage : public Cluster {
 public:
-       AverageLinkage(RAbundVector*, ListVector*, SparseMatrix*);
+       AverageLinkage(RAbundVector*, ListVector*, SparseMatrix*, float);
        bool updateDistance(MatData& colCell, MatData& rowCell);
        string getTag();
        
index baf4224c952a5a47bea7aa911f56196c6dabdead..93ba1f9b65b8146b666d32debb3df601c046b19a 100644 (file)
@@ -81,9 +81,9 @@ ClusterCommand::ClusterCommand(string option){
                                }
                                
                                //create cluster
-                               if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix); }
-                               else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix); }
-                               else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix);     }
+                               if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, matrix, cutoff); }
+                               else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, matrix, cutoff); }
+                               else if(method == "average"){   cluster = new AverageLinkage(rabund, list, matrix, cutoff);     }
                                tag = cluster->getTag();
 
                                fileroot = getRootName(globaldata->inputFileName);
index 09ba9639fdbf4146fc955a1c2ac23e696bb62cd6..a09af904fc64e3f901ef48c82b6f20469c48f348 100644 (file)
@@ -3,8 +3,8 @@
 
 /***********************************************************************/
 
-CompleteLinkage::CompleteLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm) :
-       Cluster(rav, lv, dm)
+CompleteLinkage::CompleteLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm, float c) :
+       Cluster(rav, lv, dm, c)
 {}
 
 /***********************************************************************/
index e68d85d71c32df15ca56cbee916c6c9dd1072df1..a11601b74004d2429b985a1aa1e6fe4114457855 100644 (file)
@@ -47,9 +47,8 @@ void FormatColumnMatrix::read(NameAssignment* nameMap){
                        if(itB == nameMap->end()){      cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
 
                        if (distance == -1) { distance = 1000000; }
-                       
-                       if(distance < cutoff && itA != itB){
-                                                       
+               
+                       if((distance < cutoff) && (itA != itB)){
                                if(refRow == refCol){           // in other words, if we haven't loaded refRow and refCol...
                                        refRow = itA->second;
                                        refCol = itB->second;
@@ -71,7 +70,7 @@ void FormatColumnMatrix::read(NameAssignment* nameMap){
                }
                out.close();
                fileHandle.close();
-               
+       
                string squareFile;
                if(lt == 0){  // oops, it was square
                        squareFile = filename;
@@ -129,10 +128,13 @@ void FormatColumnMatrix::read(NameAssignment* nameMap){
                                rowMap.clear();
                                
                                //save row you just read
-                               rowMap[second] = dist;
-
+                               if (dist < cutoff) {
+                                       rowMap[second] = dist;
+                               }
                        }else{
-                               rowMap[second] = dist;
+                               if (dist < cutoff) {
+                                       rowMap[second] = dist;
+                               }
                        }
                }
                
index a02ef6cc27ac634eda25ec0c5b7cbc81b13c121a..4ef36d95d6d8e11db5bf065a755fd0292e809c17 100644 (file)
 #include "nameassignment.hpp"
 
 
+//**********************************************************************************************************************
+//  This class takes a distance matrix file and converts it to a file where each row contains all distances below the cutoff
+//  for a given sequence.
+
+//  Example:
+       /*      5
+               A  
+               B  0.01 
+               C  0.015 0.03 
+               D  0.03 0.02 0.02  
+               E  0.04 0.05 0.03 0.02   
+               
+               becomes 
+               
+               0       4       1       0.01    2       0.015   3       0.03    4       0.04    
+               1       4       0       0.01    2       0.03    3       0.02    4       0.05    
+               2       4       0       0.015   1       0.03    3       0.02    4       0.03    
+               3       4       0       0.03    1       0.02    2       0.02    4       0.02    
+               4       4       0       0.04    1       0.05    2       0.03    3       0.02
+               
+               column 1 - sequence name converted to row number
+               column 2 - numDists under cutoff
+               rest of line - sequence row -> distance, sequence row -> distance
+               
+               if you had a cutoff of 0.03 then the file would look like,
+               
+               0       3       1       0.01    2       0.015   3       0.03    
+               1       3       0       0.01    2       0.03    3       0.02    
+               2       4       0       0.015   1       0.03    3       0.02    4       0.03    
+               3       4       0       0.03    1       0.02    2       0.02    4       0.02    
+               4       2       2       0.03    3       0.02    
+               
+               This class also creates a vector of ints, rowPos.
+               
+               rowPos[0] = position in the file of distances related to sequence 0.
+               If a sequence is excluded by the cutoff, it's rowPos = -1.
+*/
 //**********************************************************************************************************************
 
 class FormatMatrix {
index 9667755e00343e6edccac6d5a607f246c0af3ba8..d37ee19eb573bba0ade4c91320c9766f1712be69 100644 (file)
@@ -48,7 +48,7 @@ GetOTURepCommand::GetOTURepCommand(string option){
                        help(); abort = true;
                } else {
                        //valid paramters for this command
-                       string Array[] =  {"fasta","list","label","name", "group", "sorted", "phylip","column","large","cutoff"};
+                       string Array[] =  {"fasta","list","label","name", "group", "sorted", "phylip","column","large","cutoff","precision"};
                        vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
                        
                        OptionParser parser(option);
@@ -139,11 +139,12 @@ GetOTURepCommand::GetOTURepCommand(string option){
 
 void GetOTURepCommand::help(){
        try {
-               mothurOut("The get.oturep command can only be executed after a successful read.dist command.\n");
-               mothurOut("The get.oturep command parameters are list, fasta, name, group, large, sorted and label.  The fasta and list parameters are required.\n");
+               mothurOut("The get.oturep command parameters are phylip, column, list, fasta, name, group, large, cutoff, precision, sorted and label.  The fasta and list parameters are required, as well as phylip or column and name.\n");
                mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
-               mothurOut("The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
-               mothurOut("Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups, name=amazon.names).\n");
+               mothurOut("The phylip or column parameter is required, but only one may be used.  If you use a column file the name filename is required. \n");
+               mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n");
+               mothurOut("The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
+               mothurOut("Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n");
                mothurOut("The default value for label is all labels in your inputfile.\n");
                mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
                mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
@@ -328,8 +329,10 @@ int GetOTURepCommand::execute(){
                }
                
                //close and remove formatted matrix file
-               inRow.close();
-               remove(distFile.c_str());
+               if (large) {
+                       inRow.close();
+                       //remove(distFile.c_str());
+               }
                
                globaldata->gListVector = NULL;
                delete input;  globaldata->ginput = NULL;
index 3b972c0e3cc91c15b4730db6ae837ec621b8d651..259036f0f02a42beb95926f9ce59a4140164840e 100644 (file)
@@ -27,7 +27,9 @@ HCluster::HCluster(RAbundVector* rav, ListVector* lv, string m, string d, NameAs
                
                if (method != "average") {
                        openInputFile(distfile, filehandle);
-               }else{ firstRead = true; }
+               }else{  
+                       processFile();  
+               }
        }
        catch(exception& e) {
                errorOut(e, "HCluster", "HCluster");
@@ -358,7 +360,6 @@ vector<seqDist> HCluster::getSeqs(){
                if(method != "average") {
                        sameSeqs = getSeqsFNNN();
                }else{
-                       if (firstRead) {        processFile();   }
                        sameSeqs = getSeqsAN(); 
                }
                                
@@ -755,8 +756,6 @@ void HCluster::processFile() {
                
                remove(distfile.c_str());
                rename(outTemp.c_str(), distfile.c_str());
-               
-               firstRead = false;
        }
        catch(exception& e) {
                errorOut(e, "HCluster", "processFile");
index 679abbc2cc19e54f5a3028155de3d6d18372d937..bff432892144bfc0caa3f52f9cdd2691a605e72a 100644 (file)
@@ -71,7 +71,7 @@ protected:
        int smallCol;
        float smallDist, cutoff;
        map<string, int> seq2Bin;
-       bool mapWanted, exitedBreak, firstRead;
+       bool mapWanted, exitedBreak;
        seqDist next;
        string method, distfile;
        ifstream filehandle;
index bdec48e788121bc2d88519d98e1e8135d32020d5..c39b002ccfc0096197cd59ced3f4bb7fa21cce6b 100644 (file)
@@ -178,9 +178,6 @@ int HClusterCommand::execute(){
                
                print_start = true;
                start = time(NULL);
-       
-               //ifstream in;
-               //openInputFile(distfile, in);
                                
                cluster = new HCluster(rabund, list, method, distfile, globaldata->nameMap, cutoff);
                vector<seqDist> seqs; seqs.resize(1); // to start loop
@@ -192,19 +189,17 @@ int HClusterCommand::execute(){
                        for (int i = 0; i < seqs.size(); i++) {  //-1 means skip me
 
                                if (seqs[i].seq1 != seqs[i].seq2) {
-                                       bool clustered = cluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
+                                       cluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
                                        
                                        float rndDist = roundDist(seqs[i].dist, precision);
                                        
-                                       if (clustered) {
-                                               if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
-                                                       printData("unique");
-                                               }
-                                               else if((rndDist != rndPreviousDist)){
-                                                       printData(toString(rndPreviousDist,  length-1));
-                                               }
+                                       if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
+                                               printData("unique");
                                        }
-                                       
+                                       else if((rndDist != rndPreviousDist)){
+                                               printData(toString(rndPreviousDist,  length-1));
+                                       }
+                               
                                        previousDist = seqs[i].dist;
                                        rndPreviousDist = rndDist;
                                        oldRAbund = *rabund;
@@ -212,8 +207,6 @@ int HClusterCommand::execute(){
                                }
                        }
                }
-               
-               //in.close();
 
                if(previousDist <= 0.0000){
                        printData("unique");
index fad0ed64ff58e58a3bc3bfaa6c432f899f6f1eea..81e80ccaf6ee82a0bbed8cd5a757a4502fd5449d 100644 (file)
@@ -153,9 +153,9 @@ int MGClusterCommand::execute(){
                        delete read;
                
                        //create cluster
-                       if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, distMatrix); }
-                       else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, distMatrix); }
-                       else if(method == "average"){   cluster = new AverageLinkage(rabund, list, distMatrix); }
+                       if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff); }
+                       else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, distMatrix, cutoff); }
+                       else if(method == "average"){   cluster = new AverageLinkage(rabund, list, distMatrix, cutoff); }
                        cluster->setMapWanted(true);
                        
                        //cluster using cluster classes
index 3bdbfcd0c5e6859a34172c5accf14ebbd28e866a..7fd5d1e48fc25bee4af0aefa7213814695489cf2 100644 (file)
@@ -20,8 +20,11 @@ int main(int argc, char *argv[]){
        try {
                
                //remove old logfile
-               string logFileName = "mothur.logFile";
-               remove(logFileName.c_str());
+               string log = "mothur.logFile";
+               remove(log.c_str());
+               
+               time_t ltime = time(NULL); /* calendar time */  
+               string logFileName = "mothur." + toString(asctime( localtime(&ltime) )) + ".logfile";
                
                //version
                #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
@@ -106,7 +109,9 @@ int main(int argc, char *argv[]){
                while(bail == 0)                {       bail = mothur->getInput();                      }
        
                delete mothur;
-       
+               
+               rename(log.c_str(), logFileName.c_str()); //logfile with timestamp
+               
                return 0;
        }
        catch(exception& e) {
index 8f9f94ff872aab355c6123f97e4cebb8e4dedb7f..8bd8d7f155ea7aaa28dc1b0d8b5bea2e9e4b43d6 100644 (file)
--- a/mothur.h
+++ b/mothur.h
@@ -72,7 +72,6 @@ using namespace std;
 #define isnan(x) ((x) != (x))
 #define isinf(x) (fabs(x) == std::numeric_limits<double>::infinity())
 
-
 typedef unsigned long ull;
 
 struct IntNode {
index 1c134f543ef45e89dd97ef6b0462b8cd3a282d67..9d38e76b10935e2ea7fe9f485e53eab662fc982e 100644 (file)
@@ -208,7 +208,7 @@ int ReadDistCommand::execute(){
                        numDists = globaldata->gSparseMatrix->getNNodes();
        //cout << "matrix contains " << numDists << " distances." << endl;
                        
-      int lines = cutoff / (1.0/precision);
+    /*  int lines = cutoff / (1.0/precision);
       vector<float> dist_cutoff(lines+1,0);
                        for (int i = 0; i <= lines;i++) {       
        dist_cutoff[i] = (i + 0.5) / precision; 
@@ -224,7 +224,7 @@ int ReadDistCommand::execute(){
           }
         }
                        }
-
+*/
      // string dist_string = "Dist:";
     //  string count_string = "Count: ";
                        //for (int i = 0; i <= lines;i++) {     
index fe278ab14856f085a9a61bfc09e40522dcf96d03..5f92275eb9e6d4c8e3ee21241bbd6f626a295bc1 100644 (file)
@@ -59,13 +59,13 @@ void ReadPhylipMatrix::read(NameAssignment* nameMap){
                         }
         
                         Progress* reading;
-        
+      
                         if(square == 0){
 
                                 reading = new Progress("Reading matrix:     ", nseqs * (nseqs - 1) / 2);
                 
                                 int        index = 0;
-                
+               
                                 for(int i=1;i<nseqs;i++){
                                         fileHandle >> name;
                                         matrixNames.push_back(name);
@@ -156,7 +156,7 @@ void ReadPhylipMatrix::read(NameAssignment* nameMap){
                         list->setLabel("0");
                         fileHandle.close();
 
-                        if(nameMap != NULL){
+                     /*   if(nameMap != NULL){
                                 for(int i=0;i<matrixNames.size();i++){
                                         nameMap->erase(matrixNames[i]);
                                 }
@@ -164,7 +164,7 @@ void ReadPhylipMatrix::read(NameAssignment* nameMap){
                                         //should probably tell them what is missing if we missed something
                                         mothurOut("missed something\t" + toString(nameMap->size())); mothurOutEndLine();
                                 }
-                        }
+                        } */
 
                 }
         catch(exception& e) {
index c827207f76529ebf79e8008deea7f245498a40b6..89f57eb5ca1aa80bf0d72d5aa47fcdabca6d8ef2 100644 (file)
@@ -288,10 +288,10 @@ void SharedCommand::createMisMatchFile() {
                                        string name = names.substr(0,names.find_first_of(','));
                                        names = names.substr(names.find_first_of(',')+1, names.length());
                                        string group = groupMap->getGroup(name);
-                                       
+       cout << name << endl;                           
                                        if(group == "not found") {      outMisMatch << name << endl;  }
                                }
-                               
+       cout << names << endl;                  
                                //get last name
                                string group = groupMap->getGroup(names);
                                if(group == "not found") {      outMisMatch << names << endl;  }                                
index c27b14e727057c6dcd14f2b0268cd250c4d52559..9f015db7359879be53da5e6001889117c7e5f7c6 100644 (file)
@@ -5,8 +5,8 @@
 
 /***********************************************************************/
 
-SingleLinkage::SingleLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm) :
-Cluster(rav, lv, dm)
+SingleLinkage::SingleLinkage(RAbundVector* rav, ListVector* lv, SparseMatrix* dm, float c) :
+Cluster(rav, lv, dm, c)
 {}