fixed some bugs and added mgcluster command

author westcott <westcott>

Mon, 21 Dec 2009 15:03:14 +0000 (15:03 +0000)

committer westcott <westcott>

Mon, 21 Dec 2009 15:03:14 +0000 (15:03 +0000)
author westcott <westcott>
Mon, 21 Dec 2009 15:03:14 +0000 (15:03 +0000)
committer westcott <westcott>
Mon, 21 Dec 2009 15:03:14 +0000 (15:03 +0000)
diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj

index efe0beb43d14d80310535868942a7f4ba1321626..5ae6f336987553e85d2841145f2f151008351554 100644 (file)
--- a/Mothur.xcodeproj/project.pbxproj
+++ b/Mothur.xcodeproj/project.pbxproj
@@ -162,6 +162,7 @@
                 A70B53AB0F4CD7AD0064797E /* getlabelcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A60F4CD7AD0064797E /* getlabelcommand.cpp */; };
                 A70B53AC0F4CD7AD0064797E /* getlinecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A80F4CD7AD0064797E /* getlinecommand.cpp */; };
                 A70DECD91063D8B40057C03C /* secondarystructurecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70DECD81063D8B40057C03C /* secondarystructurecommand.cpp */; };
+               A727E84A10D14568001A8432 /* readblast.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A727E84910D14568001A8432 /* readblast.cpp */; };
                 A7283FF81056CAE100D0CC69 /* chimeracheckrdp.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7283FF71056CAE100D0CC69 /* chimeracheckrdp.cpp */; };
                 A729ACD010848E6100139801 /* hclustercommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A729ACCF10848E6100139801 /* hclustercommand.cpp */; };
                 A729ACE410849BBE00139801 /* hcluster.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A729ACE310849BBE00139801 /* hcluster.cpp */; };
@@ -178,6 +179,7 @@
                 A787844510A1EBDD0086103D /* knn.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787844410A1EBDD0086103D /* knn.cpp */; };
                 A7B04493106CC3E60046FC83 /* chimeraslayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */; };
                 A7B0450E106CEEC90046FC83 /* slayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B0450D106CEEC90046FC83 /* slayer.cpp */; };
+               A7E0AAB610D2886D00CF407B /* mgclustercommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */; };
                 A7E4A783106913F900688F62 /* getsharedotucommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E4A782106913F900688F62 /* getsharedotucommand.cpp */; };
                 A7E4A824106A9AD700688F62 /* maligner.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E4A823106A9AD700688F62 /* maligner.cpp */; };
                 A7F5759710CEBC0600E20E47 /* libreadline.a in Frameworks */ = {isa = PBXBuildFile; fileRef = A7F5759610CEBC0600E20E47 /* libreadline.a */; };
@@ -532,6 +534,8 @@
                 A70B53A90F4CD7AD0064797E /* getlinecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getlinecommand.h; sourceTree = SOURCE_ROOT; };
                 A70DECD71063D8B40057C03C /* secondarystructurecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = secondarystructurecommand.h; sourceTree = SOURCE_ROOT; };
                 A70DECD81063D8B40057C03C /* secondarystructurecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = secondarystructurecommand.cpp; sourceTree = SOURCE_ROOT; };
+               A727E84810D14568001A8432 /* readblast.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = readblast.h; sourceTree = SOURCE_ROOT; };
+               A727E84910D14568001A8432 /* readblast.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = readblast.cpp; sourceTree = SOURCE_ROOT; };
                 A7283FF61056CAE100D0CC69 /* chimeracheckrdp.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeracheckrdp.h; sourceTree = SOURCE_ROOT; };
                 A7283FF71056CAE100D0CC69 /* chimeracheckrdp.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeracheckrdp.cpp; sourceTree = SOURCE_ROOT; };
                 A729ACCE10848E6100139801 /* hclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = hclustercommand.h; sourceTree = SOURCE_ROOT; };
@@ -564,6 +568,8 @@
                 A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraslayer.cpp; sourceTree = SOURCE_ROOT; };
                 A7B0450C106CEEC90046FC83 /* slayer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = slayer.h; sourceTree = SOURCE_ROOT; };
                 A7B0450D106CEEC90046FC83 /* slayer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = slayer.cpp; sourceTree = SOURCE_ROOT; };
+               A7E0AAB410D2886D00CF407B /* mgclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mgclustercommand.h; sourceTree = "<group>"; };
+               A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mgclustercommand.cpp; sourceTree = "<group>"; };
                 A7E4A781106913F900688F62 /* getsharedotucommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getsharedotucommand.h; sourceTree = SOURCE_ROOT; };
                 A7E4A782106913F900688F62 /* getsharedotucommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getsharedotucommand.cpp; sourceTree = SOURCE_ROOT; };
                 A7E4A822106A9AD700688F62 /* maligner.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = maligner.h; sourceTree = SOURCE_ROOT; };
@@ -725,6 +731,8 @@
                 3796441D0FB9B9650081FDB6 /* read */ = {
                         isa = PBXGroup;
                         children = (
+                               A727E84810D14568001A8432 /* readblast.h */,
+                               A727E84910D14568001A8432 /* readblast.cpp */,
                                 A751ACE81098B283003D0911 /* readcluster.h */,
                                 A751ACE91098B283003D0911 /* readcluster.cpp */,
                                 375AA1340F9E433D008EF9B8 /* readcolumn.h */,
@@ -915,6 +923,8 @@
                                 21E859D70FC4632E005E1A48 /* matrixoutputcommand.cpp */,
                                 7E5A17AD0FE57292003C6A03 /* mergefilecommand.h */,
                                 7E5A17AE0FE57292003C6A03 /* mergefilecommand.cpp */,
+                               A7E0AAB410D2886D00CF407B /* mgclustercommand.h */,
+                               A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */,
                                 375873F70F7D649C0040F377 /* nocommands.h */,
                                 375873F60F7D649C0040F377 /* nocommands.cpp */,
                                 3792946E0F2E191800B9034A /* parsimonycommand.h */,
@@ -1287,6 +1297,8 @@
                                 A787844510A1EBDD0086103D /* knn.cpp in Sources */,
                                 A773805F10B6D6FC002A6A61 /* taxonomyequalizer.cpp in Sources */,
                                 A773809010B6EFD1002A6A61 /* phylotypecommand.cpp in Sources */,
+                               A727E84A10D14568001A8432 /* readblast.cpp in Sources */,
+                               A7E0AAB610D2886D00CF407B /* mgclustercommand.cpp in Sources */,
                         );
                         runOnlyForDeploymentPostprocessing = 0;
                 };
diff --git a/aligncommand.cpp b/aligncommand.cpp

index 55f0b821b1392f40b1f6ecd2c9414a6f8a04fff2..96e1a7899c4fec8d769c46aede121bb8de240c07 100644 (file)
--- a/aligncommand.cpp
+++ b/aligncommand.cpp
@@ -209,22 +209,23 @@ int AlignCommand::execute(){
                         while(!inFASTA.eof()){
                                 input = getline(inFASTA);
                                 if (input.length() != 0) {
-                                       if(input[0] == '>'){    int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1);       }
+                                       if(input[0] == '>'){    long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1);  }
                                 }
                         }
                         inFASTA.close();
                         
                         numFastaSeqs = positions.size();
-                       
+               
                         int numSeqsPerProcessor = numFastaSeqs / processors;
                         
                         for (int i = 0; i < processors; i++) {
-                               int startPos = positions[ i * numSeqsPerProcessor ];
+                               long int startPos = positions[ i * numSeqsPerProcessor ];
                                 if(i == processors - 1){
                                         numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
                                 }
                                 lines.push_back(new linePair(startPos, numSeqsPerProcessor));
                         }
+                       
                         createProcesses(alignFileName, reportFileName, accnosFileName); 
                         
                         rename((alignFileName + toString(processIDS[0]) + ".temp").c_str(), alignFileName.c_str());
@@ -318,11 +319,11 @@ int AlignCommand::driver(linePair* line, string alignFName, string reportFName,
  
                 for(int i=0;i<line->numSeqs;i++){
                         
-                       Sequence* candidateSeq = new Sequence(inFASTA);
+                       Sequence* candidateSeq = new Sequence(inFASTA);  gobble(inFASTA);
                         int origNumBases = candidateSeq->getNumBases();
                         string originalUnaligned = candidateSeq->getUnaligned();
                         int numBasesNeeded = origNumBases * threshold;
-                       
+       
                         if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
                                 if (candidateSeq->getUnaligned().length() > alignment->getnRows()) {
                                         alignment->resize(candidateSeq->getUnaligned().length()+1);
diff --git a/aligncommand.h b/aligncommand.h

index f7d59b0206c989558f0e11163d1f7abddcf9a784..ec0c80ff7994e533ac2a9e627936e11c296ddcd8 100644 (file)
--- a/aligncommand.h
+++ b/aligncommand.h
@@ -28,7 +28,7 @@ private:
         struct linePair {
                 int start;
                 int numSeqs;
-               linePair(int i, int j) : start(i), numSeqs(j) {}
+               linePair(long int i, int j) : start(i), numSeqs(j) {}
         };
         vector<int> processIDS;   //processid
         vector<linePair*> lines;
diff --git a/alignmentdb.cpp b/alignmentdb.cpp

index e4509521e7383d305151170e2fcc96da1c2f3e88..52eedf80cbdc0991249d9eb6deca30e006d665a8 100644 (file)
--- a/alignmentdb.cpp
+++ b/alignmentdb.cpp
@@ -30,7 +30,7 @@ AlignmentDB::AlignmentDB(string fastaFileName, string method, int kmerSize, floa
                         if (temp.getName() != "") {
                                 templateSequences.push_back(temp);
                                 //save longest base
-                               if (temp.getUnaligned().length() > longest)  { longest = temp.getUnaligned().length(); }
+                               if (temp.getUnaligned().length() > longest)  { longest = temp.getUnaligned().length()+1; }
                         }
                 }
                 
diff --git a/cluster.cpp b/cluster.cpp

index 6fd463116e65becc0ef2a969c08d5bfaf6dfea15..429e19aaeacce39b1a85943b31cd32da70739239 100644 (file)
--- a/cluster.cpp
+++ b/cluster.cpp
@@ -55,6 +55,7 @@ rabund(rav), list(lv), dMatrix(dm)
                 seqVec[currentCell->row].push_back(currentCell);
                 seqVec[currentCell->column].push_back(currentCell);
         }
+       mapWanted = false;  //set to true by mgcluster to speed up overlap merge
  }
  
  /***********************************************************************/
@@ -78,7 +79,7 @@ void Cluster::getRowColCells() {
         }
  
  }
-
+/***********************************************************************/
  // Remove the specified cell from the seqVec and from the sparse
  // matrix
  void Cluster::removeCell(const MatData& cell, int vrow, int vcol, bool rmMatrix)
@@ -150,7 +151,8 @@ void Cluster::clusterBins(){
  void Cluster::clusterNames(){
         try {
         //      cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(smallRow) << '\t' << list->get(smallCol);
-
+               if (mapWanted) {  updateMap();  }
+               
                 list->set(smallCol, list->get(smallRow)+','+list->get(smallCol));
                 list->set(smallRow, "");        
                 list->setLabel(toString(smallDist));
@@ -222,7 +224,57 @@ void Cluster::update(){
                 exit(1);
         }
  }
+/***********************************************************************/
+void Cluster::setMapWanted(bool m)  {  
+       try {
+               mapWanted = m;
+               
+               //initialize map
+               for (int i = 0; i < list->getNumBins(); i++) {
+                       
+                       //parse bin 
+                       string names = list->get(i);
+                       while (names.find_first_of(',') != -1) { 
+                               //get name from bin
+                               string name = names.substr(0,names.find_first_of(','));
+                               //save name and bin number
+                               seq2Bin[name] = i;
+                               names = names.substr(names.find_first_of(',')+1, names.length());
+                       }
+                       
+                       //get last name
+                       seq2Bin[names] = i;
+               }
+               
+       }
+       catch(exception& e) {
+               errorOut(e, "Cluster", "setMapWanted");
+               exit(1);
+       }
+}
+/***********************************************************************/
+void Cluster::updateMap() {
+try {
+               //update location of seqs in smallRow since they move to smallCol now
+               string names = list->get(smallRow);
+               while (names.find_first_of(',') != -1) { 
+                       //get name from bin
+                       string name = names.substr(0,names.find_first_of(','));
+                       //save name and bin number
+                       seq2Bin[name] = smallCol;
+                       names = names.substr(names.find_first_of(',')+1, names.length());
+               }
+                       
+               //get last name
+               seq2Bin[names] = smallCol;
+               
+       }
+       catch(exception& e) {
+               errorOut(e, "Cluster", "updateMap");
+               exit(1);
+       }
+}
+/***********************************************************************/
  
  
-/***********************************************************************/
  
diff --git a/cluster.hpp b/cluster.hpp

index 99d8b7d557a41cef684ad2f9aae7d636d621bbc7..a6024e79d45a52274040a53cca4f8b32d9d2e998 100644 (file)
--- a/cluster.hpp
+++ b/cluster.hpp
@@ -14,8 +14,10 @@ class Cluster {
         
  public:
         Cluster(RAbundVector*, ListVector*, SparseMatrix*);
-    virtual void update();
+    virtual void update();                             
         virtual string getTag() = 0;
+       virtual void setMapWanted(bool m);  
+       virtual map<string, int> getSeqtoBin()  {  return seq2Bin;      }
  
  protected:     
         void getRowColCells();
@@ -25,6 +27,7 @@ protected:
  
         virtual void clusterBins();
         virtual void clusterNames();
+       virtual void updateMap();
         
         RAbundVector* rabund;
         ListVector* list;
@@ -33,8 +36,10 @@ protected:
         int smallRow;
         int smallCol;
         float smallDist;
+       bool mapWanted;
+       map<string, int> seq2Bin;
         
-       vector<MatVec> seqVec;          // contains vectors of cells related to a certain sequence\r
+       vector<MatVec> seqVec;          // contains vectors of cells related to a certain sequence
         MatVec rowCells;
         MatVec colCells;
         ull nRowCells;
diff --git a/commandfactory.cpp b/commandfactory.cpp

index c33de86f5c5a97fe9599a8871b190194170e1a6c..b361b6600655ff57a987cf7bce9b2b543d96c854 100644 (file)
--- a/commandfactory.cpp
+++ b/commandfactory.cpp
@@ -60,6 +60,7 @@
  #include "hclustercommand.h"
  #include "classifyseqscommand.h"
  #include "phylotypecommand.h"
+#include "mgclustercommand.h"
  
  /*******************************************************/
  
@@ -129,6 +130,7 @@ CommandFactory::CommandFactory(){
         commands["hcluster"]                    = "hcluster"; 
         commands["classify.seqs"]               = "classify.seqs"; 
         commands["phylotype"]                   = "phylotype";
+       commands["mgcluster"]                   = "mgcluster";
  
  }
  /***********************************************************/
@@ -196,6 +198,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){
                 else if(commandName == "hcluster")                              {       command = new HClusterCommand(optionString);                    }
                 else if(commandName == "classify.seqs")                 {       command = new ClassifySeqsCommand(optionString);                }
                 else if(commandName == "phylotype")                             {       command = new PhylotypeCommand(optionString);                   }
+               else if(commandName == "mgcluster")                             {       command = new MGClusterCommand(optionString);                   }
                 else                                                                                    {       command = new NoCommand(optionString);                                  }
  
                 return command;
diff --git a/engine.cpp b/engine.cpp

index c6ade387da83e0ebbdcf67addd28f99e7267a500..593c2c3128d08cfe8abddfa81a1e384a1d17aa62 100644 (file)
--- a/engine.cpp
+++ b/engine.cpp
@@ -14,16 +14,6 @@
  
  #include "engine.hpp"
  
-/***********************************************************************/
-inline void terminateCommand(int dummy)  {
-       
-               //mothurOut("Stopping command...."); 
-               //CommandFactory* cFactory = CommandFactory::getInstance();
-               //cFactory->getCommand();  //deletes old command and makes new no command.  
-                                                               //this may cause memory leak if old commands execute function allocated memory 
-                                                               //that is freed in the execute function and not the deconstructor 
-               //mothurOut("DONE."); mothurOutEndLine();
-}
  /***********************************************************************/
  Engine::Engine(){
         try {
diff --git a/engine.hpp b/engine.hpp

index df4e08d879103c95e2959130eda2d19296f0a0f3..d3f156e533a6a759bab014250dd1d54ad1726c7b 100644 (file)
--- a/engine.hpp
+++ b/engine.hpp
@@ -27,7 +27,6 @@ public:
         virtual bool getInput() = 0;
         virtual string getCommand();
         vector<string> getOptions() {   return options;         }
-       //virtual void terminateCommand(int);
  protected:
         vector<string> options;
         CommandFactory* cFactory;
diff --git a/fullmatrix.h b/fullmatrix.h

index 890d07a846006e5e81570402b0fd4e61879e0b68..7cb8b15fedf8c39f06aeaee5ca3fb6e9f3274499 100644 (file)
--- a/fullmatrix.h
+++ b/fullmatrix.h
@@ -35,6 +35,7 @@ public:
         int getNumGroups();
         void printMatrix(ostream&);
         float get(int, int);
+       Names getRowInfo(int row)  {  return index[row];  }
         
  private:
         vector< vector<float> > matrix;  //a 2D distance matrix of all the sequences and their distances to eachother.
diff --git a/getsharedotucommand.cpp b/getsharedotucommand.cpp

index 742377f14a12cb55f306cf334936a9ca1f1e12f3..8081f1090f3754e9904bdc838a2133d0baf58f25 100644 (file)
--- a/getsharedotucommand.cpp
+++ b/getsharedotucommand.cpp
@@ -16,6 +16,7 @@ GetSharedOTUCommand::GetSharedOTUCommand(string option){
         
                 globaldata = GlobalData::getInstance();
                 abort = false;
+               unique = true;
                 allLines = 1;
                 labels.clear();
                 
@@ -24,7 +25,7 @@ GetSharedOTUCommand::GetSharedOTUCommand(string option){
                 
                 else {
                         //valid paramters for this command
-                       string Array[] =  {"label","groups","fasta","list","group","output"};
+                       string Array[] =  {"label","unique","shared","fasta","list","group","output"};
                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
                         
                         OptionParser parser(option);
@@ -61,11 +62,20 @@ GetSharedOTUCommand::GetSharedOTUCommand(string option){
                         output = validParameter.validFile(parameters, "output", false);                 
                         if (output == "not found") { output = ""; }
                         
-                       groups = validParameter.validFile(parameters, "groups", false);                 
+                       groups = validParameter.validFile(parameters, "unique", false);                 
                         if (groups == "not found") { groups = ""; }
                         else { 
                                 splitAtDash(groups, Groups);
                                 globaldata->Groups = Groups;
+                               
+                       }
+                       
+                       groups = validParameter.validFile(parameters, "shared", false);                 
+                       if (groups == "not found") { groups = "";  }
+                       else { 
+                               splitAtDash(groups, Groups);
+                               globaldata->Groups = Groups;
+                               unique = false;
                         }
                         
                         fastafile = validParameter.validFile(parameters, "fasta", true);
@@ -84,9 +94,12 @@ GetSharedOTUCommand::GetSharedOTUCommand(string option){
  
  void GetSharedOTUCommand::help(){
         try {
-               mothurOut("The get.sharedseqs command parameters are list, group, label, groups, output and fasta.  The list and group parameters are required.\n");
+               mothurOut("The get.sharedseqs command parameters are list, group, label, unique, shared, output and fasta.  The list and group parameters are required.\n");
                 mothurOut("The label parameter allows you to select what distance levels you would like output files for, and are separated by dashes.\n");
-               mothurOut("The groups parameter allows you to select groups you would like to know the shared info for, and are separated by dashes.\n");
+               mothurOut("The unique and shared parameters allow you to select groups you would like to know the shared info for, and are separated by dashes.\n");
+               mothurOut("If you enter your groups under the unique parameter mothur will return the otus that contain ONLY sequences from those groups.\n");
+               mothurOut("If you enter your groups under the shared parameter mothur will return the otus that contain sequences from those groups and may also contain sequences from other groups.\n");
+               mothurOut("If you do not enter any groups then the get.sharedseqs command will return sequences that are unique to all groups in your group file.\n");
                 mothurOut("The fasta parameter allows you to input a fasta file and outputs a fasta file for each distance level containing only the sequences that are in OTUs shared by the groups specified.\n");
                 mothurOut("The output parameter allows you to output the list of names without the group and bin number added. \n");
                 mothurOut("With this option you can use the names file as an input in get.seqs and remove.seqs commands. To do this enter output=accnos. \n");
@@ -121,7 +134,7 @@ int GetSharedOTUCommand::execute(){
                 if (Groups.size() == 0) {
                         Groups = groupMap->namesOfGroups;
                 }
-               
+       
                 //put groups in map to find easier
                 for(int i = 0; i < Groups.size(); i++) {
                         groupFinder[Groups[i]] = Groups[i];
@@ -238,7 +251,7 @@ void GetSharedOTUCommand::process(ListVector* shared) {
                 //go through each bin, find out if shared
                 for (int i = 0; i < shared->getNumBins(); i++) {
                         
-                       bool sharedByAll = true;
+                       bool uniqueOTU = true;
                         
                         map<string, int> atLeastOne;
                         for (int m = 0; m < Groups.size(); m++) {
@@ -248,48 +261,48 @@ void GetSharedOTUCommand::process(ListVector* shared) {
                         vector<string> namesOfSeqsInThisBin;
                         
                         string names = shared->get(i);  
-                       while ((names.find_first_of(',') != -1) && sharedByAll) { 
+                       while ((names.find_first_of(',') != -1)) { 
                                 string name = names.substr(0,names.find_first_of(','));
                                 names = names.substr(names.find_first_of(',')+1, names.length());
-                                                               
+                               
                                 //find group
                                 string seqGroup = groupMap->getGroup(name);
                                 if (output != "accnos") {
                                         namesOfSeqsInThisBin.push_back((name + "\t" + seqGroup + "\t" + toString(i+1)));
                                 }else {  namesOfSeqsInThisBin.push_back(name);  }
-
+                               
                                 if (seqGroup == "not found") { mothurOut(name + " is not in your groupfile. Please correct."); mothurOutEndLine(); exit(1);  }
                                 
                                 //is this seq in one of hte groups we care about
                                 it = groupFinder.find(seqGroup);
-                               if (it == groupFinder.end()) {  sharedByAll = false;  } //you have a sequence from a group you don't want
+                               if (it == groupFinder.end()) {  uniqueOTU = false;  } //you have a sequence from a group you don't want
                                 else {  atLeastOne[seqGroup]++;  }
                         }
                         
                         //get last name
-                       //find group
-                       if (sharedByAll) {
-                               string seqGroup = groupMap->getGroup(names);
-                               if (output != "accnos") {
-                                       namesOfSeqsInThisBin.push_back((names + "\t" + seqGroup + "\t" + toString(i+1)));
-                               }else {  namesOfSeqsInThisBin.push_back(names); }
-                               
-                               if (seqGroup == "not found") { mothurOut(names + " is not in your groupfile. Please correct."); mothurOutEndLine(); exit(1);  }
+                       string seqGroup = groupMap->getGroup(names);
+                       if (output != "accnos") {
+                               namesOfSeqsInThisBin.push_back((names + "\t" + seqGroup + "\t" + toString(i+1)));
+                       }else {  namesOfSeqsInThisBin.push_back(names); }
+                       
+                       if (seqGroup == "not found") { mothurOut(names + " is not in your groupfile. Please correct."); mothurOutEndLine(); exit(1);  }
+                       
+                       //is this seq in one of hte groups we care about
+                       it = groupFinder.find(seqGroup);
+                       if (it == groupFinder.end()) {  uniqueOTU = false;  } //you have a sequence from a group you don't want
+                       else {  atLeastOne[seqGroup]++;  }
                         
-                               //is this seq in one of hte groups we care about
-                               it = groupFinder.find(seqGroup);
-                               if (it == groupFinder.end()) {  sharedByAll = false;  } //you have a sequence from a group you don't want
-                               else {  atLeastOne[seqGroup]++;  }
-                       }
                         
                         //make sure you have at least one seq from each group you want
+                       bool sharedByAll = true;
                         map<string, int>::iterator it2;
                         for (it2 = atLeastOne.begin(); it2 != atLeastOne.end(); it2++) {
                                 if (it2->second == 0) {  sharedByAll = false;   }
                         }
                         
-                       //if shared, save names of seqs in that bin
-                       if (sharedByAll) {
+                       //if the user wants unique bins and this is unique then print
+                       //or this the user wants shared bins and this bin is shared then print
+                       if ((unique && uniqueOTU && sharedByAll) || (!unique && sharedByAll)) {
                                 
                                 wroteSomething = true;
                                 num++;
@@ -309,9 +322,6 @@ void GetSharedOTUCommand::process(ListVector* shared) {
                                         }
                                 }
                         }
-                       
-                       
-                               
                 }
                 
                 outNames.close();
diff --git a/getsharedotucommand.h b/getsharedotucommand.h

index 6470961cdb204dac5992344ed3a5e91bd3ec797a..3b14b39bb70e8939641dea69c2e53bcb98a82ffa 100644 (file)
--- a/getsharedotucommand.h
+++ b/getsharedotucommand.h
@@ -34,7 +34,7 @@ class GetSharedOTUCommand : public Command {
                 
                 set<string> labels;
                 string fastafile, label, groups, listfile, groupfile, output;
-               bool abort, allLines;
+               bool abort, allLines, unique;
                 vector<string> Groups;
                 map<string, string> groupFinder;
                 map<string, string>::iterator it;
diff --git a/hcluster.cpp b/hcluster.cpp

index 7613ecbf43e245713880a26444b3e68baaad808c..2d88533757a1495d54423ad87354532a96d3e053 100644 (file)
--- a/hcluster.cpp
+++ b/hcluster.cpp
@@ -16,7 +16,7 @@
  
  HCluster::HCluster(RAbundVector* rav, ListVector* lv) :  rabund(rav), list(lv){
         try {
-       
+               mapWanted = false;
                 numSeqs = list->getNumSeqs();
                 
                 //initialize cluster array
@@ -56,7 +56,8 @@ void HCluster::clusterBins(){
  void HCluster::clusterNames(){
         try {
                 ///cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild);
-
+               if (mapWanted) {  updateMap();  }
+               
                 list->set(clusterArray[smallCol].smallChild, list->get(clusterArray[smallRow].smallChild)+','+list->get(clusterArray[smallCol].smallChild));
                 list->set(clusterArray[smallRow].smallChild, "");       
                 list->setLabel(toString(smallDist));
@@ -256,7 +257,7 @@ void HCluster::updateArrayandLinkTable() {
         }
  }
  /***********************************************************************/
-bool HCluster::update(int row, int col, float distance){
+void HCluster::update(int row, int col, float distance){
         try {
                 
                 smallRow = row;
@@ -284,15 +285,62 @@ bool HCluster::update(int row, int col, float distance){
                 }
                 
                 //printInfo();
-               return clustered;
         }
         catch(exception& e) {
                 errorOut(e, "HCluster", "update");
                 exit(1);
         }
  }
-
-
  /***********************************************************************/
+void HCluster::setMapWanted(bool m)  {  
+       try {
+               mapWanted = m;
+               
+               //initialize map
+               for (int i = 0; i < list->getNumBins(); i++) {
+                       
+                       //parse bin 
+                       string names = list->get(i);
+                       while (names.find_first_of(',') != -1) { 
+                               //get name from bin
+                               string name = names.substr(0,names.find_first_of(','));
+                               //save name and bin number
+                               seq2Bin[name] = i;
+                               names = names.substr(names.find_first_of(',')+1, names.length());
+                       }
+                       
+                       //get last name
+                       seq2Bin[names] = i;
+               }
+               
+       }
+       catch(exception& e) {
+               errorOut(e, "HCluster", "setMapWanted");
+               exit(1);
+       }
+}
+/***********************************************************************/
+void HCluster::updateMap() {
+try {
+               //update location of seqs in smallRow since they move to smallCol now
+               string names = list->get(clusterArray[smallRow].smallChild);
+               while (names.find_first_of(',') != -1) { 
+                       //get name from bin
+                       string name = names.substr(0,names.find_first_of(','));
+                       //save name and bin number
+                       seq2Bin[name] = clusterArray[smallCol].smallChild;
+                       names = names.substr(names.find_first_of(',')+1, names.length());
+               }
+                       
+               //get last name
+               seq2Bin[names] = clusterArray[smallCol].smallChild;
+       }
+       catch(exception& e) {
+               errorOut(e, "HCluster", "updateMap");
+               exit(1);
+       }
+}
+/***********************************************************************/
+
  
  
diff --git a/hcluster.h b/hcluster.h

index 6b2862b7919b8d74e7a28a0442ef10bceb863db0..dd4c679123f12d90fbc2ca1daaae5045be9a1dc1 100644 (file)
--- a/hcluster.h
+++ b/hcluster.h
@@ -22,8 +22,9 @@ class HCluster {
  public:
         HCluster(RAbundVector*, ListVector*);
         ~HCluster(){};
-    bool update(int, int, float);
-       //string getTag();
+    void update(int, int, float);
+       void setMapWanted(bool m); 
+       map<string, int> getSeqtoBin()  {  return seq2Bin;      }
  
  protected:     
         void clusterBins();
@@ -32,6 +33,7 @@ protected:
         int makeActive();
         void printInfo();
         void updateArrayandLinkTable();
+       void updateMap();
                 
         RAbundVector* rabund;
         ListVector* list;
@@ -43,10 +45,11 @@ protected:
         map<int, int>::iterator it2;
         
         int numSeqs;
-       
         int smallRow;
         int smallCol;
         float smallDist;
+       map<string, int> seq2Bin;
+       bool mapWanted;
         
  };
  
diff --git a/hclustercommand.cpp b/hclustercommand.cpp

index dbc0e1bb9917b0d2cfdebfa83a5a97a17b6eb905..3c79f649301f9e049e5ff8cf3eff454bf97fb4c4 100644 (file)
--- a/hclustercommand.cpp
+++ b/hclustercommand.cpp
@@ -176,7 +176,6 @@ int HClusterCommand::execute(){
                 
                 print_start = true;
                 start = time(NULL);
-               
         
                 ifstream in;
                 openInputFile(distfile, in);
@@ -184,7 +183,6 @@ int HClusterCommand::execute(){
                 float distance;
                 
                 cluster = new HCluster(rabund, list);
-               bool clusteredSomething;
                 vector<seqDist> seqs; seqs.resize(1); // to start loop
                 exitedBreak = false;  //lets you know if there is a distance stored in next
         
@@ -207,7 +205,7 @@ int HClusterCommand::execute(){
                                 
         //cout << "before cluster update" << endl;
                                 if (seqs[i].seq1 != seqs[i].seq2) {
-                                       clusteredSomething = cluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
+                                       cluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
                                         
                                         float rndDist = roundDist(seqs[i].dist, precision);
                 //cout << "after cluster update clusterSomething = " << clusteredSomething << " rndDist = " << rndDist << " rndPreviousDist = " << rndPreviousDist << endl;                     
@@ -301,7 +299,6 @@ void HClusterCommand::printData(string label){
  
  }
  //**********************************************************************************************************************
-
  vector<seqDist> HClusterCommand::getSeqs(ifstream& filehandle){
         try {
                 string firstName, secondName;
diff --git a/hclustercommand.h b/hclustercommand.h

index 7351cef71a6edad1a3b1484cbe437d9dd6899e39..1dd46b9ad96890c5576e18ae500cff6359270795 100644 (file)
--- a/hclustercommand.h
+++ b/hclustercommand.h
@@ -28,10 +28,6 @@
  //University of Florida, Gainesville, FL 32610-3622 and 3Materials Technology Directorate, Air Force Technical 
  //Applications Center, 1030 S. Highway A1A, Patrick AFB, FL 32925-3002, USA 
  //Received January 28, 2009; Revised April 14, 2009; Accepted April 15, 2009 
-
-/******************************************************************/
-
-
  /************************************************************/
  struct seqDist {
         int seq1;
diff --git a/libshuff.cpp b/libshuff.cpp

index e6c83f7a42b20cb674675fd6608f9da5dd9f7fa6..cd4570ebe7368b9d274e32d7a792170c8ed19aa1 100644 (file)
--- a/libshuff.cpp
+++ b/libshuff.cpp
@@ -64,7 +64,6 @@ vector<vector<vector<double> > > Libshuff::getSavedMins(){
  
  vector<double> Libshuff::getMinX(int x){
         try{
-
                 vector<double> minX(groupSizes[x], 0);
                 for(int i=0;i<groupSizes[x];i++){
                         minX[i] = (groupSizes[x] > 1 ? (i==0 ? matrix->get(groups[x][0], groups[x][1]) : matrix->get(groups[x][i], groups[x][0])) : 0.0); //get the first value in row i of this block
diff --git a/libshuffcommand.cpp b/libshuffcommand.cpp

index 2d84767bc998d1e8db1d1ab40e7f9836f2045595..74acd841b676acc429df6df4e303210d82ae92ba 100644 (file)
--- a/libshuffcommand.cpp
+++ b/libshuffcommand.cpp
@@ -22,7 +22,6 @@
  
  LibShuffCommand::LibShuffCommand(string option){
         try {
-               
                 globaldata = GlobalData::getInstance();
                 abort = false;
                 Groups.clear();
@@ -74,19 +73,19 @@ LibShuffCommand::LibShuffCommand(string option){
                         userform = validParameter.validFile(parameters, "form", false);                 if (userform == "not found") { userform = "integral"; }
                         
                         if (abort == false) {
-                       
+               
                                 matrix = globaldata->gMatrix;                           //get the distance matrix
                                 setGroups();                                                            //set the groups to be analyzed and sorts them
-                               
+       
                                 /********************************************************************************************/
                                 //this is needed because when we read the matrix we sort it into groups in alphabetical order
                                 //the rest of the command and the classes used in this command assume specific order
                                 /********************************************************************************************/
                                 matrix->setGroups(globaldata->gGroupmap->namesOfGroups);
                                 vector<int> sizes;
-                               for (int i = 0; i < globaldata->gGroupmap->namesOfGroups.size(); i++) {  sizes.push_back(globaldata->gGroupmap->getNumSeqs(globaldata->gGroupmap->namesOfGroups[i]));  }
+                               for (int i = 0; i < globaldata->gGroupmap->namesOfGroups.size(); i++) {   sizes.push_back(globaldata->gGroupmap->getNumSeqs(globaldata->gGroupmap->namesOfGroups[i]));  }
                                 matrix->setSizes(sizes);
-                               
+                       
  
                                 if(userform == "discrete"){
                                         form = new DLibshuff(matrix, iters, step, cutOff);
@@ -132,10 +131,10 @@ int LibShuffCommand::execute(){
         try {
                 
                 if (abort == true) {    return 0;       }
-               
+       
                 savedDXYValues = form->evaluateAll();
                 savedMinValues = form->getSavedMins();
-               
+       
                 pValueCounts.resize(numGroups);
                 for(int i=0;i<numGroups;i++){
                         pValueCounts[i].assign(numGroups, 0);
diff --git a/mgclustercommand.cpp b/mgclustercommand.cpp

new file mode 100644 (file)

index 0000000..d5b84a8
--- /dev/null
+++ b/mgclustercommand.cpp
@@ -0,0 +1,506 @@
+/*
+ *  mgclustercommand.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 12/11/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "mgclustercommand.h"
+#include "readcluster.h"
+
+//**********************************************************************************************************************
+MGClusterCommand::MGClusterCommand(string option){
+       try {
+               globaldata = GlobalData::getInstance();
+               abort = false;
+               
+               //allow user to run help
+               if(option == "help") { help(); abort = true; }
+               
+               else {
+                       //valid paramters for this command
+                       string Array[] =  {"blast", "method", "name", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge"};
+                       vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
+                       
+                       OptionParser parser(option);
+                       map<string, string> parameters = parser.getParameters();
+                       
+                       ValidParameters validParameter;
+               
+                       //check to make sure all parameters are valid for command
+                       for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
+                               if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
+                       }
+                       
+                       //check for required parameters
+                       blastfile = validParameter.validFile(parameters, "blast", true);
+                       if (blastfile == "not open") { abort = true; }  
+                       else if (blastfile == "not found") { blastfile = ""; }
+                       
+                       namefile = validParameter.validFile(parameters, "name", true);
+                       if (namefile == "not open") { abort = true; }   
+                       else if (namefile == "not found") { namefile = ""; }
+                       
+                       if ((blastfile == "")) { mothurOut("When executing a mgcluster command you must provide a blastfile."); mothurOutEndLine(); abort = true; }
+                       
+                       //check for optional parameter and set defaults
+                       string temp;
+                       temp = validParameter.validFile(parameters, "precision", false);                if (temp == "not found") { temp = "100"; }
+                       precisionLength = temp.length();
+                       convert(temp, precision); 
+                       
+                       temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "0.70"; }
+                       convert(temp, cutoff); 
+                       cutoff += (5 / (precision * 10.0));
+                       
+                       method = validParameter.validFile(parameters, "method", false);
+                       if (method == "not found") { method = "furthest"; }
+                       
+                       if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
+                       else { mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); mothurOutEndLine(); abort = true; }
+
+                       temp = validParameter.validFile(parameters, "length", false);                   if (temp == "not found") { temp = "5"; }
+                       convert(temp, length); 
+                       
+                       temp = validParameter.validFile(parameters, "penalty", false);                  if (temp == "not found") { temp = "0.10"; }
+                       convert(temp, penalty); 
+                       
+                       temp = validParameter.validFile(parameters, "min", false);                              if (temp == "not found") { temp = "true"; }
+                       minWanted = isTrue(temp); 
+                       
+                       temp = validParameter.validFile(parameters, "merge", false);                    if (temp == "not found") { temp = "true"; }
+                       merge = isTrue(temp); 
+                       
+                       temp = validParameter.validFile(parameters, "hcluster", false);                 if (temp == "not found") { temp = "false"; }
+                       hclusterWanted = isTrue(temp); 
+               }
+
+       }
+       catch(exception& e) {
+               errorOut(e, "MGClusterCommand", "MGClusterCommand");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+
+void MGClusterCommand::help(){
+       try {
+               mothurOut("The mgcluster command parameter options are blast, name, cutoff, precision, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n");
+               mothurOut("The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n");
+               mothurOut("This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n");
+               mothurOut("The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n");
+               mothurOut("The precision parameter's default value is 100. \n");
+               mothurOut("The acceptable mgcluster methods are furthest, nearest and average.  If no method is provided then furthest is assumed.\n\n");       
+               mothurOut("The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n");
+               mothurOut("The length parameter is used to specify the minimum overlap required.  The default is 5.\n");
+               mothurOut("The penalty parameter is used to adjust the error rate.  The default is 0.10.\n");
+               mothurOut("The merge parameter allows you to shut off merging based on overlaps and just cluster.  By default merge is true, meaning you want to merge.\n");
+               mothurOut("The hcluster parameter allows you to use the hcluster algorithm when clustering.  This may be neccessary if your file is too large to fit into RAM. The default is false.\n");
+               mothurOut("The mgcluster command should be in the following format: \n");
+               mothurOut("mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n");
+               mothurOut("Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n\n");
+       }
+       catch(exception& e) {
+               errorOut(e, "MGClusterCommand", "help");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+MGClusterCommand::~MGClusterCommand(){}
+//**********************************************************************************************************************
+int MGClusterCommand::execute(){
+       try {
+               
+               if (abort == true) {    return 0;       }
+               
+               //read names file
+               if (namefile != "") {
+                       nameMap = new NameAssignment(namefile);
+                       nameMap->readMap();
+               }else{ nameMap= new NameAssignment(); }
+               
+               string fileroot = getRootName(blastfile);
+               string tag = "";
+               time_t start;
+               float previousDist = 0.00000;
+               float rndPreviousDist = 0.00000;
+               
+               //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
+               //must remember to delete those objects here since readBlast does not
+               read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
+               read->read(nameMap);
+               
+               list = new ListVector(nameMap->getListVector());
+               RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
+               
+               start = time(NULL);
+               oldList = *list;
+               
+               if (!hclusterWanted) {
+                       //get distmatrix and overlap
+                       SparseMatrix* distMatrix = read->getDistMatrix();
+                       overlapMatrix = read->getOverlapMatrix(); //already sorted by read 
+                       delete read;
+               
+                       //create cluster
+                       if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, distMatrix); }
+                       else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, distMatrix); }
+                       else if(method == "average"){   cluster = new AverageLinkage(rabund, list, distMatrix); }
+                       tag = cluster->getTag();
+                       cluster->setMapWanted(true);
+                       
+                       //open output files
+                       openOutputFile(fileroot+ tag + ".list",  listFile);
+                       openOutputFile(fileroot+ tag + ".rabund",  rabundFile);
+                       openOutputFile(fileroot+ tag + ".sabund",  sabundFile);
+                       
+                       //cluster using cluster classes
+                       while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
+                               
+                               cluster->update();
+                               float dist = distMatrix->getSmallDist();
+                               float rndDist = roundDist(dist, precision);
+                               
+                               if(previousDist <= 0.0000 && dist != previousDist){
+                                       oldList.setLabel("unique");
+                                       printData(&oldList);
+                               }
+                               else if(rndDist != rndPreviousDist){
+                                       if (merge) {
+                                               map<string, int> seq2Bin = cluster->getSeqtoBin();
+                                               ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
+                                               temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
+                                               printData(temp);
+                                               delete temp;
+                                       }else{
+                                               oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
+                                               printData(&oldList);
+                                       }
+                               }
+                               
+                               previousDist = dist;
+                               rndPreviousDist = rndDist;
+                               oldList = *list;
+                       }
+                       
+                       if(previousDist <= 0.0000){
+                               oldList.setLabel("unique");
+                               printData(&oldList);
+                       }
+                       else if(rndPreviousDist<cutoff){
+                               if (merge) {
+                                       map<string, int> seq2Bin = cluster->getSeqtoBin();
+                                       ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
+                                       temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
+                                       printData(temp);
+                                       delete temp;
+                               }else{
+                                       oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
+                                       printData(&oldList);
+                               }
+                       }
+                       
+                       //free memory
+                       overlapMatrix.clear();
+                       delete distMatrix;
+                       delete cluster;
+                       
+               }else { //use hcluster to cluster
+                       tag = "fn";
+                       //get distmatrix and overlap
+                       overlapFile = read->getOverlapFile();
+                       distFile = read->getDistFile(); 
+                       delete read;
+               
+                       //sort the distance and overlap files
+                       sortHclusterFiles(distFile, overlapFile);
+               
+                       //create cluster
+                       hcluster = new HCluster(rabund, list);
+                       hcluster->setMapWanted(true);
+                       
+                       //open output files
+                       openOutputFile(fileroot+ tag + ".list",  listFile);
+                       openOutputFile(fileroot+ tag + ".rabund",  rabundFile);
+                       openOutputFile(fileroot+ tag + ".sabund",  sabundFile);
+                       
+                       vector<DistNode> seqs; seqs.resize(1); // to start loop
+                       exitedBreak = false;  //lets you know if there is a distance stored in next
+                       
+                       ifstream inHcluster;
+                       openInputFile(distFile, inHcluster);
+
+                       while (seqs.size() != 0){
+               
+                               seqs = getSeqs(inHcluster);
+                               random_shuffle(seqs.begin(), seqs.end());
+                               
+                               for (int i = 0; i < seqs.size(); i++) {  //-1 means skip me
+                                       
+                                       if (seqs[i].seq1 != seqs[i].seq2) {
+               
+                                               hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
+       
+                                               float rndDist = roundDist(seqs[i].dist, precision);
+                                                                                               
+                                               if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
+                                                       oldList.setLabel("unique");
+                                                       printData(&oldList);
+                                               }
+                                               else if((rndDist != rndPreviousDist)){
+                                                       if (merge) {
+                                                               map<string, int> seq2Bin = hcluster->getSeqtoBin();
+                                                               ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
+                                                               temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
+                                                               printData(temp);
+                                                               delete temp;
+                                                       }else{
+                                                               oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
+                                                               printData(&oldList);
+                                                       }
+                                               }
+                                               
+                                               previousDist = seqs[i].dist;
+                                               rndPreviousDist = rndDist;
+                                               oldList = *list;
+                                       }
+                               }
+                       }
+                       inHcluster.close();
+                       
+                       if(previousDist <= 0.0000){
+                               oldList.setLabel("unique");
+                               printData(&oldList);
+                       }
+                       else if(rndPreviousDist<cutoff){
+                               if (merge) {
+                                       map<string, int> seq2Bin = hcluster->getSeqtoBin();
+                                       ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
+                                       temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
+                                       printData(temp);
+                                       delete temp;
+                               }else{
+                                       oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
+                                       printData(&oldList);
+                               }
+                       }
+                       
+                       delete hcluster;
+                       remove(distFile.c_str());
+                       remove(overlapFile.c_str());
+               }
+               
+               delete list; 
+               delete rabund;
+               listFile.close();
+               sabundFile.close();
+               rabundFile.close();
+       
+               globaldata->setListFile(fileroot+ tag + ".list");
+               globaldata->setFormat("list");
+                       
+               mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); mothurOutEndLine();
+                       
+               return 0;
+       }
+       catch(exception& e) {
+               errorOut(e, "MGClusterCommand", "execute");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+void MGClusterCommand::printData(ListVector* mergedList){
+       try {
+               mergedList->print(listFile);
+               mergedList->getRAbundVector().print(rabundFile);
+               
+               SAbundVector sabund = mergedList->getSAbundVector();
+
+               sabund.print(cout);
+               sabund.print(sabundFile);
+       }
+       catch(exception& e) {
+               errorOut(e, "MGClusterCommand", "printData");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+//this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
+//that are used to cluster by distance.  this is done so that the overlapping data does not have more influenece than the distance data.
+ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
+       try {
+               //create new listvector so you don't overwrite the clustering
+               ListVector* newList = new ListVector(oldList);
+               bool done = false;
+               ifstream inOverlap;
+               int count = 0;
+               
+               if (hclusterWanted) {  
+                       openInputFile(overlapFile, inOverlap);  
+                       if (inOverlap.eof()) {  done = true;  }
+               }else { if (overlapMatrix.size() == 0)  {  done = true;  } } 
+               
+               while (!done) {
+                       
+                       //get next overlap
+                       DistNode overlapNode;
+                       if (!hclusterWanted) {  
+                               if (count < overlapMatrix.size()) { //do we have another node in the matrix
+                                       overlapNode = overlapMatrix[count];
+                                       count++;
+                               }else { break; }
+                       }else { 
+                               if (!inOverlap.eof()) {
+                                       string firstName, secondName;
+                                       float overlapDistance;
+                                       inOverlap >> firstName >> secondName >> overlapDistance; gobble(inOverlap);
+                                       
+                                       map<string,int>::iterator itA = nameMap->find(firstName);
+                                       map<string,int>::iterator itB = nameMap->find(secondName);
+                                       if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
+                                       if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
+                                       
+                                       overlapNode.seq1 = itA->second;
+                                       overlapNode.seq2 = itB->second;
+                                       overlapNode.dist = overlapDistance;
+                               }else { inOverlap.close(); break; }
+                       } 
+               
+                       if (overlapNode.dist < dist) {
+                               //get names of seqs that overlap
+                               string name1 = nameMap->get(overlapNode.seq1);
+                               string name2 = nameMap->get(overlapNode.seq2);
+                               
+                               //use binInfo to find out if they are already in the same bin
+                               int binKeep = binInfo[name1];
+                               int binRemove = binInfo[name2];
+                               
+                               //if not merge bins and update binInfo
+                               if(binKeep != binRemove) {
+                                       //save names in old bin
+                                       string names = list->get(binRemove);
+                                       
+                                       //merge bins into name1s bin
+                                       newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
+                                       newList->set(binRemove, "");    
+                                       
+                                       //update binInfo
+                                       while (names.find_first_of(',') != -1) { 
+                                               //get name from bin
+                                               string name = names.substr(0,names.find_first_of(','));
+                                               //save name and bin number
+                                               binInfo[name] = binKeep;
+                                               names = names.substr(names.find_first_of(',')+1, names.length());
+                                       }
+                                       
+                                       //get last name
+                                       binInfo[names] = binKeep;
+                               }
+                               
+                       }else { done = true; }
+               }
+               
+               //return listvector
+               return newList;
+                               
+       }
+       catch(exception& e) {
+               errorOut(e, "MGClusterCommand", "mergeOPFs");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
+       try {
+               //sort distFile
+               ReadCluster* readCluster = new ReadCluster(unsortedDist, cutoff);       
+               readCluster->setFormat("column");
+               readCluster->read(nameMap);
+               string sortedDistFile = readCluster->getOutputFile();
+               ListVector* extralist = readCluster->getListVector();  //we get this so we can delete it and not have a memory leak
+               
+               remove(unsortedDist.c_str());  //delete unsorted file
+               distFile = sortedDistFile;
+               delete extralist;
+               delete readCluster;
+               
+               //sort overlap file
+               readCluster = new ReadCluster(unsortedOverlap, cutoff);         
+               readCluster->setFormat("column");
+               readCluster->read(nameMap);
+               string sortedOverlapFile = readCluster->getOutputFile();
+               extralist = readCluster->getListVector();  //we get this so we can delete it and not have a memory leak
+               
+               remove(unsortedOverlap.c_str());  //delete unsorted file
+               overlapFile = sortedOverlapFile;
+               delete extralist;
+               delete readCluster;
+       }
+       catch(exception& e) {
+               errorOut(e, "MGClusterCommand", "sortHclusterFiles");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+vector<DistNode> MGClusterCommand::getSeqs(ifstream& filehandle){
+       try {
+               string firstName, secondName;
+               float distance, prevDistance;
+               vector<DistNode> sameSeqs;
+               prevDistance = -1;
+       
+               //if you are not at the beginning of the file
+               if (exitedBreak) { 
+                       sameSeqs.push_back(next);
+                       prevDistance = next.dist;
+                       exitedBreak = false;
+               }
+       
+               //get entry
+               while (!filehandle.eof()) {
+                       
+                       filehandle >> firstName >> secondName >> distance;    gobble(filehandle); 
+       
+                       //save first one
+                       if (prevDistance == -1) { prevDistance = distance; }
+                       
+                       map<string,int>::iterator itA = nameMap->find(firstName);
+                       map<string,int>::iterator itB = nameMap->find(secondName);
+                       if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
+                       if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
+               
+                       //using cutoff
+                       if (distance > cutoff) { break; }
+               
+                       if (distance != -1) { //-1 means skip me
+                       
+                               //are the distances the same
+                               if (distance == prevDistance) { //save in vector
+                                       DistNode temp(itA->second, itB->second, distance);
+                                       sameSeqs.push_back(temp);
+                                       exitedBreak = false;
+                               }else{ 
+                                       next.seq1 = itA->second;
+                                       next.seq2 = itB->second;
+                                       next.dist = distance;
+                                       exitedBreak = true;
+                                       break;
+                               }
+                       }
+               }
+               
+               return sameSeqs;
+       }
+       catch(exception& e) {
+               errorOut(e, "MGClusterCommand", "getSeqs");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+
+
+
+
+
diff --git a/mgclustercommand.h b/mgclustercommand.h

new file mode 100644 (file)

index 0000000..9aa86d4
--- /dev/null
+++ b/mgclustercommand.h
@@ -0,0 +1,61 @@
+#ifndef MGCLUSTERCOMMAND_H
+#define MGCLUSTERCOMMAND_H
+
+/*
+ *  mgclustercommand.h
+ *  Mothur
+ *
+ *  Created by westcott on 12/11/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "command.hpp"
+#include "readblast.h"
+#include "sparsematrix.hpp"
+#include "nameassignment.hpp"
+#include "globaldata.hpp"
+#include "cluster.hpp"
+#include "hcluster.h"
+
+/**********************************************************************/
+
+class MGClusterCommand : public Command {
+
+public:
+       MGClusterCommand(string);
+       ~MGClusterCommand();
+       int execute();
+       void help();
+       
+private:
+       GlobalData* globaldata;
+       ReadBlast* read;
+       NameAssignment* nameMap;
+       Cluster* cluster;
+       HCluster* hcluster;
+       ListVector* list;
+       ListVector oldList;
+       vector<DistNode> overlapMatrix;
+       DistNode next;
+
+       
+       string blastfile, method, namefile, overlapFile, distFile;
+       ofstream sabundFile, rabundFile, listFile;
+       float cutoff, penalty;
+       int precision, length, precisionLength;
+       bool abort, minWanted, hclusterWanted, exitedBreak, merge;
+       
+       void printData(ListVector*);
+       ListVector* mergeOPFs(map<string, int>, float);
+       void sortHclusterFiles(string, string);
+       vector<DistNode> getSeqs(ifstream&);
+
+};
+
+/**********************************************************************/
+
+#endif
+
+
+
diff --git a/nameassignment.cpp b/nameassignment.cpp

index 925608659a4261fc6924d23b16bbcafb13ed3558..d098a48460d3331e14529cb0c53884465f71a8cf 100644 (file)
--- a/nameassignment.cpp
+++ b/nameassignment.cpp
@@ -28,6 +28,7 @@ void NameAssignment::readMap(){
  //                     data[firstCol] = secondCol;                     //store data in map
  
                         list.push_back(secondCol);              //adds data's value to list
+                       reverse[rowIndex] = firstCol;
                         (*this)[firstCol] = rowIndex++;
                         gobble(fileHandle);
                 }
@@ -45,6 +46,7 @@ void NameAssignment::push_back(string name) {
         
                 int num = (*this).size();
                 (*this)[name] = num;
+               reverse[num] = name;
                 
                 list.push_back(name);
         }
@@ -64,11 +66,12 @@ ListVector NameAssignment::getListVector(void){
  
  //**********************************************************************************************************************
  
-void NameAssignment::print(void){
+void NameAssignment::print(ostream& out){
         try {
                 map<string,int>::iterator it;
+cout << (*this).size() << endl;
                 for(it = (*this).begin(); it!=(*this).end(); it++){
-                       mothurOut(it->first + "\t" + toString(it->second)); mothurOutEndLine();  //prints out keys and values of the map this.
+                       out << it->first << '\t' <<  it->second << endl;  //prints out keys and values of the map this.
                 }
         }
         catch(exception& e) {
@@ -84,6 +87,12 @@ int NameAssignment::get(string key){
         return  (*this)[key];   
  
  }
+//**********************************************************************************************************************
+
+string NameAssignment::get(int key){
+       
+       return  reverse[key];   
  
+}
  //**********************************************************************************************************************
  
diff --git a/nameassignment.hpp b/nameassignment.hpp

index 2470a84a630406eb8ecf61893398477b95c22c57..3ccb4256524eb67434b0d98fcc1c9d9bf3471935 100644 (file)
--- a/nameassignment.hpp
+++ b/nameassignment.hpp
@@ -11,11 +11,13 @@ public:
         void readMap();
         ListVector getListVector();
         int get(string);
-       void print();
+       string get(int);
+       void print(ostream&);
         void push_back(string);
  private:
         ifstream fileHandle;
         ListVector list;
+       map<int, string> reverse;
  };
  
  
diff --git a/readblast.cpp b/readblast.cpp

new file mode 100644 (file)

index 0000000..7a092f9
--- /dev/null
+++ b/readblast.cpp
@@ -0,0 +1,321 @@
+/*
+ *  readblast.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 12/10/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "readblast.h"
+#include "progress.hpp"
+
+//********************************************************************************************************************
+//sorts lowest to highest
+inline bool compareOverlap(DistNode left, DistNode right){
+       return (left.dist < right.dist);        
+} 
+/*********************************************************************************************/
+ReadBlast::ReadBlast(string file, float c, float p, int l, bool m, bool h) : blastfile(file), cutoff(c), penalty(p), length(l), minWanted(m), hclusterWanted(h) {
+       try {
+               matrix = NULL;
+       }
+       catch(exception& e) {
+               errorOut(e, "ReadBlast", "ReadBlast");
+               exit(1);
+       }
+} 
+/*********************************************************************************************/
+//assumptions about the blast file: 
+//1. if duplicate lines occur the first line is always best and is chosen
+//2. blast scores are grouped together, ie. a a .... score, a b .... score, a c ....score...
+void ReadBlast::read(NameAssignment* nameMap) {
+       try {
+       
+               //if the user has not given a names file read names from blastfile
+               if (nameMap->size() == 0) { readNames(nameMap);  }
+               int nseqs = nameMap->size();
+
+               ifstream fileHandle;
+               openInputFile(blastfile, fileHandle);
+               
+               string firstName, secondName, eScore, currentRow;
+               string repeatName = "";
+               int count = 1;
+               float distance, thisoverlap, refScore;
+               float percentId; 
+               float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score, lengthThisSeq;
+               
+               ofstream outDist;
+               ofstream outOverlap;
+               
+               //create objects needed for read
+               if (!hclusterWanted) {
+                       matrix = new SparseMatrix();
+               }else{
+                       overlapFile = getRootName(blastfile) + "overlap.dist";
+                       distFile = getRootName(blastfile) + "hclusterDists.dist";
+                       
+                       openOutputFile(overlapFile, outOverlap);
+                       openOutputFile(distFile, outDist);
+               }
+       
+               Progress* reading = new Progress("Reading blast:     ", nseqs * nseqs);
+               
+               //this is used to quickly find if we already have a distance for this combo
+               vector< map<int,float> > dists;  dists.resize(nseqs);  //dists[0][1] = distance from seq0 to seq1
+               map<int, float> thisRowsBlastScores;
+               
+               if (!fileHandle.eof()) {
+                       //read in line from file
+                       fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
+                       gobble(fileHandle);
+                       
+                       currentRow = firstName;
+                       lengthThisSeq = numBases;
+                       repeatName = firstName + secondName;
+                       
+                       if (firstName == secondName) {   refScore = score;  }
+                       else{
+                               //convert name to number
+                               map<string,int>::iterator itA = nameMap->find(firstName);
+                               map<string,int>::iterator itB = nameMap->find(secondName);
+                               if(itA == nameMap->end()){   cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n";  exit(1);  }
+                               if(itB == nameMap->end()){   cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
+                               
+                               thisRowsBlastScores[itB->second] = score;
+                               
+                               //calc overlap score
+                               thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
+                               
+                               //if there is a valid overlap, add it
+                               if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
+                                       if (!hclusterWanted) {
+                                               DistNode overlapValue(itA->second, itB->second, thisoverlap);
+                                               overlap.push_back(overlapValue);
+                                       }else {
+                                               outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
+                                       }
+                               }
+                       }
+               }else { mothurOut("Error in your blast file, cannot read."); mothurOutEndLine(); exit(1); }
+
+                               
+               //read file
+               while(!fileHandle.eof()){  
+                       
+                       //read in line from file
+                       fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
+                       //cout << firstName << '\t' << secondName << '\t' << percentId << '\t' << numBases << '\t' << mismatch << '\t' << gap << '\t' << startQuery << '\t' << endQuery << '\t' << startRef << '\t' << endRef << '\t' << eScore << '\t' << score << endl;       
+                       gobble(fileHandle);
+                       
+                       string temp = firstName + secondName; //to check if this file has repeat lines, ie. is this a blast instead of a blscreen file
+                       
+                       //if this is a new pairing
+                       if (temp != repeatName) {
+                               repeatName = temp; 
+                               
+                               if (currentRow == firstName) {
+                                       //cout << "first = " << firstName << " second = " << secondName << endl;        
+                                       if (firstName == secondName) { 
+                                               refScore = score;  
+                                               reading->update((count + nseqs));  
+                                               count++; 
+                                       }else{
+                                               //convert name to number
+                                               map<string,int>::iterator itA = nameMap->find(firstName);
+                                               map<string,int>::iterator itB = nameMap->find(secondName);
+                                               if(itA == nameMap->end()){   cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n";  exit(1);  }
+                                               if(itB == nameMap->end()){   cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
+                                               
+                                               //save score
+                                               thisRowsBlastScores[itB->second] = score;
+                                                       
+                                               //calc overlap score
+                                               thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
+                                               
+                                               //if there is a valid overlap, add it
+                                               if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
+                                                       if (!hclusterWanted) {
+                                                               DistNode overlapValue(itA->second, itB->second, thisoverlap);
+                                                               //cout << "overlap = " << itA->second << '\t' << itB->second << '\t' << thisoverlap << endl;
+                                                               overlap.push_back(overlapValue);
+                                                       }else {
+                                                               outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
+                                                       }
+                                               }
+                                       } //end else
+                               }else { //end row
+                                       //convert blast scores to distance and add cell to sparse matrix if we can
+                                       map<int, float>::iterator it;
+                                       map<int, float>::iterator itDist;
+                                       for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {  
+                                               distance = 1.0 - (it->second / refScore);
+                                               
+                                               //do we already have the distance calculated for b->a
+                                               map<string,int>::iterator itA = nameMap->find(currentRow);
+                                               itDist = dists[it->first].find(itA->second);
+                                               
+                                               //if we have it then compare
+                                               if (itDist != dists[it->first].end()) {
+                                                       //if you want the minimum blast score ratio, then pick max distance
+                                                       if(minWanted) {  distance = max(itDist->second, distance);  }
+                                                       else{   distance = min(itDist->second, distance);  }
+                                                       
+                                                       //is this distance below cutoff
+                                                       if (distance < cutoff) {
+                                                               if (!hclusterWanted) {
+                                                                       PCell value(itA->second, it->first, distance);
+                                                                       matrix->addCell(value);
+                                                               }else{
+                                                                       outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
+                                                               }
+                                                       }
+                                                       //not going to need this again
+                                                       dists[it->first].erase(itDist);
+                                               }else { //save this value until we get the other ratio
+                                                       dists[itA->second][it->first] = distance;
+                                               }
+                                       }
+                                       //clear out last rows info
+                                       thisRowsBlastScores.clear();
+                                       
+                                       currentRow = firstName;
+                                       lengthThisSeq = numBases;
+                                       
+                                       //add this row to thisRowsBlastScores
+                                       if (firstName == secondName) {   refScore = score;  }
+                                       else{ //add this row to thisRowsBlastScores
+                                               
+                                               //convert name to number
+                                               map<string,int>::iterator itA = nameMap->find(firstName);
+                                               map<string,int>::iterator itB = nameMap->find(secondName);
+                                               if(itA == nameMap->end()){   cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n";  exit(1);  }
+                                               if(itB == nameMap->end()){   cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
+                                               
+                                               thisRowsBlastScores[itB->second] = score;
+                                               
+                                               //calc overlap score
+                                               thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
+                                               
+                                               //if there is a valid overlap, add it
+                                               if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
+                                                       if (!hclusterWanted) {
+                                                               DistNode overlapValue(itA->second, itB->second, thisoverlap);
+                                                               overlap.push_back(overlapValue);
+                                                       }else {
+                                                               outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
+                                                       }
+                                               }
+                                       }
+                               }//end if current row
+                       }//end if repeat
+               }//end while
+               
+               //get last rows info stored
+               //convert blast scores to distance and add cell to sparse matrix if we can
+               map<int, float>::iterator it;
+               map<int, float>::iterator itDist;
+               for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {  
+                       distance = 1.0 - (it->second / refScore);
+                       
+                       //do we already have the distance calculated for b->a
+                       map<string,int>::iterator itA = nameMap->find(currentRow);
+                       itDist = dists[it->first].find(itA->second);
+                       
+                       //if we have it then compare
+                       if (itDist != dists[it->first].end()) {
+                               //if you want the minimum blast score ratio, then pick max distance
+                               if(minWanted) {  distance = max(itDist->second, distance);  }
+                               else{   distance = min(itDist->second, distance);  }
+                               
+                               //is this distance below cutoff
+                               if (distance < cutoff) {
+                                       if (!hclusterWanted) {
+                                               PCell value(itA->second, it->first, distance);
+                                               matrix->addCell(value);
+                                       }else{
+                                               outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl;
+                                       }
+                               }
+                               //not going to need this again
+                               dists[it->first].erase(itDist);
+                       }else { //save this value until we get the other ratio
+                               dists[itA->second][it->first] = distance;
+                       }
+               }
+               //clear out info
+               thisRowsBlastScores.clear();
+               dists.clear();
+               
+               if (!hclusterWanted) {
+                       sort(overlap.begin(), overlap.end(), compareOverlap);
+               }else {
+                       outDist.close();
+                       outOverlap.close();
+               }
+               
+               reading->finish();
+               delete reading;
+               fileHandle.close();
+       }
+       catch(exception& e) {
+               errorOut(e, "ReadBlast", "read");
+               exit(1);
+       }
+} 
+/*********************************************************************************************/
+void ReadBlast::readNames(NameAssignment* nameMap) {
+       try {
+               mothurOut("Reading names... "); cout.flush();
+               
+               string name, hold, prevName;
+               int num = 1;
+               
+               ifstream in;
+               openInputFile(blastfile, in);
+               
+               //read first line
+               in >> prevName;
+               for (int i = 0; i < 11; i++) {  in >> hold;  }
+               gobble(in);
+               
+               //save name in nameMap
+               nameMap->push_back(prevName);
+               
+               while (!in.eof()) {
+                       
+                       //read line
+                       in >> name;
+                       for (int i = 0; i < 11; i++) {  in >> hold;  }
+                       gobble(in);
+                       
+                       //is this a new name?
+                       if (name != prevName) {
+                               prevName = name;
+                               nameMap->push_back(name);
+                               num++;
+                       }
+               }
+       
+               in.close();
+               
+               //write out names file
+               //string outNames = getRootName(blastfile) + "names";
+               //ofstream out;
+               //openOutputFile(outNames, out);
+               //nameMap->print(out);
+               //out.close();
+               
+               mothurOut(toString(num) + " names read."); mothurOutEndLine();
+               
+       }
+       catch(exception& e) {
+               errorOut(e, "ReadBlast", "readNames");
+               exit(1);
+       }
+} 
+/*********************************************************************************************/
+
+
+
diff --git a/readblast.h b/readblast.h

new file mode 100644 (file)

index 0000000..0b47b59
--- /dev/null
+++ b/readblast.h
@@ -0,0 +1,59 @@
+#ifndef READBLAST_H
+#define READBLAST_H
+/*
+ *  readblast.h
+ *  Mothur
+ *
+ *  Created by westcott on 12/10/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+#include "sparsematrix.hpp"
+#include "nameassignment.hpp"
+
+/****************************************************************************************/
+struct DistNode {
+       int seq1;
+       int seq2;
+       float dist;
+       DistNode(int s1, int s2, float d) : seq1(s1), seq2(s2), dist(d) {}
+       DistNode() {}
+       ~DistNode() {}
+};
+/****************************************************************************************/
+
+//Note: this class creates a sparsematrix and list if the read is executed, but does not delete them on deconstruction.
+//the user of this object is responsible for deleting the matrix and list if they call the read or there will be a memory leak
+//it is done this way so the read can be deleted and the information still used.
+
+class ReadBlast {
+       
+public:
+       ReadBlast(string, float, float, int, bool, bool); //blastfile, cutoff, penalty, length of overlap, min or max bsr, hclusterWanted
+       ~ReadBlast() {}
+       
+       void read(NameAssignment*);
+       SparseMatrix* getDistMatrix()           {       return matrix;          }
+       vector<DistNode> getOverlapMatrix()     {       return overlap;         }
+       string getOverlapFile()                         {       return overlapFile;     }
+       string getDistFile()                            {       return distFile;        }
+       
+private:
+       string blastfile, overlapFile, distFile;
+       int length;     //number of amino acids overlapped
+       float penalty, cutoff;  //penalty is used to adjust error rate
+       bool minWanted;  //if true choose min bsr, if false choose max bsr
+       bool hclusterWanted;
+       
+       SparseMatrix* matrix;
+       vector<DistNode> overlap;
+       
+       void readNames(NameAssignment*);
+};
+
+/*******************************************************************************************/
+
+#endif
+
diff --git a/readcolumn.cpp b/readcolumn.cpp

index fccc2cd06f8b045e48e4516229fc58d4ebe3e30d..6b8892b6eb1068563e8cf69e30b0274e07e0b15a 100644 (file)
--- a/readcolumn.cpp
+++ b/readcolumn.cpp
@@ -45,10 +45,10 @@ void ReadColumnMatrix::read(NameAssignment* nameMap){
                         map<string,int>::iterator itB = nameMap->find(secondName);\r
                         \r
                         if(itA == nameMap->end()){\r
-                               cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n";\r
+                               cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);\r
                         }\r
                         if(itB == nameMap->end()){\r
-                               cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n";\r
+                               cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);\r
                         }\r
  \r
                         if (distance == -1) { distance = 1000000; }\r
diff --git a/readdistcommand.cpp b/readdistcommand.cpp

index f6b41951962b7ce6b85ae50a9958bc0408c13835..f6cf5db3948ea8da4557fad5b98da360019922e7 100644 (file)
--- a/readdistcommand.cpp
+++ b/readdistcommand.cpp
@@ -164,6 +164,34 @@ int ReadDistCommand::execute(){
                         openInputFile(distFileName, in);
                         matrix = new FullMatrix(in); //reads the matrix file
                         in.close();
+                       
+                       //if files don't match...
+                       if (matrix->getNumSeqs() < groupMap->getNumSeqs()) {  
+                               mothurOut("Your distance file contains " + toString(matrix->getNumSeqs()) + " sequences, and your group file contains " + toString(groupMap->getNumSeqs()) + " sequences.");  mothurOutEndLine();                               
+                               //create new group file
+                               string newGroupFile = getRootName(groupfile) + "editted.groups";
+                               ofstream outGroups;
+                               openOutputFile(newGroupFile, outGroups);
+                               
+                               for (int i = 0; i < matrix->getNumSeqs(); i++) {
+                                       Names temp = matrix->getRowInfo(i);
+                                       outGroups << temp.seqName << '\t' << temp.groupName << endl;
+                               }
+                               outGroups.close();
+                               
+                               mothurOut(newGroupFile + " is a new group file containing only the sequence that are in your distance file. I will read this file instead."); mothurOutEndLine();
+                               
+                               //read new groupfile
+                               delete groupMap; groupMap = NULL;
+                               groupfile = newGroupFile;
+                               globaldata->setGroupFile(groupfile); 
+                               
+                               groupMap = new GroupMap(groupfile);
+                               groupMap->readMap();
+                               
+                               globaldata->gGroupmap = groupMap;
+                       }
+                       
                         //memory leak prevention
                         if (globaldata->gMatrix != NULL) { delete globaldata->gMatrix;  }
                         globaldata->gMatrix = matrix; //save matrix for coverage commands
diff --git a/slibshuff.cpp b/slibshuff.cpp

index 219c7094390ec43767806a0255e033774d31ab4d..1f2d3dfea52fade8645c2afd2eebf239e8fa7231 100644 (file)
--- a/slibshuff.cpp
+++ b/slibshuff.cpp
@@ -24,7 +24,6 @@ float SLibshuff::evaluatePair(int i, int j){
  vector<vector<double> > SLibshuff::evaluateAll(){
         try{
                 savedMins.resize(numGroups);
-               
                 vector<vector<double> > dCXYValues(numGroups);
  
                 for(int i=0;i<numGroups;i++){
diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp

index ca7a927134f1f904401ab017d456ff1f6859a569..f7773244bc37d888d4c44818cdd8dd1d020444ba 100644 (file)
--- a/trimseqscommand.cpp
+++ b/trimseqscommand.cpp
@@ -200,7 +200,6 @@ int TrimSeqsCommand::execute(){
                                 }
                                 if(minLength > 0 || maxLength > 0){
                                         success = cullLength(currSeq);
-                                       if ((currSeq.getUnaligned().length() > 300) && (success)) {  cout << "too long " << currSeq.getUnaligned().length() << endl;  }
                                         if(!success){   trashCode += 'l'; }
                                 }
                                 if(maxHomoP > 0){
author	westcott <westcott>
	Mon, 21 Dec 2009 15:03:14 +0000 (15:03 +0000)
committer	westcott <westcott>
	Mon, 21 Dec 2009 15:03:14 +0000 (15:03 +0000)
Mothur.xcodeproj/project.pbxproj		patch \| blob \| history
aligncommand.cpp		patch \| blob \| history
aligncommand.h		patch \| blob \| history
alignmentdb.cpp		patch \| blob \| history
cluster.cpp		patch \| blob \| history
cluster.hpp		patch \| blob \| history
commandfactory.cpp		patch \| blob \| history
engine.cpp		patch \| blob \| history
engine.hpp		patch \| blob \| history
fullmatrix.h		patch \| blob \| history
getsharedotucommand.cpp		patch \| blob \| history
getsharedotucommand.h		patch \| blob \| history
hcluster.cpp		patch \| blob \| history
hcluster.h		patch \| blob \| history
hclustercommand.cpp		patch \| blob \| history
hclustercommand.h		patch \| blob \| history
libshuff.cpp		patch \| blob \| history
libshuffcommand.cpp		patch \| blob \| history
mgclustercommand.cpp	[new file with mode: 0644]	patch \| blob
mgclustercommand.h	[new file with mode: 0644]	patch \| blob
nameassignment.cpp		patch \| blob \| history
nameassignment.hpp		patch \| blob \| history
readblast.cpp	[new file with mode: 0644]	patch \| blob
readblast.h	[new file with mode: 0644]	patch \| blob
readcolumn.cpp		patch \| blob \| history
readdistcommand.cpp		patch \| blob \| history
slibshuff.cpp		patch \| blob \| history
trimseqscommand.cpp		patch \| blob \| history