]> git.donarmstrong.com Git - mothur.git/commitdiff
added formatmatrix, formatcolumn, and formatphylip classes. Used these classes in...
authorwestcott <westcott>
Thu, 14 Jan 2010 16:13:32 +0000 (16:13 +0000)
committerwestcott <westcott>
Thu, 14 Jan 2010 16:13:32 +0000 (16:13 +0000)
Mothur.xcodeproj/project.pbxproj
formatcolumn.cpp [new file with mode: 0644]
formatcolumn.h [new file with mode: 0644]
formatmatrix.h [new file with mode: 0644]
formatphylip.cpp [new file with mode: 0644]
formatphylip.h [new file with mode: 0644]
getoturepcommand.cpp
getoturepcommand.h
getsharedotucommand.cpp
readdistcommand.cpp

index 16d9d8eb82ef029c9debdd6b4f920e372c0c41b2..7f8568e03a0f9bdbd282ba3a87f38107359e44dc 100644 (file)
                A7B04493106CC3E60046FC83 /* chimeraslayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */; };
                A7B0450E106CEEC90046FC83 /* slayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B0450D106CEEC90046FC83 /* slayer.cpp */; };
                A7D176CF10F2558500159497 /* pcacommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7D176CE10F2558500159497 /* pcacommand.cpp */; };
+               A7D86C7D10FE09AB00865661 /* formatcolumn.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7D86C7C10FE09AB00865661 /* formatcolumn.cpp */; };
+               A7D86C8C10FE09FE00865661 /* formatphylip.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7D86C8B10FE09FE00865661 /* formatphylip.cpp */; };
                A7E0AAB610D2886D00CF407B /* mgclustercommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */; };
                A7E4A783106913F900688F62 /* getsharedotucommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E4A782106913F900688F62 /* getsharedotucommand.cpp */; };
                A7E4A824106A9AD700688F62 /* maligner.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E4A823106A9AD700688F62 /* maligner.cpp */; };
                A7B0450D106CEEC90046FC83 /* slayer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = slayer.cpp; sourceTree = SOURCE_ROOT; };
                A7D176CD10F2558500159497 /* pcacommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pcacommand.h; sourceTree = SOURCE_ROOT; };
                A7D176CE10F2558500159497 /* pcacommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pcacommand.cpp; sourceTree = SOURCE_ROOT; };
+               A7D86C6A10FE084300865661 /* formatmatrix.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = formatmatrix.h; sourceTree = SOURCE_ROOT; };
+               A7D86C7B10FE09AB00865661 /* formatcolumn.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = formatcolumn.h; sourceTree = SOURCE_ROOT; };
+               A7D86C7C10FE09AB00865661 /* formatcolumn.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = formatcolumn.cpp; sourceTree = SOURCE_ROOT; };
+               A7D86C8A10FE09FE00865661 /* formatphylip.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = formatphylip.h; sourceTree = SOURCE_ROOT; };
+               A7D86C8B10FE09FE00865661 /* formatphylip.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = formatphylip.cpp; sourceTree = SOURCE_ROOT; };
                A7E0AAB410D2886D00CF407B /* mgclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mgclustercommand.h; sourceTree = "<group>"; };
                A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mgclustercommand.cpp; sourceTree = "<group>"; };
                A7E4A781106913F900688F62 /* getsharedotucommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getsharedotucommand.h; sourceTree = SOURCE_ROOT; };
                3796441D0FB9B9650081FDB6 /* read */ = {
                        isa = PBXGroup;
                        children = (
+                               A7D86C6A10FE084300865661 /* formatmatrix.h */,
+                               A7D86C7B10FE09AB00865661 /* formatcolumn.h */,
+                               A7D86C7C10FE09AB00865661 /* formatcolumn.cpp */,
+                               A7D86C8A10FE09FE00865661 /* formatphylip.h */,
+                               A7D86C8B10FE09FE00865661 /* formatphylip.cpp */,
                                A727E84810D14568001A8432 /* readblast.h */,
                                A727E84910D14568001A8432 /* readblast.cpp */,
                                A751ACE81098B283003D0911 /* readcluster.h */,
                                A7E0AAB610D2886D00CF407B /* mgclustercommand.cpp in Sources */,
                                A7027F2810DFF86D00BF45FE /* preclustercommand.cpp in Sources */,
                                A7D176CF10F2558500159497 /* pcacommand.cpp in Sources */,
+                               A7D86C7D10FE09AB00865661 /* formatcolumn.cpp in Sources */,
+                               A7D86C8C10FE09FE00865661 /* formatphylip.cpp in Sources */,
                        );
                        runOnlyForDeploymentPostprocessing = 0;
                };
diff --git a/formatcolumn.cpp b/formatcolumn.cpp
new file mode 100644 (file)
index 0000000..e68d85d
--- /dev/null
@@ -0,0 +1,173 @@
+/*
+ *  formatcolumn.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 1/13/10.
+ *  Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "formatcolumn.h"
+#include "progress.hpp"
+
+/***********************************************************************/
+FormatColumnMatrix::FormatColumnMatrix(string df) : filename(df){
+       openInputFile(filename, fileHandle);
+}
+/***********************************************************************/
+
+void FormatColumnMatrix::read(NameAssignment* nameMap){
+       try {           
+
+               string firstName, secondName;
+               float distance;
+               int nseqs = nameMap->size();
+
+               list = new ListVector(nameMap->getListVector());
+       
+               Progress* reading = new Progress("Formatting matrix:     ", nseqs * nseqs);
+
+               int lt = 1;
+               int refRow = 0; //we'll keep track of one cell - Cell(refRow,refCol) - and see if it's transpose
+               int refCol = 0; //shows up later - Cell(refCol,refRow).  If it does, then its a square matrix
+
+               //need to see if this is a square or a triangular matrix...
+               
+               ofstream out;
+               string tempOutFile = filename + ".temp";
+               openOutputFile(tempOutFile, out);
+       
+               while(fileHandle && lt == 1){  //let's assume it's a triangular matrix...
+               
+                       fileHandle >> firstName >> secondName >> distance;      // get the row and column names and distance
+       
+                       map<string,int>::iterator itA = nameMap->find(firstName);
+                       map<string,int>::iterator itB = nameMap->find(secondName);
+                       if(itA == nameMap->end()){      cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);   }
+                       if(itB == nameMap->end()){      cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
+
+                       if (distance == -1) { distance = 1000000; }
+                       
+                       if(distance < cutoff && itA != itB){
+                                                       
+                               if(refRow == refCol){           // in other words, if we haven't loaded refRow and refCol...
+                                       refRow = itA->second;
+                                       refCol = itB->second;
+                                       
+                                       //making it square
+                                       out << itA->second << '\t' << itB->second << '\t' << distance << endl;
+                                       out << itB->second << '\t' << itA->second << '\t' << distance << endl;
+                               }
+                               else if(refRow == itA->second && refCol == itB->second){        lt = 0;         } //you are square
+                               else if(refRow == itB->second && refCol == itA->second){        lt = 0;         }  //you are square
+                               else{   //making it square
+                                       out << itA->second << '\t' << itB->second << '\t' << distance << endl;
+                                       out << itB->second << '\t' << itA->second << '\t' << distance << endl;
+                               }
+                               
+                               reading->update(itA->second * nseqs / 2);
+                       }
+                       gobble(fileHandle);
+               }
+               out.close();
+               fileHandle.close();
+               
+               string squareFile;
+               if(lt == 0){  // oops, it was square
+                       squareFile = filename;
+               }else{ squareFile = tempOutFile; }
+               
+               //sort file by first column so the distances for each row are together
+               string outfile = getRootName(squareFile) + "sorted.dist.temp";
+               
+               //use the unix sort 
+               #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+                       string command = "sort -n " + squareFile + " -o " + outfile;
+                       system(command.c_str());
+               #else //sort using windows sort
+                       string command = "sort " + squareFile + " /O " + outfile;
+                       system(command.c_str());
+               #endif
+               
+
+               //output to new file distance for each row and save positions in file where new row begins
+               ifstream in;
+               openInputFile(outfile, in);
+               
+               distFile = outfile + ".rowFormatted";
+               openOutputFile(distFile, out);
+               
+               rowPos.resize(nseqs, -1);
+               int currentRow;
+               int first, second;
+               float dist;
+               map<int, float> rowMap;
+               map<int, float>::iterator itRow;
+               
+               //get first currentRow
+               in >> first;
+               currentRow = first;
+               
+               string firstString = toString(first);
+               for(int k = 0; k < firstString.length(); k++)  {   in.putback(firstString[k]);  }
+               
+               while(!in.eof()) {
+                       in >> first >> second >> dist; gobble(in);
+                       
+                       if (first != currentRow) {
+                               //save position in file of each new row
+                               rowPos[currentRow] = out.tellp();
+                               
+                               out << currentRow << '\t' << rowMap.size() << '\t';
+                               
+                               for (itRow = rowMap.begin(); itRow != rowMap.end(); itRow++) {
+                                       out << itRow->first << '\t' << itRow->second << '\t';
+                               }
+                               out << endl;
+                               
+                               currentRow = first;
+                               rowMap.clear();
+                               
+                               //save row you just read
+                               rowMap[second] = dist;
+
+                       }else{
+                               rowMap[second] = dist;
+                       }
+               }
+               
+               //print last Row
+               //save position in file of each new row
+               rowPos[currentRow] = out.tellp();
+               
+               out << currentRow << '\t' << rowMap.size() << '\t';
+               
+               for (itRow = rowMap.begin(); itRow != rowMap.end(); itRow++) {
+                       out << itRow->first << '\t' << itRow->second << '\t';
+               }
+               out << endl;
+               
+               
+               in.close();
+               out.close();
+               
+               
+               remove(tempOutFile.c_str());
+               remove(outfile.c_str());
+               
+               reading->finish();
+               list->setLabel("0");
+
+       }
+       catch(exception& e) {
+               errorOut(e, "FormatColumnMatrix", "read");
+               exit(1);
+       }
+}
+
+/***********************************************************************/
+FormatColumnMatrix::~FormatColumnMatrix(){}
+/***********************************************************************/
+
+
+
diff --git a/formatcolumn.h b/formatcolumn.h
new file mode 100644 (file)
index 0000000..c713867
--- /dev/null
@@ -0,0 +1,32 @@
+#ifndef FORMATCOLUMN_H
+#define FORMATCOLUMN_H
+/*
+ *  formatcolumn.h
+ *  Mothur
+ *
+ *  Created by westcott on 1/13/10.
+ *  Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "formatmatrix.h"
+
+/******************************************************/
+
+class FormatColumnMatrix : public FormatMatrix {
+       
+public:
+       FormatColumnMatrix(string);
+       ~FormatColumnMatrix();
+       void read(NameAssignment*);
+       
+private:
+       ifstream fileHandle;
+       string filename;
+       
+};
+
+/******************************************************/
+
+#endif
+
diff --git a/formatmatrix.h b/formatmatrix.h
new file mode 100644 (file)
index 0000000..a02ef6c
--- /dev/null
@@ -0,0 +1,43 @@
+#ifndef FORMATMATRIX_H
+#define FORMATMATRIX_H
+
+/*
+ *  formatmatrix.h
+ *  Mothur
+ *
+ *  Created by westcott on 1/13/10.
+ *  Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+#include "listvector.hpp"
+#include "nameassignment.hpp"
+
+
+//**********************************************************************************************************************
+
+class FormatMatrix {
+
+public:
+       FormatMatrix(){ }
+       virtual ~FormatMatrix() {}
+       
+       virtual void read(NameAssignment*){};
+       
+       void setCutoff(float c)                 {       cutoff = c;                     }
+       ListVector* getListVector()             {       return list;            }
+       string getFormattedFileName()   {       return distFile;        }
+       vector<int> getRowPositions()   {       return rowPos;          }
+       
+protected:
+       ListVector* list;
+       float cutoff;
+       string distFile;
+       vector<int> rowPos;
+};
+
+//**********************************************************************************************************************
+
+#endif
+
diff --git a/formatphylip.cpp b/formatphylip.cpp
new file mode 100644 (file)
index 0000000..4a7878a
--- /dev/null
@@ -0,0 +1,218 @@
+/*
+ *  formatphylip.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 1/13/10.
+ *  Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "formatphylip.h"
+#include "progress.hpp"
+
+/***********************************************************************/
+FormatPhylipMatrix::FormatPhylipMatrix(string df) : filename(df) {
+        openInputFile(filename, fileHandle);
+}
+/***********************************************************************/
+//not using nameMap
+void FormatPhylipMatrix::read(NameAssignment* nameMap){
+       try {
+        
+                       float distance;
+                       int square, nseqs;
+                       string name;
+                       ofstream out;
+                       
+                       fileHandle >> nseqs >> name;
+                                               
+                       list = new ListVector(nseqs);
+                       list->set(0, name);
+                       
+                       char d;
+                       while((d=fileHandle.get()) != EOF){
+                
+                               if(isalnum(d)){  //you are square
+                                       square = 1;
+                                       fileHandle.close();  //reset file
+                                       
+                                       //open and get through numSeqs, code below formats rest of file
+                                       openInputFile(filename, fileHandle);
+                                       fileHandle >> nseqs; gobble(fileHandle);
+                                       
+                                       distFile = filename + ".rowFormatted";
+                                       openOutputFile(distFile, out);
+                                       break;
+                               }
+                               if(d == '\n'){
+                                       square = 0;
+                                       break;
+                               }
+                       }
+                       
+                       Progress* reading;
+                       reading = new Progress("Formatting matrix:     ", nseqs * nseqs);
+                       
+                       //lower triangle, so must go to column then formatted row file
+                       if(square == 0){
+                               int  index = 0;
+                               
+                               ofstream outTemp;
+                               string tempFile = filename + ".temp";
+                               openOutputFile(tempFile, outTemp);
+                
+                               //convert to square column matrix
+                               for(int i=1;i<nseqs;i++){
+                                       fileHandle >> name;
+                                       
+                                       list->set(i, name);
+                                       
+                                       for(int j=0;j<i;j++){
+                                               fileHandle >> distance;
+                                               
+                                               if (distance == -1) { distance = 1000000; }
+                                               
+                                               if(distance < cutoff){
+                                                       outTemp << i << '\t' << j << '\t' << distance << endl;
+                                                       outTemp << j << '\t' << i << '\t' << distance << endl;
+                                               }
+                                               index++;
+                                               reading->update(index);
+                                       }
+                               }
+                               outTemp.close();
+                               
+                               //format from square column to rowFormatted
+                               //sort file by first column so the distances for each row are together
+                               string outfile = getRootName(tempFile) + "sorted.dist.temp";
+                               
+                               //use the unix sort 
+                               #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+                                       string command = "sort -n " + tempFile + " -o " + outfile;
+                                       system(command.c_str());
+                               #else //sort using windows sort
+                                       string command = "sort " + tempFile + " /O " + outfile;
+                                       system(command.c_str());
+                               #endif
+                               
+
+                               //output to new file distance for each row and save positions in file where new row begins
+                               ifstream in;
+                               openInputFile(outfile, in);
+                               
+                               distFile = outfile + ".rowFormatted";
+                               openOutputFile(distFile, out);
+                               
+                               rowPos.resize(nseqs, -1);
+                               int currentRow;
+                               int first, second;
+                               float dist;
+                               map<int, float> rowMap;
+                               map<int, float>::iterator itRow;
+                               
+                               //get first currentRow
+                               in >> first;
+                               currentRow = first;
+                               
+                               string firstString = toString(first);
+                               for(int k = 0; k < firstString.length(); k++)  {   in.putback(firstString[k]);  }
+                               
+                               while(!in.eof()) {
+                                       in >> first >> second >> dist; gobble(in);
+                                       
+                                       if (first != currentRow) {
+                                               //save position in file of each new row
+                                               rowPos[currentRow] = out.tellp();
+                                               
+                                               out << currentRow << '\t' << rowMap.size() << '\t';
+                                               
+                                               for (itRow = rowMap.begin(); itRow != rowMap.end(); itRow++) {
+                                                       out << itRow->first << '\t' << itRow->second << '\t';
+                                               }
+                                               out << endl;
+                                               
+                                               currentRow = first;
+                                               rowMap.clear();
+                                               
+                                               //save row you just read
+                                               rowMap[second] = dist;
+                                               
+                                               index++;
+                                               reading->update(index);
+                                       }else{
+                                               rowMap[second] = dist;
+                                       }
+                               }
+                               
+                               //print last Row
+                               //save position in file of each new row
+                               rowPos[currentRow] = out.tellp();
+                               
+                               out << currentRow << '\t' << rowMap.size() << '\t';
+                               
+                               for (itRow = rowMap.begin(); itRow != rowMap.end(); itRow++) {
+                                       out << itRow->first << '\t' << itRow->second << '\t';
+                               }
+                               out << endl;
+                               
+                               in.close();
+                               out.close();
+                               
+                               remove(tempFile.c_str());
+                               remove(outfile.c_str());
+                       }
+                       else{ //square matrix convert directly to formatted row file
+                               int index = nseqs;
+                               map<int, float> rowMap;
+                               map<int, float>::iterator itRow;
+
+                
+                               for(int i=0;i<nseqs;i++){
+                                       fileHandle >> name;                
+                                                                               
+                                       list->set(i, name);
+                                       
+                                       for(int j=0;j<nseqs;j++){
+                                               fileHandle >> distance;
+                                               
+                                               if (distance == -1) { distance = 1000000; }
+                                               
+                                               if((distance < cutoff) && (j != i)){
+                                                       rowMap[j] = distance;
+                                               }
+                                               index++;
+                                               reading->update(index);
+                                       }
+                                       
+                                       //save position in file of each new row
+                                       rowPos[i] = out.tellp();
+
+                                       //output row to file
+                                       out << i << '\t' << rowMap.size() << '\t';
+                                       for (itRow = rowMap.begin(); itRow != rowMap.end(); itRow++) {
+                                               out << itRow->first << '\t' << itRow->second << '\t';
+                                       }
+                                       out << endl;
+                                       
+                                       //clear map for new row's info
+                                       rowMap.clear();
+                               }
+                       }
+                       reading->finish();
+                       delete reading;
+                       
+                       list->setLabel("0");
+                       fileHandle.close();
+                       out.close();
+                       
+       }
+       catch(exception& e) {
+               errorOut(e, "FormatPhylipMatrix", "read");
+                exit(1);
+       }
+}
+/***********************************************************************/
+FormatPhylipMatrix::~FormatPhylipMatrix(){}
+/***********************************************************************/
+
+
diff --git a/formatphylip.h b/formatphylip.h
new file mode 100644 (file)
index 0000000..9ca14bb
--- /dev/null
@@ -0,0 +1,31 @@
+#ifndef FORMATPHYLIP_H
+#define FORMATPHYLIP_H
+
+/*
+ *  formatphylip.h
+ *  Mothur
+ *
+ *  Created by westcott on 1/13/10.
+ *  Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "formatmatrix.h"
+
+/******************************************************/
+
+class FormatPhylipMatrix : public FormatMatrix {
+       
+public:
+       FormatPhylipMatrix(string);
+       ~FormatPhylipMatrix();
+       void read(NameAssignment*);
+private:
+       ifstream fileHandle;
+       string filename;
+};
+
+/******************************************************/
+
+#endif
+
index 41cc6b5eefca511b4137ecf635a841ca24159f1a..9667755e00343e6edccac6d5a607f246c0af3ba8 100644 (file)
@@ -8,6 +8,12 @@
  */
 
 #include "getoturepcommand.h"
+#include "readphylip.h"
+#include "readcolumn.h"
+#include "formatphylip.h"
+#include "formatcolumn.h"
+
+
 
 //********************************************************************************************************************
 //sorts lowest to highest
@@ -42,7 +48,7 @@ GetOTURepCommand::GetOTURepCommand(string option){
                        help(); abort = true;
                } else {
                        //valid paramters for this command
-                       string Array[] =  {"fasta","list","label","name", "group", "sorted", "phylip","column"};
+                       string Array[] =  {"fasta","list","label","name", "group", "sorted", "phylip","column","large","cutoff"};
                        vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
                        
                        OptionParser parser(option);
@@ -55,12 +61,6 @@ GetOTURepCommand::GetOTURepCommand(string option){
                                if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
                        }
                        
-                       //make sure the user has already run the read.otu command
-                       if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) {
-                               mothurOut("Before you use the get.oturep command, you first need to read in a distance matrix."); mothurOutEndLine(); 
-                               abort = true;
-                       }
-                       
                        //check for required parameters
                        fastafile = validParameter.validFile(parameters, "fasta", true);
                        if (fastafile == "not found") { mothurOut("fasta is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
@@ -73,10 +73,16 @@ GetOTURepCommand::GetOTURepCommand(string option){
                        phylipfile = validParameter.validFile(parameters, "phylip", true);
                        if (phylipfile == "not found") { phylipfile = "";  }
                        else if (phylipfile == "not open") { abort = true; }    
+                       else { distFile = phylipfile; format = "phylip";   }
                        
                        columnfile = validParameter.validFile(parameters, "column", true);
                        if (columnfile == "not found") { columnfile = ""; }
                        else if (columnfile == "not open") { abort = true; }    
+                       else { distFile = columnfile; format = "column";   }
+                       
+                       namefile = validParameter.validFile(parameters, "name", true);
+                       if (namefile == "not open") { abort = true; }   
+                       else if (namefile == "not found") { namefile = ""; }
                        
                        if ((phylipfile == "") && (columnfile == "")) { mothurOut("When executing a get.oturep command you must enter a phylip or a column."); mothurOutEndLine(); abort = true; }
                        else if ((phylipfile != "") && (columnfile != "")) { mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); mothurOutEndLine(); abort = true; }
@@ -86,22 +92,12 @@ GetOTURepCommand::GetOTURepCommand(string option){
                        //check for optional parameter and set defaults
                        // ...at some point should added some additional type checking...
                        label = validParameter.validFile(parameters, "label", false);                   
-                       if (label == "not found") { label = ""; }
+                       if (label == "not found") { label = ""; allLines = 1;  }
                        else { 
                                if(label != "all") {  splitAtDash(label, labels);  allLines = 0;  }
                                else { allLines = 1;  }
                        }
                        
-                       //if the user has not specified any labels use the ones from read.otu
-                       if (label == "") {  
-                               allLines = globaldata->allLines; 
-                               labels = globaldata->labels; 
-                       }
-                       
-                       namesfile = validParameter.validFile(parameters, "name", true);
-                       if (namesfile == "not open") { abort = true; }  
-                       else if (namesfile == "not found") { namesfile = ""; }
-
                        groupfile = validParameter.validFile(parameters, "group", true);
                        if (groupfile == "not open") { groupfile = ""; abort = true; }
                        else if (groupfile == "not found") { groupfile = ""; }
@@ -122,27 +118,15 @@ GetOTURepCommand::GetOTURepCommand(string option){
                                sorted = "";
                        }
                        
-                       if (abort == false) {
-
-                               //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
-                               if (globaldata->gListVector != NULL) {
-                                       vector<string> names;
-                                       string binnames;
-                                       //map names to rows in sparsematrix
-                                       for (int i = 0; i < globaldata->gListVector->size(); i++) {
-                                               names.clear();
-                                               binnames = globaldata->gListVector->get(i);
-       
-                                               splitAtComma(binnames, names);
-                               
-                                               for (int j = 0; j < names.size(); j++) {
-                                                       nameToIndex[names[j]] = i;
-                                               }
-                                       }
-                               } else { mothurOut("error, no listvector."); mothurOutEndLine(); }
-
-                               fasta = new FastaMap();
-                       }
+                       string temp = validParameter.validFile(parameters, "large", false);             if (temp == "not found") {      temp = "F";     }
+                       large = isTrue(temp);
+                       
+                       temp = validParameter.validFile(parameters, "precision", false);                        if (temp == "not found") { temp = "100"; }
+                       convert(temp, precision); 
+                       
+                       temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "10.0"; }
+                       convert(temp, cutoff); 
+                       cutoff += (5 / (precision * 10.0));
                }
        }
        catch(exception& e) {
@@ -156,13 +140,14 @@ GetOTURepCommand::GetOTURepCommand(string option){
 void GetOTURepCommand::help(){
        try {
                mothurOut("The get.oturep command can only be executed after a successful read.dist command.\n");
-               mothurOut("The get.oturep command parameters are list, fasta, name, group, sorted and label.  The fasta and list parameters are required.\n");
+               mothurOut("The get.oturep command parameters are list, fasta, name, group, large, sorted and label.  The fasta and list parameters are required.\n");
                mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
                mothurOut("The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
                mothurOut("Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups, name=amazon.names).\n");
                mothurOut("The default value for label is all labels in your inputfile.\n");
                mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
-               mothurOut("The get.oturep command outputs a .fastarep file for each distance you specify, selecting one OTU representative for each bin.\n");
+               mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
+               mothurOut("The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n");
                mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
                mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
        }
@@ -174,16 +159,7 @@ void GetOTURepCommand::help(){
 
 //**********************************************************************************************************************
 
-GetOTURepCommand::~GetOTURepCommand(){
-       if (abort == false) {
-               delete input;  globaldata->ginput = NULL;
-               delete read;
-               delete fasta;
-               if (groupfile != "") {
-                       delete groupMap;  globaldata->gGroupmap = NULL;
-               }
-       }
-}
+GetOTURepCommand::~GetOTURepCommand(){}
 
 //**********************************************************************************************************************
 
@@ -193,15 +169,93 @@ int GetOTURepCommand::execute(){
                if (abort == true) { return 0; }
                int error;
                
-               //read fastafile
-               fasta->readFastaFile(fastafile);
+               if (!large) {
+                       //read distance files
+                       if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }        
+                       else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
+                       else { mothurOut("File format error."); mothurOutEndLine(); return 0;  }
+                       
+                       readMatrix->setCutoff(cutoff);
+       
+                       if(namefile != ""){     
+                               nameMap = new NameAssignment(namefile);
+                               nameMap->readMap();
+                       }else{  nameMap = NULL;         }
+                       
+                       readMatrix->read(nameMap);
+
+                       //get matrix
+                       if (globaldata->gListVector != NULL) {  delete globaldata->gListVector;  }
+                       globaldata->gListVector = readMatrix->getListVector();
+
+                       SparseMatrix* matrix = readMatrix->getMatrix();
+                       
+                       // Create a data structure to quickly access the distance information.
+                       // It consists of a vector of distance maps, where each map contains
+                       // all distances of a certain sequence. Vector and maps are accessed
+                       // via the index of a sequence in the distance matrix
+                       seqVec = vector<SeqMap>(globaldata->gListVector->size()); 
+                       for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
+                               seqVec[currentCell->row][currentCell->column] = currentCell->dist;
+                       }
+                       
+                       delete matrix;
+                       delete readMatrix;
+               }else {
+                       //process file and set up indexes
+                       if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }    
+                       else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
+                       else { mothurOut("File format error."); mothurOutEndLine(); return 0;  }
+                       
+                       formatMatrix->setCutoff(cutoff);
+       
+                       if(namefile != ""){     
+                               nameMap = new NameAssignment(namefile);
+                               nameMap->readMap();
+                       }else{  nameMap = NULL;         }
+                       
+                       formatMatrix->read(nameMap);
+
+                       //get matrix
+                       if (globaldata->gListVector != NULL) {  delete globaldata->gListVector;  }
+                       globaldata->gListVector = formatMatrix->getListVector();
+                       
+                       distFile = formatMatrix->getFormattedFileName();
+                       
+                       //positions in file where the distances for each sequence begin
+                       //rowPositions[1] = position in file where distance related to sequence 1 start.
+                       rowPositions = formatMatrix->getRowPositions();
+                       
+                       delete formatMatrix;
+                       
+                       //openfile for getMap to use
+                       openInputFile(distFile, inRow);
+               }
                
-               //in.close();
-               //read distance file and generate indexed distance file that can be used by findrep function
-//....new reading class....//
+               //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
+               if (globaldata->gListVector != NULL) {
+                       vector<string> names;
+                       string binnames;
+                       //map names to rows in sparsematrix
+                       for (int i = 0; i < globaldata->gListVector->size(); i++) {
+                               names.clear();
+                               binnames = globaldata->gListVector->get(i);
+                               
+                               splitAtComma(binnames, names);
+                               
+                               for (int j = 0; j < names.size(); j++) {
+                                       nameToIndex[names[j]] = i;
+                               }
+                       }
+               } else { mothurOut("error, no listvector."); mothurOutEndLine(); }
                
+               fasta = new FastaMap();
+               
+               //read fastafile
+               fasta->readFastaFile(fastafile);
+                               
                //if user gave a namesfile then use it
-               if (namesfile != "") {  readNamesFile();        }
+               if (namefile != "") {   readNamesFile();        }
                
                //set format to list so input can get listvector
                globaldata->setFormat("list");
@@ -217,7 +271,7 @@ int GetOTURepCommand::execute(){
                //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
                set<string> processedLabels;
                set<string> userLabels = labels;
-               
+       
                while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
                        
                        if (allLines == 1 || labels.count(list->getLabel()) == 1){
@@ -273,9 +327,18 @@ int GetOTURepCommand::execute(){
                        if (error == 1) { return 0; } //there is an error in hte input files, abort command
                }
                
-               delete list;
+               //close and remove formatted matrix file
+               inRow.close();
+               remove(distFile.c_str());
+               
                globaldata->gListVector = NULL;
-
+               delete input;  globaldata->ginput = NULL;
+               delete read;
+               delete fasta;
+               if (groupfile != "") {
+                       delete groupMap;  globaldata->gGroupmap = NULL;
+               }
+               
                return 0;
        }
        catch(exception& e) {
@@ -288,7 +351,7 @@ int GetOTURepCommand::execute(){
 void GetOTURepCommand::readNamesFile() {
        try {
                vector<string> dupNames;
-               openInputFile(namesfile, inNames);
+               openInputFile(namefile, inNames);
                
                string name, names, sequence;
        
@@ -354,37 +417,58 @@ string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, i
                // the first sequence of the OTU is the representative one
                if ((names.size() == 1) || (list->getLabel() == "unique")) {
                        return names[0];
-               } else {
+               }else{
                        vector<int> seqIndex(names.size());
                        vector<float> max_dist(names.size());
+                       vector<float> total_dist(names.size());
 
                        //fill seqIndex and initialize sums
                        for (size_t i = 0; i < names.size(); i++) {
                                seqIndex[i] = nameToIndex[names[i]];
                                max_dist[i] = 0.0;
+                               total_dist[i] = 0.0;
                        }
                        
                        // loop through all entries in seqIndex
                        SeqMap::iterator it;
                        SeqMap currMap;
                        for (size_t i=0; i < seqIndex.size(); i++) {
-       //currMap = seqVec[seqIndex[i]];
+                       
+                               if (!large) {   currMap = seqVec[seqIndex[i]];  }
+                               else            {       currMap = getMap(seqIndex[i]);  }
+                               
                                for (size_t j=0; j < seqIndex.size(); j++) {
                                        it = currMap.find(seqIndex[j]);         
                                        if (it != currMap.end()) {
                                                max_dist[i] = max(max_dist[i], it->second);
                                                max_dist[j] = max(max_dist[j], it->second);
+                                               total_dist[i] += it->second;
+                                               total_dist[j] += it->second;
+                                       }else{ //if you can't find the distance make it the cutoff
+                                               max_dist[i] = max(max_dist[i], cutoff);
+                                               max_dist[j] = max(max_dist[j], cutoff);
+                                               total_dist[i] += cutoff;
+                                               total_dist[j] += cutoff;
                                        }
                                }
                        }
                        
                        // sequence with the smallest maximum distance is the representative
+                       //if tie occurs pick sequence with smallest average distance
                        float min = 10000;
                        int minIndex;
                        for (size_t i=0; i < max_dist.size(); i++) {
                                if (max_dist[i] < min) {
                                        min = max_dist[i];
                                        minIndex = i;
+                               }else if (max_dist[i] == min) {
+                                       float currentAverage = total_dist[minIndex] / (float) total_dist.size();
+                                       float newAverage = total_dist[i] / (float) total_dist.size();
+                                       
+                                       if (newAverage < currentAverage) {
+                                               min = max_dist[i];
+                                               minIndex = i;
+                                       }
                                }
                        }
 
@@ -406,12 +490,19 @@ int GetOTURepCommand::process(ListVector* processList) {
                string outputFileName = getRootName(listfile) + processList->getLabel() + ".rep.fasta";
                openOutputFile(outputFileName, out);
                vector<repStruct> reps;
-
+               
+               ofstream newNamesOutput;
+               string outputNamesFile = getRootName(listfile) + processList->getLabel() + ".rep.names";
+               openOutputFile(outputNamesFile, newNamesOutput);
+               
                //for each bin in the list vector
                for (int i = 0; i < processList->size(); i++) {
                        string groups;
                        int binsize;
                        nameRep = findRep(i, groups, processList, binsize);
+                       
+                       //output to new names file
+                       newNamesOutput << nameRep << '\t' << processList->get(i) << endl;
 
                        //print out name and sequence for that bin
                        sequence = fasta->getSequence(nameRep);
@@ -432,6 +523,7 @@ int GetOTURepCommand::process(ListVector* processList) {
                        }else { 
                                mothurOut(nameRep + " is missing from your fasta or name file. Please correct. "); mothurOutEndLine(); 
                                remove(outputFileName.c_str());
+                               remove(outputNamesFile.c_str());
                                return 1;
                        }
                }
@@ -456,6 +548,7 @@ int GetOTURepCommand::process(ListVector* processList) {
                }
 
                out.close();
+               newNamesOutput.close();
                return 0;
 
        }
@@ -466,3 +559,33 @@ int GetOTURepCommand::process(ListVector* processList) {
 }
 
 //**********************************************************************************************************************
+SeqMap GetOTURepCommand::getMap(int row) {
+       try {
+               SeqMap rowMap;
+               
+               //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
+               if (rowPositions[row] != -1){
+                       //go to row in file
+                       inRow.seekg(rowPositions[row]);
+                       
+                       int rowNum, numDists, colNum;
+                       float dist;
+                       
+                       inRow >> rowNum >> numDists;
+                       
+                       for(int i = 0; i < numDists; i++) {
+                               inRow >> colNum >> dist;
+                               rowMap[colNum] = dist;
+                               
+                       }
+               }
+               
+               return rowMap;
+       }
+       catch(exception& e) {
+               errorOut(e, "GetOTURepCommand", "getMap");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+
index 58fec8d4f3e3aaaf148a9ff8455c4fba93c6b35f..0e8a121793823677186d67e5d3cd255dd7ec47d1 100644 (file)
@@ -18,6 +18,8 @@
 #include "readotu.h"
 #include "fastamap.h"
 #include "groupmap.h"
+#include "readmatrix.hpp"
+#include "formatmatrix.h"
 
 typedef list<PCell>::iterator MatData;
 typedef map<int, float> SeqMap;
@@ -48,15 +50,24 @@ private:
        InputData* input;
        FastaMap* fasta;
        GroupMap* groupMap;
-       string filename, fastafile, listfile, namesfile, groupfile, label, sorted, phylipfile, columnfile, namefile;
+       ReadMatrix* readMatrix;
+       FormatMatrix* formatMatrix;
+       NameAssignment* nameMap;
+       string filename, fastafile, listfile, namefile, groupfile, label, sorted, phylipfile, columnfile, distFile, format;
        ofstream out;
-       ifstream in, inNames;
-       bool abort, allLines, groupError;
+       ifstream in, inNames, inRow;
+       bool abort, allLines, groupError, large;
        set<string> labels; //holds labels to be used
        map<string, int> nameToIndex;  //maps sequence name to index in sparsematrix
+       float cutoff;
+       int precision;
+       vector<SeqMap> seqVec;                  // contains maps with sequence index and distance
+                                                                       // for all distances related to a certain sequence
+       vector<int> rowPositions;
 
        void readNamesFile();
        int process(ListVector*);
+       SeqMap getMap(int);
        string findRep(int, string&, ListVector*, int&);        // returns the name of the "representative" sequence of given bin, 
                                                                        // fills a string containing the groups in that bin if a groupfile is given,
                                                                        // and returns the number of sequences in the given bin
index 8081f1090f3754e9904bdc838a2133d0baf58f25..f224b479d610b179511ec3bd2b88ca2afe2b210a 100644 (file)
@@ -357,7 +357,7 @@ void GetSharedOTUCommand::process(ListVector* shared) {
                                        }
                                        
                                        outFasta << seqs[k].getAligned() << endl;
-                               }
+                               }else {         mothurOut(seqs[k].getName() + " is not in your fasta file. Please correct."); mothurOutEndLine();       }
                        }
                        
                        outFasta.close();
index 7e81d7f1a3efe2535f1d9d11715bb2c517a508ec..1c134f543ef45e89dd97ef6b0462b8cd3a282d67 100644 (file)
@@ -206,7 +206,7 @@ int ReadDistCommand::execute(){
                        if (globaldata->gSparseMatrix != NULL) { delete globaldata->gSparseMatrix;  }
                        globaldata->gSparseMatrix = read->getMatrix();
                        numDists = globaldata->gSparseMatrix->getNNodes();
-       cout << "matrix contains " << numDists << " distances." << endl;
+       //cout << "matrix contains " << numDists << " distances." << endl;
                        
       int lines = cutoff / (1.0/precision);
       vector<float> dist_cutoff(lines+1,0);