]> git.donarmstrong.com Git - mothur.git/commitdiff
added warning about merging with something above cutoff to cluster. working on chimeras
authorwestcott <westcott>
Tue, 23 Feb 2010 19:31:30 +0000 (19:31 +0000)
committerwestcott <westcott>
Tue, 23 Feb 2010 19:31:30 +0000 (19:31 +0000)
23 files changed:
bellerophon.cpp
bellerophon.h
ccode.cpp
ccode.h
chimera.cpp
chimera.h
chimeracheckrdp.cpp
chimeracheckrdp.h
chimeraseqscommand.cpp
chimeraseqscommand.h
chimeraslayer.cpp
chimeraslayer.h
cluster.cpp
cluster.hpp
clustercommand.cpp
distancecommand.cpp
distancecommand.h
matrixoutputcommand.cpp
matrixoutputcommand.h
mgclustercommand.cpp
mgclustercommand.h
pintail.cpp
pintail.h

index afb88d2ea29e7b59e971fb6cee9c95f8ffd6e475..17859dc710309b7535e234aec56791bc7a8ee1b2 100644 (file)
@@ -27,7 +27,7 @@ Bellerophon::Bellerophon(string name, string o)  {
 }
 
 //***************************************************************************************************************
-void Bellerophon::print(ostream& out) {
+void Bellerophon::print(ostream& out, ostream& outAcc) {
        try {
                int above1 = 0;
                out << "Name\tScore\tLeft\tRight\t" << endl;
@@ -38,6 +38,7 @@ void Bellerophon::print(ostream& out) {
                        //calc # of seqs with preference above 1.0
                        if (pref[i].score[0] > 1.0) { 
                                above1++; 
+                               outAcc << pref[i].name << endl;
                                mothurOut(pref[i].name + " is a suspected chimera at breakpoint " + toString(pref[i].midpoint)); mothurOutEndLine();
                                mothurOut("It's score is " + toString(pref[i].score[0]) + " with suspected left parent " + pref[i].leftParent[0] + " and right parent " + pref[i].rightParent[0]); mothurOutEndLine();
                        }
index aa53f9852f20202ae77841fbf9262a9802356bbe..dd5b3da8035cf1528c07a2964f17b9b1aa051cab 100644 (file)
@@ -29,7 +29,7 @@ class Bellerophon : public Chimera {
                ~Bellerophon() {};
                
                int getChimeras();
-               void print(ostream&);
+               void print(ostream&, ostream&);
                
                void setCons(string){};
                void setQuantiles(string) {};
index 326af7cec61a88eb6ab859f203098c0ded3a9cfb..28e9723f6c19984b3e6cb9dbcfc869c0d75d6ffe 100644 (file)
--- a/ccode.cpp
+++ b/ccode.cpp
@@ -35,7 +35,7 @@ void Ccode::printHeader(ostream& out) {
        out << "For full window mapping info refer to " << mapInfo << endl << endl;
 }
 //***************************************************************************************************************
-void Ccode::print(ostream& out) {
+void Ccode::print(ostream& out, ostream& outAcc) {
        try {
                
                mothurOutEndLine();
@@ -112,6 +112,7 @@ void Ccode::print(ostream& out) {
                        
                if (results) {
                        mothurOut(querySeq->getName() + " was found have at least one chimeric window."); mothurOutEndLine();
+                       outAcc << querySeq->getName() << endl;
                }
 
                //free memory
diff --git a/ccode.h b/ccode.h
index c60b1302f9802af48e3d34f847ab67e8d425cf52..5d05cd6a85a06b622e011c28e16bd30959503e6b 100644 (file)
--- a/ccode.h
+++ b/ccode.h
@@ -28,7 +28,7 @@ class Ccode : public Chimera {
                ~Ccode();
                
                int getChimeras(Sequence* query);
-               void print(ostream&);
+               void print(ostream&, ostream&);
                void printHeader(ostream&);             
        private:
        
index 9ad287b68d8457e6dffd6e101bbca007c64745eb..85b0b38c8daab23e44c5c676ed1f38009afb7a62 100644 (file)
@@ -22,14 +22,14 @@ string Chimera::createFilter(vector<Sequence*> seqs, float t) {
                vector<int> t;          t.resize(seqs[0]->getAligned().length(), 0);
                vector<int> g;          g.resize(seqs[0]->getAligned().length(), 0);
                vector<int> c;          c.resize(seqs[0]->getAligned().length(), 0);
-               
+       
                filterString = (string(seqs[0]->getAligned().length(), '1'));
                
                //for each sequence
                for (int i = 0; i < seqs.size(); i++) {
                
                        string seqAligned = seqs[i]->getAligned();
-                       
+               
                        for (int j = 0; j < seqAligned.length(); j++) {
                                //if this spot is a gap
                                if ((seqAligned[j] == '-') || (seqAligned[j] == '.'))   {       gaps[j]++;      }
@@ -47,10 +47,8 @@ string Chimera::createFilter(vector<Sequence*> seqs, float t) {
                        if(gaps[i] == seqs.size())      {       filterString[i] = '0';  numColRemoved++;  }
                        
                        else if (((a[i] < threshold) && (t[i] < threshold) && (g[i] < threshold) && (c[i] < threshold))) {      filterString[i] = '0';  numColRemoved++;  }
-                       //cout << "a = " << a[i] <<  " t = " << t[i] <<  " g = " << g[i] <<  " c = " << c[i] << endl;
+                       cout << "a = " << a[i] <<  " t = " << t[i] <<  " g = " << g[i] <<  " c = " << c[i] << endl;
                }
-       
-//cout << "filter = " << filterString << endl; 
 
                mothurOut("Filter removed " + toString(numColRemoved) + " columns.");  mothurOutEndLine();
                return filterString;
@@ -95,7 +93,7 @@ vector<Sequence*> Chimera::readSeqs(string file) {
                openInputFile(file, in);
                vector<Sequence*> container;
                int count = 0;
-               int length = 0;
+               length = 0;
                unaligned = false;
                
                //read in seqs and store in vector
index da908ce07571a54148f561e28c16b94e8b09f68b..f893d63ed9c57e7d53d84843cb3b647cc9dd89d6 100644 (file)
--- a/chimera.h
+++ b/chimera.h
@@ -114,6 +114,7 @@ class Chimera {
                virtual void setTemplateSeqs(vector<Sequence*> t)       {       templateSeqs = t;       }
                virtual bool getUnaligned()                             {       return unaligned;                       }
                virtual void setTemplateFile(string t)  {   templateFileName = t;       }
+               virtual int getLength()                                 {   return length;      }
                
                virtual void setCons(string){};
                virtual void setQuantiles(string){};
@@ -127,14 +128,14 @@ class Chimera {
                virtual void printHeader(ostream&){};
                virtual int getChimeras(Sequence*){ return 0; }
                virtual int getChimeras(){ return 0; }
-               virtual void print(ostream&){}; 
+               virtual void print(ostream&, ostream&){};       
                
                
        protected:
                
                vector<Sequence*> templateSeqs;
                bool filter, correction, svg, unaligned;
-               int processors, window, increment, numWanted, kmerSize, match, misMatch, minSim, minCov, minBS, minSNP, parents, iters;
+               int processors, window, increment, numWanted, kmerSize, match, misMatch, minSim, minCov, minBS, minSNP, parents, iters, length;
                float divR;
                string seqMask, quanfile, filterString, name, outputDir, templateFileName;
                Sequence* getSequence(string);  //find sequence from name       
index 39ed4870fd1f6668839b3abf6299b537dc4c3ebf..0a2ea59e0257b9fb58be28e9ca0cb3e0356d749e 100644 (file)
@@ -24,7 +24,7 @@ ChimeraCheckRDP::~ChimeraCheckRDP() {
        }
 }      
 //***************************************************************************************************************
-void ChimeraCheckRDP::print(ostream& out) {
+void ChimeraCheckRDP::print(ostream& out, ostream& outAcc) {
        try {
                
                mothurOut("Processing: " + querySeq->getName()); mothurOutEndLine();
index e12e7d3d9d6cf3a08e44d453e7ea5afd2a34f06b..0ba851a9d6906b48a44b66d311882e0a88c8c400 100644 (file)
@@ -29,7 +29,7 @@ class ChimeraCheckRDP : public Chimera {
                ~ChimeraCheckRDP();
                
                int getChimeras(Sequence*);
-               void print(ostream&);
+               void print(ostream&, ostream&);
                void doPrep();
                
        private:
index 2ec6ea9c85e2d8034aafa6527fd53b6bf28a052d..d34149d1e74fb8738a89133fc9e343268ef9121f 100644 (file)
@@ -311,15 +311,18 @@ int ChimeraSeqsCommand::execute(){
                chimera->setIters(iters);
                chimera->setTemplateFile(templatefile);
 
-               
+               string outputFileName = outputDir + getRootName(getSimpleName(fastafile)) + method + maskfile + ".chimeras";
+               string accnosFileName = outputDir + getRootName(getSimpleName(fastafile)) + method + maskfile + ".accnos";
+
                
                vector<Sequence*> templateSeqs;
                if ((method != "bellerophon") && (method != "chimeracheck")) {   
                        templateSeqs = chimera->readSeqs(templatefile);   
                        if (chimera->getUnaligned()) { 
-                               mothurOut("Your sequences need to be aligned when you use the chimeraslayer method."); mothurOutEndLine(); 
+                               mothurOut("Your template sequences are different lengths, please correct."); mothurOutEndLine(); 
                                //free memory
                                for (int i = 0; i < templateSeqs.size(); i++)           {  delete templateSeqs[i];              }
+                               delete chimera;
                                return 0; 
                        }
                        
@@ -329,18 +332,24 @@ int ChimeraSeqsCommand::execute(){
                }else if (method == "bellerophon") {//run bellerophon separately since you need to read entire fastafile to run it
                        chimera->getChimeras();
                        
-                       string outputFName = outputDir + getRootName(getSimpleName(fastafile)) + method + maskfile + ".chimeras";
                        ofstream out;
-                       openOutputFile(outputFName, out);
+                       openOutputFile(outputFileName, out);
                        
-                       chimera->print(out);
+                       ofstream out2;
+                       openOutputFile(accnosFileName, out2);
+                       
+                       chimera->print(out, out2);
                        out.close();
+                       out2.close(); 
+                       
                        return 0;
                }
                
                //some methods need to do prep work before processing the chimeras
                chimera->doPrep(); 
                
+               templateSeqsLength = chimera->getLength();
+               
                ofstream outHeader;
                string tempHeader = outputDir + getRootName(getSimpleName(fastafile)) + method + maskfile + ".chimeras.tempHeader";
                openOutputFile(tempHeader, outHeader);
@@ -348,7 +357,6 @@ int ChimeraSeqsCommand::execute(){
                chimera->printHeader(outHeader);
                outHeader.close();
                
-               string outputFileName = outputDir + getRootName(getSimpleName(fastafile)) + method + maskfile + ".chimeras";
 
                //break up file
                #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
@@ -360,7 +368,7 @@ int ChimeraSeqsCommand::execute(){
                                
                                lines.push_back(new linePair(0, numSeqs));
                                
-                               driver(lines[0], outputFileName, fastafile);
+                               driver(lines[0], outputFileName, fastafile, accnosFileName);
                                
                        }else{
                                vector<int> positions;
@@ -391,14 +399,19 @@ int ChimeraSeqsCommand::execute(){
                                }
                                
                                
-                               createProcesses(outputFileName, fastafile); 
-                               
+                               createProcesses(outputFileName, fastafile, accnosFileName); 
+                       
                                rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str());
-                                                               
+                               rename((accnosFileName + toString(processIDS[0]) + ".temp").c_str(), accnosFileName.c_str());
+                                       
                                //append alignment and report files
                                for(int i=1;i<processors;i++){
                                        appendOutputFiles((outputFileName + toString(processIDS[i]) + ".temp"), outputFileName);
                                        remove((outputFileName + toString(processIDS[i]) + ".temp").c_str());
+                                       
+                                       appendOutputFiles((accnosFileName + toString(processIDS[i]) + ".temp"), accnosFileName);
+                                       remove((accnosFileName + toString(processIDS[i]) + ".temp").c_str());
+
                                }
                        }
 
@@ -409,21 +422,23 @@ int ChimeraSeqsCommand::execute(){
                        inFASTA.close();
                        lines.push_back(new linePair(0, numSeqs));
                        
-                       driver(lines[0], outputFileName, fastafile);
+                       driver(lines[0], outputFileName, fastafile, accnosFileName);
                #endif
                
                //mothurOut("Output File Names: ");
                //if ((filter) && (method == "bellerophon")) { mothurOut(
                //if (outputDir == "") { fastafile = getRootName(fastafile) + "filter.fasta"; }
                //      else                             { fastafile = outputDir + getRootName(getSimpleName(fastafile)) + "filter.fasta"; }
-               
+       
                appendOutputFiles(tempHeader, outputFileName);
+       
                remove(outputFileName.c_str());
                rename(tempHeader.c_str(), outputFileName.c_str());
-
-               for (int i = 0; i < templateSeqs.size(); i++)           {   delete templateSeqs[i];     }
+       
+               for (int i = 0; i < templateSeqs.size(); i++)           {  delete templateSeqs[i];      } 
+               delete chimera;
                
-               if (method == "chimeracheck") { mothurOutEndLine(); mothurOut("This method does not determine if a sequence is chimeric, but allows you to make that determination based on the IS values."); mothurOutEndLine();  }
+               if (method == "chimeracheck") { remove(accnosFileName.c_str());  mothurOutEndLine(); mothurOut("This method does not determine if a sequence is chimeric, but allows you to make that determination based on the IS values."); mothurOutEndLine();  }
                
                mothurOutEndLine(); mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences.");       mothurOutEndLine();
                
@@ -436,11 +451,14 @@ int ChimeraSeqsCommand::execute(){
        }
 }//**********************************************************************************************************************
 
-int ChimeraSeqsCommand::driver(linePair* line, string outputFName, string filename){
+int ChimeraSeqsCommand::driver(linePair* line, string outputFName, string filename, string accnos){
        try {
                ofstream out;
                openOutputFile(outputFName, out);
                
+               ofstream out2;
+               openOutputFile(accnos, out2);
+               
                ifstream inFASTA;
                openInputFile(filename, inFASTA);
 
@@ -451,12 +469,16 @@ int ChimeraSeqsCommand::driver(linePair* line, string outputFName, string filena
                        Sequence* candidateSeq = new Sequence(inFASTA);  gobble(inFASTA);
                                
                        if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
-                                       
-                               //find chimeras
-                               chimera->getChimeras(candidateSeq);
+                               
+                               if ((candidateSeq->getAligned().length() != templateSeqsLength) && (method != "chimeracheck")) {  //chimeracheck does not require seqs to be aligned
+                                       mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); mothurOutEndLine();
+                               }else{
+                                       //find chimeras
+                                       chimera->getChimeras(candidateSeq);
                
-                               //print results
-                               chimera->print(out);
+                                       //print results
+                                       chimera->print(out, out2);
+                               }
                        }
                        delete candidateSeq;
                        
@@ -467,6 +489,7 @@ int ChimeraSeqsCommand::driver(linePair* line, string outputFName, string filena
                if((line->numSeqs) % 100 != 0){ mothurOut("Processing sequence: " + toString(line->numSeqs)); mothurOutEndLine();               }
                
                out.close();
+               out2.close();
                inFASTA.close();
                                
                return 1;
@@ -479,7 +502,7 @@ int ChimeraSeqsCommand::driver(linePair* line, string outputFName, string filena
 
 /**************************************************************************************************/
 
-void ChimeraSeqsCommand::createProcesses(string outputFileName, string filename) {
+void ChimeraSeqsCommand::createProcesses(string outputFileName, string filename, string accnos) {
        try {
 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
                int process = 0;
@@ -493,7 +516,7 @@ void ChimeraSeqsCommand::createProcesses(string outputFileName, string filename)
                                processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
                                process++;
                        }else if (pid == 0){
-                               driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename);
+                               driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename, accnos + toString(getpid()) + ".temp");
                                exit(0);
                        }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
                }
@@ -520,7 +543,7 @@ void ChimeraSeqsCommand::appendOutputFiles(string temp, string filename) {
                ifstream input;
                
                openOutputFileAppend(temp, output);
-               openInputFile(filename, input);
+               openInputFile(filename, input, "noerror");
                
                while(char c = input.get()){
                        if(input.eof())         {       break;                  }
index 3e437ec33c5382b57b2364570a4a4caa094c97ea..56477a11c5249b88ded4acd2c87ac547e8bc9754 100644 (file)
@@ -35,14 +35,14 @@ private:
        vector<int> processIDS;   //processid
        vector<linePair*> lines;
        
-       int driver(linePair*, string, string);
-       void createProcesses(string, string);
+       int driver(linePair*, string, string, string);
+       void createProcesses(string, string, string);
        void appendOutputFiles(string, string); 
 
        bool abort;
        string method, fastafile, templatefile, consfile, quanfile, maskfile, namefile, outputDir, search;
        bool filter, correction, svg, printAll, realign;
-       int processors, midpoint, averageLeft, averageRight, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity, minCoverage, minBS, minSNP, numSeqs;
+       int processors, midpoint, averageLeft, averageRight, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity, minCoverage, minBS, minSNP, numSeqs, templateSeqsLength;
        float divR;
        Chimera* chimera;
        
index 769032856bb5ca0801b21b3457ea5fa36bc90a20..6ca8c2945bb4600e6c7f518b1d7c4dfa1a529225 100644 (file)
@@ -118,7 +118,7 @@ void ChimeraSlayer::printHeader(ostream& out) {
        out << "Name\tLeftParent\tRightParent\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n";
 }
 //***************************************************************************************************************
-void ChimeraSlayer::print(ostream& out) {
+void ChimeraSlayer::print(ostream& out, ostream& outAcc) {
        try {
                if (chimeraFlags == "yes") {
                        string chimeraFlag = "no";
@@ -130,6 +130,7 @@ void ChimeraSlayer::print(ostream& out) {
                        if (chimeraFlag == "yes") {     
                                if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) {
                                        mothurOut(querySeq->getName() + "\tyes"); mothurOutEndLine();
+                                       outAcc << querySeq->getName() << endl;
                                }
                        }
                        
index bf9cf7f8166884e1dda08f0de7a1989bec961c65..14a22beed74f0d0fe6dbea9e5d255117d3187008 100644 (file)
@@ -27,7 +27,7 @@ class ChimeraSlayer : public Chimera {
                ~ChimeraSlayer();
                
                int getChimeras(Sequence*);
-               void print(ostream&);
+               void print(ostream&, ostream&);
                void printHeader(ostream&);
                void doPrep();
                
index 931b44d70288c120cf151487f88580e07207cc68..4009ed075e63c78e4fb661352c675cff3209e030 100644 (file)
@@ -15,7 +15,7 @@
 /***********************************************************************/
 
 Cluster::Cluster(RAbundVector* rav, ListVector* lv, SparseMatrix* dm, float c) :
-rabund(rav), list(lv), dMatrix(dm), cutoff(c)
+rabund(rav), list(lv), dMatrix(dm)
 {
 /*
        cout << "sizeof(MatData): " << sizeof(MatData) << endl;
@@ -56,6 +56,9 @@ rabund(rav), list(lv), dMatrix(dm), cutoff(c)
                seqVec[currentCell->column].push_back(currentCell);
        }
        mapWanted = false;  //set to true by mgcluster to speed up overlap merge
+       
+       //save so you can modify as it changes in average neighbor
+       cutoff = c;
 }
 
 /***********************************************************************/
@@ -170,11 +173,12 @@ void Cluster::clusterNames(){
 //This function clusters based on the method of the derived class
 //At the moment only average and complete linkage are covered, because
 //single linkage uses a different approach.
-void Cluster::update(){
+void Cluster::update(double& cutOFF){
        try {
                getRowColCells();       
        
-               vector<int> found(nColCells, 0);
+               vector<int> foundCol(nColCells, 0);
+
                int search;
                bool changed;
 
@@ -187,11 +191,13 @@ void Cluster::update(){
                                } else {
                                        search = rowCells[i]->row;
                                }
-               
+                               
+                               bool merged = false;
                                for (int j=0;j<nColCells;j++) {
-                                       if (!((colCells[j]->row == smallRow) && (colCells[j]->column == smallCol))) {
+                                       if (!((colCells[j]->row == smallRow) && (colCells[j]->column == smallCol))) { //if you are not hte smallest distance
                                                if (colCells[j]->row == search || colCells[j]->column == search) {
-                                                       found[j] = 1;
+                                                       foundCol[j] = 1;
+                                                       merged = true;
                                                        changed = updateDistance(colCells[j], rowCells[i]);
                                                        // If the cell's distance changed and it had the same distance as 
                                                        // the smallest distance, invalidate the mins vector in SparseMatrix
@@ -203,9 +209,16 @@ void Cluster::update(){
                                                        }
                                                        break;
                                                }
-                                       }
+                                       }               
                                }
-                               removeCell(rowCells[i], i , -1);
+                               //if not merged it you need it for warning 
+                               if (!merged) {  
+                                       mothurOut("Warning: trying to merge cell " + toString(rowCells[i]->row+1) + " " + toString(rowCells[i]->column+1) + " distance " + toString(rowCells[i]->dist) + " with value above cutoff. Results may vary from using cutoff at cluster command instead of read.dist."); mothurOutEndLine(); 
+                                       if (cutOFF > rowCells[i]->dist) {  cutOFF = rowCells[i]->dist;  mothurOut("changing cutoff to " + toString(cutOFF));  mothurOutEndLine(); }
+
+                               }
+                               removeCell(rowCells[i], i , -1);  
+                               
                        }
                }
                clusterBins();
@@ -214,12 +227,12 @@ void Cluster::update(){
                // Special handling for singlelinkage case, not sure whether this
                // could be avoided
                for (int i=nColCells-1;i>=0;i--) {
-                       if (found[i] == 0) {
-                               removeCell(colCells[i], -1, i);
-cout << "smallRow = " << smallRow+1 << " smallCol = " << smallCol+1 << endl;
+                       if (foundCol[i] == 0) {
                                if (!((colCells[i]->row == smallRow) && (colCells[i]->column == smallCol))) {
-                                       mothurOut("Warning: merging " + toString(colCells[i]->row+1)  + " " + toString(colCells[i]->column+1) + " distance " + toString(colCells[i]->dist) + " value above cutoff. Results will differ from those if cutoff was used in the read.dist command."); mothurOutEndLine();
+                                       mothurOut("Warning: merging cell " + toString(colCells[i]->row+1) + " " + toString(colCells[i]->column+1) + " distance " + toString(colCells[i]->dist) + " value above cutoff. Results may vary from using cutoff at cluster command instead of read.dist."); mothurOutEndLine();
+                                       if (cutOFF > colCells[i]->dist) {  cutOFF = colCells[i]->dist;  mothurOut("changing cutoff to " + toString(cutOFF));  mothurOutEndLine(); }
                                }
+                               removeCell(colCells[i], -1, i);
                        }
                }
        }
index 38897c9cf0a2a2c0c95933eb7cabc8732f95839e..685c843c0199b73083f7eb3a17693b33fe1a4d36 100644 (file)
@@ -14,7 +14,7 @@ class Cluster {
        
 public:
        Cluster(RAbundVector*, ListVector*, SparseMatrix*, float);
-    virtual void update();                             
+    virtual void update(double&);                              
        virtual string getTag() = 0;
        virtual void setMapWanted(bool m);  
        virtual map<string, int> getSeqtoBin()  {  return seq2Bin;      }
index 9d09d2cbe465da6531bbf720f0fd7d89aec08bce..be7cc51d9f4ecf2765cd878a932b3a804e4bf1ba 100644 (file)
@@ -158,7 +158,7 @@ int ClusterCommand::execute(){
 
                        loops++;
 
-                       cluster->update();
+                       cluster->update(cutoff);
                        float dist = matrix->getSmallDist();
                        float rndDist = roundDist(dist, precision);
 
index 5364cbfee12cb3bc1813d0112beccb96126528cf..c615b8068ae1576b07f2e64a2742bfcb36e3e9b3 100644 (file)
@@ -26,7 +26,7 @@ DistanceCommand::DistanceCommand(string option){
                
                else {
                        //valid paramters for this command
-                       string Array[] =  {"fasta", "phylip", "calc", "countends", "cutoff", "processors", "outputdir","inputdir"};
+                       string Array[] =  {"fasta", "output", "calc", "countends", "cutoff", "processors", "outputdir","inputdir"};
                        vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
                        
                        OptionParser parser(option);
@@ -90,8 +90,9 @@ DistanceCommand::DistanceCommand(string option){
                        temp = validParameter.validFile(parameters, "processors", false);       if(temp == "not found"){        temp = "1"; }
                        convert(temp, processors); 
                        
-                       phylip = validParameter.validFile(parameters, "phylip", false);         if(phylip == "not found"){      phylip = "F"; }
-       
+                       output = validParameter.validFile(parameters, "output", false);         if(output == "not found"){      output = "column"; }
+                       
+                       if ((output != "column") && (output != "lt") && (output != "square")) { mothurOut(output + " is not a valid output form. Options are column, lt and square. I will use column."); mothurOutEndLine(); output = "column"; }
                        
                        ValidCalculators validCalculator;
                        
@@ -137,11 +138,12 @@ DistanceCommand::~DistanceCommand(){
 void DistanceCommand::help(){
        try {
                mothurOut("The dist.seqs command reads a file containing sequences and creates a distance file.\n");
-               mothurOut("The dist.seqs command parameters are fasta, calc, countends, cutoff and processors.  \n");
+               mothurOut("The dist.seqs command parameters are fasta, calc, countends, output, cutoff and processors.  \n");
                mothurOut("The fasta parameter is required.\n");
                mothurOut("The calc parameter allows you to specify the method of calculating the distances.  Your options are: nogaps, onegap or eachgap. The default is onegap.\n");
                mothurOut("The countends parameter allows you to specify whether to include terminal gaps in distance.  Your options are: T or F. The default is T.\n");
                mothurOut("The cutoff parameter allows you to specify maximum distance to keep. The default is 1.0.\n");
+               mothurOut("The output parameter allows you to specify format of your distance matrix. Options are column, lt, and square. The default is column.\n");
                mothurOut("The processors parameter allows you to specify number of processors to use.  The default is 1.\n");
                mothurOut("The dist.seqs command should be in the following format: \n");
                mothurOut("dist.seqs(fasta=yourFastaFile, calc=yourCalc, countends=yourEnds, cutoff= yourCutOff, processors=yourProcessors) \n");
@@ -165,15 +167,17 @@ int DistanceCommand::execute(){
                
                string outputFile;
                
-               //doses the user want the phylip formatted file as well
-               if (isTrue(phylip) == true) {
+               if (output == "lt") { //does the user want lower triangle phylip formatted file 
                        outputFile = outputDir + getRootName(getSimpleName(fastafile)) + "phylip.dist";
                        remove(outputFile.c_str());
                        
                        //output numSeqs to phylip formatted dist file
-               }else { //user wants column format
+               }else if (output == "column") { //user wants column format
                        outputFile = outputDir + getRootName(getSimpleName(fastafile)) + "dist";
                        remove(outputFile.c_str());
+               }else { //assume square
+                       outputFile = outputDir + getRootName(getSimpleName(fastafile)) + "square.dist";
+                       remove(outputFile.c_str());
                }
                                
 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
@@ -205,6 +209,8 @@ int DistanceCommand::execute(){
                driver(0, numSeqs, outputFile, cutoff);
 #endif
                
+               if (output == "square") {  convertMatrix(outputFile); }
+               
                delete distCalculator;
                
                return 0;
@@ -260,10 +266,10 @@ int DistanceCommand::driver(int startLine, int endLine, string dFileName, float
                outFile.setf(ios::fixed, ios::showpoint);
                outFile << setprecision(4);
                
-               if(isTrue(phylip) && startLine == 0){   outFile << alignDB.getNumSeqs() << endl;        }
+               if((output == "lt") && startLine == 0){ outFile << alignDB.getNumSeqs() << endl;        }
                
                for(int i=startLine;i<endLine;i++){
-                       if(isTrue(phylip))      {       
+                       if(output == "lt")      {       
                                string name = alignDB.get(i).getName();
                                if (name.length() < 10) { //pad with spaces to make compatible
                                        while (name.length() < 10) {  name += " ";  }
@@ -275,13 +281,18 @@ int DistanceCommand::driver(int startLine, int endLine, string dFileName, float
                                double dist = distCalculator->getDist();
                                
                                if(dist <= cutoff){
-                                       if (!isTrue(phylip)) { outFile << alignDB.get(i).getName() << ' ' << alignDB.get(j).getName() << ' ' << dist << endl; }
+                                       if (output == "column") { outFile << alignDB.get(i).getName() << ' ' << alignDB.get(j).getName() << ' ' << dist << endl; }
                                }
-                               if (isTrue(phylip)) {  outFile << dist << '\t'; }
+                               if (output == "lt") {  outFile << dist << '\t'; }
                                
+                               if (output == "square") { //make a square column you can convert to square phylip
+                                       outFile << alignDB.get(i).getName() << '\t' << alignDB.get(j).getName() << '\t' << dist << endl;
+                                       outFile << alignDB.get(j).getName() << '\t' << alignDB.get(i).getName() << '\t' << dist << endl;
+                               }
+
                        }
                        
-                       if (isTrue(phylip) == true) { outFile << endl; }
+                       if (output == "lt") { outFile << endl; }
                        
                        if(i % 100 == 0){
                                mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); mothurOutEndLine();
@@ -299,7 +310,90 @@ int DistanceCommand::driver(int startLine, int endLine, string dFileName, float
                exit(1);
        }
 }
+/**************************************************************************************************/
+void DistanceCommand::convertMatrix(string outputFile) {
+       try{
 
+               //sort file by first column so the distances for each row are together
+               string outfile = getRootName(outputFile) + "sorted.dist.temp";
+               
+               //use the unix sort 
+               #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+                       string command = "sort -n " + outputFile + " -o " + outfile;
+                       system(command.c_str());
+               #else //sort using windows sort
+                       string command = "sort " + outputFile + " /O " + outfile;
+                       system(command.c_str());
+               #endif
+               
+
+               //output to new file distance for each row and save positions in file where new row begins
+               ifstream in;
+               openInputFile(outfile, in);
+               
+               ofstream out;
+               openOutputFile(outputFile, out);
+               
+               out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
+
+               out << alignDB.getNumSeqs() << endl;
+               
+               //get first currentRow
+               string first, currentRow, second;
+               float dist;
+               map<string, float> rowDists; //take advantage of the fact that maps are already sorted by key 
+               map<string, float>::iterator it;
+               
+               in >> first;
+               currentRow = first;
+               
+               rowDists[first] = 0.00; //distance to yourself is 0.0
+               
+               in.seekg(0);
+               //openInputFile(outfile, in);
+               
+               while(!in.eof()) {
+                       in >> first >> second >> dist; gobble(in);
+                               
+                       if (first != currentRow) {
+                               //print out last row
+                               out << currentRow << '\t'; //print name
+
+                               //print dists
+                               for (it = rowDists.begin(); it != rowDists.end(); it++) {
+                                       out << it->second << '\t';
+                               }
+                               out << endl;
+                               
+                               //start new row
+                               currentRow = first;
+                               rowDists.clear();
+                               rowDists[first] = 0.00;
+                               rowDists[second] = dist;
+                       }else{
+                               rowDists[second] = dist;
+                       }
+               }
+               //print out last row
+               out << currentRow << '\t'; //print name
+                               
+               //print dists
+               for (it = rowDists.begin(); it != rowDists.end(); it++) {
+                       out << it->second << '\t';
+               }
+               out << endl;
+               
+               in.close();
+               out.close();
+               
+               remove(outfile.c_str());
+               
+       }
+       catch(exception& e) {
+               errorOut(e, "DistanceCommand", "convertMatrix");
+               exit(1);
+       }
+}
 /**************************************************************************************************
 void DistanceCommand::appendFiles(string temp, string filename) {
        try{
@@ -324,3 +418,5 @@ void DistanceCommand::appendFiles(string temp, string filename) {
        }
 }
 /**************************************************************************************************/
+
+
index c7aaf46137a97715baee381d56824482a0e08e80..7521ac70bdf6cf323b6442577ec7b1e609d6ccd8 100644 (file)
@@ -34,7 +34,7 @@ private:
        Dist* distCalculator;
        SequenceDB alignDB;
 
-       string countends, phylip, fastafile, calc, outputDir;
+       string countends, output, fastafile, calc, outputDir;
        int processors;
        float cutoff;
        map<int, int> processIDS;   //end line, processid
@@ -46,6 +46,7 @@ private:
        //void appendFiles(string, string);
        void createProcesses(string);
        int driver(/*Dist*, SequenceDB, */int, int, string, float);
+       void convertMatrix(string);
 
 };
 
index fb74875159a0aabe98e8ea79a540802d25ca2f21..87e6489a5e4c79010b8d57df7511117e028cc312 100644 (file)
@@ -36,7 +36,7 @@ MatrixOutputCommand::MatrixOutputCommand(string option){
                
                else {
                        //valid paramters for this command
-                       string Array[] =  {"label","calc","groups","outputdir","inputdir"};
+                       string Array[] =  {"label","calc","groups","outputdir","inputdir", "output"};
                        vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
                        
                        OptionParser parser(option);
@@ -70,6 +70,9 @@ MatrixOutputCommand::MatrixOutputCommand(string option){
                                else { allLines = 1;  }
                        }
                        
+                       output = validParameter.validFile(parameters, "output", false);         if(output == "not found"){      output = "lt"; }
+                       if ((output != "lt") && (output != "square")) { mothurOut(output + " is not a valid output form. Options are lt and square. I will use lt."); mothurOutEndLine(); output = "lt"; }
+                       
                        //if the user has not specified any labels use the ones from read.otu
                        if (label == "") {  
                                allLines = globaldata->allLines; 
@@ -136,10 +139,11 @@ MatrixOutputCommand::MatrixOutputCommand(string option){
 void MatrixOutputCommand::help(){
        try {
                mothurOut("The dist.shared command can only be executed after a successful read.otu command.\n");
-               mothurOut("The dist.shared command parameters are groups, calc and label.  No parameters are required.\n");
+               mothurOut("The dist.shared command parameters are groups, calc, output and label.  No parameters are required.\n");
                mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like included used.\n");
                mothurOut("The group names are separated by dashes. The label parameter allows you to select what distance levels you would like distance matrices created for, and is also separated by dashes.\n");
                mothurOut("The dist.shared command should be in the following format: dist.shared(groups=yourGroups, calc=yourCalcs, label=yourLabels).\n");
+               mothurOut("The output parameter allows you to specify format of your distance matrix. Options are lt, and square. The default is lt.\n");
                mothurOut("Example dist.shared(groups=A-B-C, calc=jabund-sorabund).\n");
                mothurOut("The default value for groups is all the groups in your groupfile.\n");
                mothurOut("The default value for calc is jclass and thetayc.\n");
@@ -262,17 +266,28 @@ int MatrixOutputCommand::execute(){
 void MatrixOutputCommand::printSims(ostream& out) {
        try {
                
-               //output column headers
+               out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
+               
+               //output num seqs
                out << simMatrix.size() << endl;
                
-               for (int m = 0; m < simMatrix.size(); m++)      {
-                       out << lookup[m]->getGroup() << '\t';
-                       for (int n = 0; n < m; n++)     {
-                               out << simMatrix[m][n] << '\t'; 
+               if (output == "lt") {
+                       for (int m = 0; m < simMatrix.size(); m++)      {
+                               out << lookup[m]->getGroup() << '\t';
+                               for (int n = 0; n < m; n++)     {
+                                       out << simMatrix[m][n] << '\t'; 
+                               }
+                               out << endl;
+                       }
+               }else{
+                       for (int m = 0; m < simMatrix.size(); m++)      {
+                               out << lookup[m]->getGroup() << '\t';
+                               for (int n = 0; n < simMatrix[m].size(); n++)   {
+                                       out << simMatrix[m][n] << '\t'; 
+                               }
+                               out << endl;
                        }
-                       out << endl;
                }
-
        }
        catch(exception& e) {
                errorOut(e, "MatrixOutputCommand", "printSims");
@@ -315,7 +330,7 @@ void MatrixOutputCommand::process(vector<SharedRAbundVector*> thisLookup){
                                                }
                                        }
                                        
-                                       exportFileName = outputDir + getRootName(getSimpleName(globaldata->inputFileName)) + matrixCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + ".dist";
+                                       exportFileName = outputDir + getRootName(getSimpleName(globaldata->inputFileName)) + matrixCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + "." + output + ".dist";
                                        openOutputFile(exportFileName, out);
                                        printSims(out);
                                        out.close();
index 6c1ce895ac0459b2b1e0358814bc0e30d8bfe51c..ff7519ee00751dcad80ebdb9bc04e384f54cd45b 100644 (file)
@@ -42,7 +42,7 @@ private:
        InputData* input;
        ValidCalculators* validCalculator;
        vector<SharedRAbundVector*> lookup;
-       string exportFileName;
+       string exportFileName, output;
        int numGroups;
        ofstream out;
 
index 07c51479f35ff00db1694dcef2bc135df32976da..fba2959b5f11751ea714ed79d8c2fcdf0ac24bc4 100644 (file)
@@ -191,7 +191,7 @@ int MGClusterCommand::execute(){
                        //cluster using cluster classes
                        while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
                                
-                               cluster->update();
+                               cluster->update(cutoff);
                                float dist = distMatrix->getSmallDist();
                                float rndDist = roundDist(dist, precision);
                                
index 89b72fceb2ad6c57cf0cde5ae531bf985bf3468f..a1dac221b891a8551a36b7493d5bb7a603204cf5 100644 (file)
@@ -40,7 +40,8 @@ private:
        
        string blastfile, method, namefile, overlapFile, distFile, outputDir;
        ofstream sabundFile, rabundFile, listFile;
-       float cutoff, penalty;
+       double cutoff;
+       float penalty;
        int precision, length, precisionLength;
        bool abort, minWanted, hclusterWanted, merge;
        
index 46da9137a18c072bfc1033025fd2551ea92c728a..63a90148699985581c80febe11288a99d5cd87bf 100644 (file)
@@ -80,15 +80,9 @@ void Pintail::doPrep() {
                bool reRead = false;
                //create filter if needed for later
                if (filter) {
-                       reRead = true;
-                       
-                       if (seqMask != "") {
-                               //mask templates
-                               for (int i = 0; i < templateSeqs.size(); i++) {
-                                       decalc->runMask(templateSeqs[i]);
-                               }
-                       }
                        
+cout << "filter" << endl;                      
+                                               
                        //read in all query seqs
                        ifstream in; 
                        openInputFile(fastafile, in);
@@ -106,11 +100,20 @@ void Pintail::doPrep() {
                        //merge query seqs and template seqs
                        temp = templateSeqs;
                        for (int i = 0; i < tempQuerySeqs.size(); i++) {  temp.push_back(tempQuerySeqs[i]);  }
-                       
+cout << temp.size() << endl;   
+                       if (seqMask != "") {
+                           reRead = true;
+       cout << "masked "  << seqMask << endl;
+                               //mask templates
+                               for (int i = 0; i < temp.size(); i++) {
+                                       decalc->runMask(temp[i]);
+                               }
+                       }
+
                        mergedFilterString = createFilter(temp, 0.5);
-                       
+       cout << "here" << endl;         
                        //reread template seqs
-                       for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i];  }
+                       //for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i];  }
                }
                
                
@@ -128,6 +131,13 @@ void Pintail::doPrep() {
                                }
                        }
                        
+                       if (filter) { 
+                               reRead = true;
+                               for (int i = 0; i < templateSeqs.size(); i++) {
+                                       runFilter(templateSeqs[i]);
+                               }
+                       }
+                       
                        mothurOut("Calculating quantiles for your template.  This can take a while...  I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command.  Providing the .quan file will dramatically improve speed.    "); cout.flush();
                        if (processors == 1) { 
                                quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
@@ -139,19 +149,20 @@ void Pintail::doPrep() {
                        
                        if ((!filter) && (seqMask == "")) {
                                noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.quan";
-                       }else if ((filter) && (seqMask == "")) { 
-                               noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered.quan";
                        }else if ((!filter) && (seqMask != "")) { 
                                noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.masked.quan";
                        }else if ((filter) && (seqMask != "")) { 
-                               noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered.masked.quan";
+                               noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered." + getSimpleName(getRootName(fastafile)) + "masked.quan";
+                       }else if ((filter) && (seqMask == "")) { 
+                               noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered." + getSimpleName(getRootName(fastafile)) + "quan";
                        }
 
+
+
                                                
                        decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size());
                        
-                       openOutputFile(noOutliers, out5);
-                       
+                       openOutputFile(noOutliers, out5);                       
                        //adjust quantiles
                        for (int i = 0; i < quantilesMembers.size(); i++) {
                                vector<float> temp;
@@ -209,7 +220,7 @@ void Pintail::doPrep() {
        }
 }
 //***************************************************************************************************************
-void Pintail::print(ostream& out) {
+void Pintail::print(ostream& out, ostream& outAcc) {
        try {
                int index = ceil(deviation);
                
@@ -227,6 +238,7 @@ void Pintail::print(ostream& out) {
                out << querySeq->getName() << '\t' << "div: " << deviation << "\tstDev: " << DE << "\tchimera flag: " << chimera << endl;
                if (chimera == "Yes") {
                        mothurOut(querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera); mothurOutEndLine();
+                       outAcc << querySeq->getName() << endl;
                }
                out << "Observed\t";
                
index efb8ba6aa31afcd9c12bb3a85cfa21d2cd8310aa..2364c86c6bab6d779cb13131245d626244777895 100644 (file)
--- a/pintail.h
+++ b/pintail.h
@@ -28,7 +28,7 @@ class Pintail : public Chimera {
                ~Pintail();
                
                int getChimeras(Sequence*);
-               void print(ostream&);
+               void print(ostream&, ostream&);
                
                void setCons(string c)          { consfile = c;  }
                void setQuantiles(string q) { quanfile = q;  }
@@ -42,7 +42,7 @@ class Pintail : public Chimera {
                string fastafile, consfile;
                
                vector<linePair*> templateLines;
-               Sequence*querySeq;
+               Sequence* querySeq;
                                
                Sequence* bestfit;  //closest match to query in template