]> git.donarmstrong.com Git - mothur.git/commitdiff
addition of summary.seq command [pds]
authorpschloss <pschloss>
Wed, 3 Jun 2009 01:39:31 +0000 (01:39 +0000)
committerpschloss <pschloss>
Wed, 3 Jun 2009 01:39:31 +0000 (01:39 +0000)
15 files changed:
Mothur.xcodeproj/project.pbxproj
commandfactory.cpp
database.cpp
distancecommand.cpp
engine.cpp
fastamap.cpp
filterseqscommand.cpp
filterseqscommand.h
nastreport.cpp
seqsummarycommand.cpp [new file with mode: 0644]
seqsummarycommand.h [new file with mode: 0644]
sequence.cpp
sequence.hpp
validcommands.cpp
validparameter.cpp

index 6c867787cf4b3c7265c5eb999edfc203e9a9aa4d..245ed5535ecc449999a20ae363a09cd4d90b6037 100644 (file)
                7E412F490F8D21B600381DD0 /* slibshuff.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7E412F480F8D21B600381DD0 /* slibshuff.cpp */; };
                7E412FEA0F8D3E2C00381DD0 /* libshuff.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7E412FE90F8D3E2C00381DD0 /* libshuff.cpp */; };
                7E4130F80F8E58FA00381DD0 /* dlibshuff.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7E4130F60F8E58FA00381DD0 /* dlibshuff.cpp */; };
+               7E9214820FD2081200699BF3 /* seqsummarycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7E9214810FD2081200699BF3 /* seqsummarycommand.cpp */; };
                7EC3D4550FA0FFF900338DA5 /* coverage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7EC3D4500FA0FFF900338DA5 /* coverage.cpp */; };
                7EC3D4560FA0FFF900338DA5 /* whittaker.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7EC3D4530FA0FFF900338DA5 /* whittaker.cpp */; };
                8DD76F6A0486A84900D96B5E /* Mothur.1 in CopyFiles */ = {isa = PBXBuildFile; fileRef = C6859E8B029090EE04C91782 /* Mothur.1 */; };
                7E412FE90F8D3E2C00381DD0 /* libshuff.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = libshuff.cpp; sourceTree = SOURCE_ROOT; };
                7E4130F60F8E58FA00381DD0 /* dlibshuff.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = dlibshuff.cpp; sourceTree = SOURCE_ROOT; };
                7E4130F70F8E58FA00381DD0 /* dlibshuff.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = dlibshuff.h; sourceTree = SOURCE_ROOT; };
+               7E9214800FD2081200699BF3 /* seqsummarycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = seqsummarycommand.h; sourceTree = "<group>"; };
+               7E9214810FD2081200699BF3 /* seqsummarycommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = seqsummarycommand.cpp; sourceTree = "<group>"; };
                7EC3D4500FA0FFF900338DA5 /* coverage.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = coverage.cpp; sourceTree = SOURCE_ROOT; };
                7EC3D4510FA0FFF900338DA5 /* coverage.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = coverage.h; sourceTree = SOURCE_ROOT; };
                7EC3D4520FA0FFF900338DA5 /* sharedanderberg.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sharedanderberg.h; sourceTree = SOURCE_ROOT; };
                37D928A90F2133E5001D4494 /* commands */ = {
                        isa = PBXGroup;
                        children = (
+                               7E9214800FD2081200699BF3 /* seqsummarycommand.h */,
+                               7E9214810FD2081200699BF3 /* seqsummarycommand.cpp */,
                                37D927CD0F21331F001D4494 /* command.hpp */,
                                378DC5CD0FBDE1C8003B8607 /* aligncommand.h */,
                                378DC5CE0FBDE1C8003B8607 /* aligncommand.cpp */,
                                373C699C0FC1E63600137ACD /* solow.cpp in Sources */,
                                EB72FE260FC1F5CA0051AC11 /* shen.cpp in Sources */,
                                21E859D80FC4632E005E1A48 /* matrixoutputcommand.cpp in Sources */,
+                               7E9214820FD2081200699BF3 /* seqsummarycommand.cpp in Sources */,
                        );
                        runOnlyForDeploymentPostprocessing = 0;
                };
index 06e7e84f06602796d143cdb8d535e46006dc0fb1..0edb024b35084c16255ccc030f41efe271bf5826 100644 (file)
@@ -43,7 +43,7 @@
 #include "distancecommand.h"
 #include "aligncommand.h"
 #include "matrixoutputcommand.h"
-
+#include "seqsummarycommand.h"
 
 /***********************************************************/
 
@@ -97,6 +97,7 @@ Command* CommandFactory::getCommand(string commandName){
                else if(commandName == "concensus")                             {   command = new ConcensusCommand();                   }
                else if(commandName == "dist.seqs")                             {   command = new DistanceCommand();                    }
                else if(commandName == "align.seqs")                    {   command = new AlignCommand();                               }
+               else if(commandName == "summary.seqs")                  {       command = new SeqSummaryCommand();                      }
                else                                                                                    {       command = new NoCommand();                                      }
 
                return command;
index 8c080dbf95e79d6dc574e52d5679747018df00fa..5b93321a4c8b0f25915d479407f5152918d12f6a 100644 (file)
@@ -30,10 +30,10 @@ Database::Database(string fastaFileName){           //      This assumes that the template dat
        
        string seqName, sequence;
        for(int i=0;i<numSeqs;i++){
-               templateSequences[i] = new Sequence();          //      We're storing the aligned template sequences as a vector of
+//             templateSequences[i] = new Sequence();          //      We're storing the aligned template sequences as a vector of
                                                                                                        //      pointers to Sequence objects
                fastaFile >> seqName;
-               templateSequences[i]->setName(seqName);
+//             templateSequences[i]->setName(seqName);
                
                char letter;
                string aligned;
@@ -45,8 +45,9 @@ Database::Database(string fastaFileName){             //      This assumes that the template dat
                                aligned += letter;
                        }
                }
-               templateSequences[i]->setAligned(aligned);      //      Obviously, we need the fully aligned template sequence
-               templateSequences[i]->setUnaligned(aligned);//  We will also need the unaligned sequence for pairwise alignments
+               templateSequences[i] = new Sequence(seqName, aligned);
+//             templateSequences[i]->setAligned(aligned);      //      Obviously, we need the fully aligned template sequence
+//             templateSequences[i]->setUnaligned(aligned);//  We will also need the unaligned sequence for pairwise alignments
                fastaFile.putback(letter);                                      //      and database searches
        }
        
index 6fee651cb8963680ec294051e671cfc57b9c6135..1bc16f34bc231147a3610a894ad3ad04c2d49bed 100644 (file)
@@ -33,7 +33,7 @@ DistanceCommand::DistanceCommand(){
                                        }else if (globaldata->Estimators[i] == "eachgap") { 
                                                distCalculator = new eachGapDist();     
                                        }else if (globaldata->Estimators[i] == "onegap") {
-                                               distCalculator = new oneGapDist();                                      }
+                                       distCalculator = new oneGapDist();                                      }
                                }
                        }
                }else {
@@ -49,7 +49,7 @@ DistanceCommand::DistanceCommand(){
                                }
                        }
                }
-                               
+               
                //reset calc for next command
                globaldata->setCalc("");
        }
@@ -71,113 +71,190 @@ int DistanceCommand::execute(){
                string filename = globaldata->inputFileName;
                
                if(globaldata->getFastaFile() != "") {
-                       readSeqs =  new ReadFasta(filename); }
+               readSeqs =  new ReadFasta(filename); }
                else if(globaldata->getNexusFile() != "") {
-                       readSeqs = new ReadNexus(filename); }
+               readSeqs = new ReadNexus(filename); }
                else if(globaldata->getClustalFile() != "") {
-                       readSeqs = new ReadClustal(filename); }
+               readSeqs = new ReadClustal(filename); }
                else if(globaldata->getPhylipFile() != "") {
-                       readSeqs = new ReadPhylip(filename); }
-                       
+               readSeqs = new ReadPhylip(filename); }
+               
                readSeqs->read();
                seqDB = readSeqs->getDB();
-       
+               
                int numSeqs = seqDB->getNumSeqs();
                cutoff += 0.005;
-
+               
                string distFile = getRootName(globaldata->getFastaFile()) + "dist";
                
                remove(distFile.c_str());
                
                //#     if defined (_WIN32)
-                       //figure out how to implement the fork and wait commands in windows
+               //figure out how to implement the fork and wait commands in windows
                //      driver(distCalculator, seqDB, 0, numSeqs, distFile, cutoff);
                //#     endif
                
-               #       if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-                       if(processors == 1){
-                               driver(distCalculator, seqDB, 0, numSeqs, distFile, cutoff);    
-                       }       
-                       else if(processors == 2){
-               
-                               int pid = fork();
-                               if(pid > 0){
-                                       driver(distCalculator, seqDB, 0, (numSeqs/sqrt(2)), distFile + "tempa", cutoff);        
-                                       appendFiles((distFile+"tempa"), distFile);
+#if defined (__APPLE__) || (__MACH__)
+               if(processors == 1){
+                       driver(distCalculator, seqDB, 0, numSeqs, distFile, cutoff);    
+               }       
+               else if(processors == 2){
+                       
+                       int pid = fork();
+                       if(pid > 0){
+                               driver(distCalculator, seqDB, 0, (numSeqs/sqrt(2)), distFile + "tempa", cutoff);        
+                               appendFiles((distFile+"tempa"), distFile);
+                               remove((distFile + "tempa").c_str());   
+                       }
+                       else{
+                               driver(distCalculator, seqDB, (numSeqs/sqrt(2)), numSeqs, distFile + "tempb", cutoff);  
+                               appendFiles((distFile+"tempb"), distFile);
+                               remove((distFile + "tempb").c_str());   
+                       }
+                       wait(NULL);
+                       
+               }
+               else if(processors == 3){
+                       int pid1 = fork();
+                       if(pid1 > 0){
+                               int pid2 = fork();
+                               if(pid2 > 0){
+                                       driver(distCalculator, seqDB, 0, sqrt(3) * numSeqs / 3, distFile + "tempa", cutoff);
+                                       appendFiles(distFile+"tempa", distFile);
                                        remove((distFile + "tempa").c_str());   
                                }
                                else{
-                                       driver(distCalculator, seqDB, (numSeqs/sqrt(2)), numSeqs, distFile + "tempb", cutoff);  
-                                       appendFiles((distFile+"tempb"), distFile);
-                                       remove((distFile + "tempb").c_str());   
+                                       driver(distCalculator, seqDB, sqrt(3) * numSeqs / 3, sqrt(6) * numSeqs / 3, distFile + "tempb", cutoff);        
+                                       appendFiles(distFile+"tempb", distFile);
+                                       remove((distFile + "tempb").c_str());                           
                                }
                                wait(NULL);
-
                        }
-                       else if(processors == 3){
-                               int pid1 = fork();
-                               if(pid1 > 0){
-                                       int pid2 = fork();
-                                       if(pid2 > 0){
-                                               driver(distCalculator, seqDB, 0, sqrt(3) * numSeqs / 3, distFile + "tempa", cutoff);
-                                               appendFiles(distFile+"tempa", distFile);
-                                               remove((distFile + "tempa").c_str());   
-                                       }
-                                       else{
-                                               driver(distCalculator, seqDB, sqrt(3) * numSeqs / 3, sqrt(6) * numSeqs / 3, distFile + "tempb", cutoff);        
-                                               appendFiles(distFile+"tempb", distFile);
-                                               remove((distFile + "tempb").c_str());                           
-                                       }
-                                       wait(NULL);
+                       else{
+                               driver(distCalculator, seqDB, sqrt(6) * numSeqs / 3, numSeqs, distFile + "tempc", cutoff);      
+                               appendFiles(distFile+"tempc", distFile);
+                               remove((distFile + "tempc").c_str());                   
+                       }
+                       wait(NULL);
+               }
+               else if(processors == 4){
+                       int pid1 = fork();
+                       if(pid1 > 0){
+                               int pid2 = fork();
+                               if(pid2 > 0){
+                                       driver(distCalculator, seqDB, 0, numSeqs / 2, distFile + "tempa", cutoff);      
+                                       appendFiles(distFile+"tempa", distFile);
+                                       remove((distFile + "tempa").c_str());                   
                                }
                                else{
-                                       driver(distCalculator, seqDB, sqrt(6) * numSeqs / 3, numSeqs, distFile + "tempc", cutoff);      
-                                       appendFiles(distFile+"tempc", distFile);
-                                       remove((distFile + "tempc").c_str());                   
+                                       driver(distCalculator, seqDB, numSeqs / 2, (numSeqs/sqrt(2)), distFile + "tempb", cutoff);      
+                                       appendFiles(distFile+"tempb", distFile);
+                                       remove((distFile + "tempb").c_str());                           
                                }
                                wait(NULL);
                        }
-                       else if(processors == 4){
-                               int pid1 = fork();
-                               if(pid1 > 0){
-                                       int pid2 = fork();
-                                       if(pid2 > 0){
-                                               driver(distCalculator, seqDB, 0, numSeqs / 2, distFile + "tempa", cutoff);      
-                                               appendFiles(distFile+"tempa", distFile);
-                                               remove((distFile + "tempa").c_str());                   
-                                       }
-                                       else{
-                                               driver(distCalculator, seqDB, numSeqs / 2, (numSeqs/sqrt(2)), distFile + "tempb", cutoff);      
-                                               appendFiles(distFile+"tempb", distFile);
-                                               remove((distFile + "tempb").c_str());                           
-                                       }
-                                       wait(NULL);
+                       else{
+                               int pid3 = fork();
+                               if(pid3 > 0){
+                                       driver(distCalculator, seqDB, (numSeqs/sqrt(2)), (sqrt(3) * numSeqs / 2), distFile + "tempc", cutoff);  
+                                       appendFiles(distFile+"tempc", distFile);
+                                       remove((distFile + "tempc").c_str());                           
                                }
                                else{
-                                       int pid3 = fork();
-                                       if(pid3 > 0){
-                                               driver(distCalculator, seqDB, (numSeqs/sqrt(2)), (sqrt(3) * numSeqs / 2), distFile + "tempc", cutoff);  
-                                               appendFiles(distFile+"tempc", distFile);
-                                               remove((distFile + "tempc").c_str());                           
-                                       }
-                                       else{
-                                               driver(distCalculator, seqDB, (sqrt(3) * numSeqs / 2), numSeqs, distFile + "tempd", cutoff);    
-                                               appendFiles(distFile+"tempd", distFile);
-                                               remove((distFile + "tempd").c_str());                           
-                                       }
-                                       wait(NULL);
+                                       driver(distCalculator, seqDB, (sqrt(3) * numSeqs / 2), numSeqs, distFile + "tempd", cutoff);    
+                                       appendFiles(distFile+"tempd", distFile);
+                                       remove((distFile + "tempd").c_str());                           
                                }
                                wait(NULL);
                        }
                        wait(NULL);
-               #       else
-                       driver(distCalculator, seqDB, 0, numSeqs, distFile, cutoff);
-               #       endif
-       
+               }
+               wait(NULL);
+#elif (linux) || (__linux)
+               if(processors == 1){
+                       driver(distCalculator, seqDB, 0, numSeqs, distFile, cutoff);    
+               }       
+               else if(processors == 2){
+                       
+                       int pid = fork();
+                       if(pid > 0){
+                               driver(distCalculator, seqDB, 0, (numSeqs/sqrt(2)), distFile + "tempa", cutoff);        
+                               appendFiles((distFile+"tempa"), distFile);
+                               remove((distFile + "tempa").c_str());   
+                       }
+                       else{
+                               driver(distCalculator, seqDB, (numSeqs/sqrt(2)), numSeqs, distFile + "tempb", cutoff);  
+                               appendFiles((distFile+"tempb"), distFile);
+                               remove((distFile + "tempb").c_str());   
+                       }
+                       wait();
+                       
+               }
+               else if(processors == 3){
+                       int pid1 = fork();
+                       if(pid1 > 0){
+                               int pid2 = fork();
+                               if(pid2 > 0){
+                                       driver(distCalculator, seqDB, 0, sqrt(3) * numSeqs / 3, distFile + "tempa", cutoff);
+                                       appendFiles(distFile+"tempa", distFile);
+                                       remove((distFile + "tempa").c_str());   
+                               }
+                               else{
+                                       driver(distCalculator, seqDB, sqrt(3) * numSeqs / 3, sqrt(6) * numSeqs / 3, distFile + "tempb", cutoff);        
+                                       appendFiles(distFile+"tempb", distFile);
+                                       remove((distFile + "tempb").c_str());                           
+                               }
+                               wait();
+                       }
+                       else{
+                               driver(distCalculator, seqDB, sqrt(6) * numSeqs / 3, numSeqs, distFile + "tempc", cutoff);      
+                               appendFiles(distFile+"tempc", distFile);
+                               remove((distFile + "tempc").c_str());                   
+                       }
+                       wait();
+               }
+               else if(processors == 4){
+                       int pid1 = fork();
+                       if(pid1 > 0){
+                               int pid2 = fork();
+                               if(pid2 > 0){
+                                       driver(distCalculator, seqDB, 0, numSeqs / 2, distFile + "tempa", cutoff);      
+                                       appendFiles(distFile+"tempa", distFile);
+                                       remove((distFile + "tempa").c_str());                   
+                               }
+                               else{
+                                       driver(distCalculator, seqDB, numSeqs / 2, (numSeqs/sqrt(2)), distFile + "tempb", cutoff);      
+                                       appendFiles(distFile+"tempb", distFile);
+                                       remove((distFile + "tempb").c_str());                           
+                               }
+                               wait();
+                       }
+                       else{
+                               int pid3 = fork();
+                               if(pid3 > 0){
+                                       driver(distCalculator, seqDB, (numSeqs/sqrt(2)), (sqrt(3) * numSeqs / 2), distFile + "tempc", cutoff);  
+                                       appendFiles(distFile+"tempc", distFile);
+                                       remove((distFile + "tempc").c_str());                           
+                               }
+                               else{
+                                       driver(distCalculator, seqDB, (sqrt(3) * numSeqs / 2), numSeqs, distFile + "tempd", cutoff);    
+                                       appendFiles(distFile+"tempd", distFile);
+                                       remove((distFile + "tempd").c_str());                           
+                               }
+                               wait();
+                       }
+                       wait();
+               }
+               wait();
+               
+#else
+               driver(distCalculator, seqDB, 0, numSeqs, distFile, cutoff);
+#endif
+               
                delete distCalculator;
-       
+               
                return 0;
-
+               
        }
        catch(exception& e) {
                cout << "Standard Error: " << e.what() << " has occurred in the DistanceCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
@@ -194,29 +271,29 @@ int DistanceCommand::execute(){
 int DistanceCommand::driver(Dist* distCalculator, SequenceDB* align, int startLine, int endLine, string dFileName, float cutoff){
        try {
                int startTime = time(NULL);
-       
+               
                ofstream distFile(dFileName.c_str(), ios::trunc);
                distFile.setf(ios::fixed, ios::showpoint);
                distFile << setprecision(4);
-       
-               for(int i=startLine;i<endLine;i++){
                
+               for(int i=startLine;i<endLine;i++){
+                       
                        for(int j=0;j<i;j++){
                                distCalculator->calcDist(align->get(i), align->get(j));
                                double dist = distCalculator->getDist();
-
+                               
                                if(dist <= cutoff){
                                        distFile << align->get(i).getName() << ' ' << align->get(j).getName() << ' ' << dist << endl;
                                }
-                       
+                               
                        }
                        if(i % 100 == 0){
                                cout << i << '\t' << time(NULL) - startTime << endl;
                        }
-               
+                       
                }
                cout << endLine-1 << '\t' << time(NULL) - startTime << endl;
-       
+               
                return 1;
        }
        catch(exception& e) {
@@ -227,7 +304,7 @@ int DistanceCommand::driver(Dist* distCalculator, SequenceDB* align, int startLi
                cout << "An unknown error has occurred in the DistanceCommand class function driver. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
                exit(1);
        }       
-
+       
 }
 
 /**************************************************************************************************/
@@ -250,7 +327,7 @@ void DistanceCommand::appendFiles(string temp, string filename) {
                                output << line << endl;   // Appending back newline char 
                        }
                }       
-                       
+               
                input.close();
                output.close();
        }
index add84b9e944ceeaaee5cd4313836bca2a751315d..ddd6776cfc78f48f371ba677fd268c7a46964ade 100644 (file)
@@ -46,8 +46,8 @@ bool InteractEngine::getInput(){
                bool errorFree;
                ErrorCheck* errorCheckor = new ErrorCheck();
                
-               cout << "mothur v1.2.0" << endl;
-               cout << "Last updated: 4/14/2009" << endl << endl;
+               cout << "mothur v.1.3.0" << endl;
+               cout << "Last updated: 4/25/2009" << endl << endl;
                cout << "by" << endl;
                cout << "Patrick D. Schloss" << endl << endl;
                cout << "Department of Microbiology" << endl;
index a1beda31ac5ee82c532676163179197a004f91eb..ebfecedb860bafb6385ca2f75a3752702354498b 100644 (file)
@@ -19,7 +19,7 @@ void FastaMap::readFastaFile(ifstream& in) {
                name = line.substr(1, line.length());  //rips off '>'
        
                //read through file
-               while (in.eof() != true) {
+               while (!in.eof()) {
                        in >> line;
                        if (line != "") {
                                if (isalnum(line.at(0))) {  //if it's a sequence line
@@ -41,19 +41,9 @@ void FastaMap::readFastaFile(ifstream& in) {
                                        sequence = "";
                                }
                        }
+                       gobble(in);
                }
        
-               //store last sequence and name info.
-               seqmap[name] = sequence;
-               it = data.find(sequence);
-               if (it == data.end()) {         //it's unique.
-                       data[sequence].groupname = name;  //group name will be the name of the first duplicate sequence found.
-                       data[sequence].groupnumber = 1;
-                       data[sequence].names = name;
-               }else { // its a duplicate.
-                       data[sequence].names += "," + name;
-                       data[sequence].groupnumber++;
-               }
                        
        }
        catch(exception& e) {
index 6c022426645cc7af9ae353106f4d167e64812bee..a6bd54982e9ca509463f5d40822d1ae232e83bb1 100644 (file)
@@ -23,7 +23,7 @@ FilterSeqsCommand::FilterSeqsCommand(){
        db = readSeqs->getDB();
        numSeqs = db->size();
        
-       alignmentLength = db->get(0).getLength();
+       alignmentLength = db->get(0).getAlignLength();
 
        filter = string(alignmentLength, '1');
 }
index 2279689d4671d5a7fae99204d0ab5421b5070ed6..af4c03a950a0f42ad4fd9d7453ad698423381bd0 100644 (file)
@@ -44,4 +44,4 @@ private:
 
 };
 
-#endif
\ No newline at end of file
+#endif
index 67ec702fdd122b732b0f7e0598fe54f791a121c8..799793ece4dda8567d1dcb5a9865a70c9dd45f3e 100644 (file)
@@ -48,14 +48,14 @@ void NastReport::print(){
 
 void NastReport::setCandidate(Sequence* candSeq){ 
        queryName = candSeq->getName();
-       queryLength = candSeq->getUnalignLength();
+       queryLength = candSeq->getNumBases();
 }
 
 /******************************************************************************************************************/
 
 void NastReport::setTemplate(Sequence* tempSeq){ 
        templateName = tempSeq->getName();
-       templateLength = tempSeq->getUnalignLength();
+       templateLength = tempSeq->getNumBases();
 }
 
 /******************************************************************************************************************/
diff --git a/seqsummarycommand.cpp b/seqsummarycommand.cpp
new file mode 100644 (file)
index 0000000..e71c2af
--- /dev/null
@@ -0,0 +1,132 @@
+/*
+ *  seqcoordcommand.cpp
+ *  Mothur
+ *
+ *  Created by Pat Schloss on 5/30/09.
+ *  Copyright 2009 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "seqsummarycommand.h"
+
+//***************************************************************************************************************
+
+SeqSummaryCommand::SeqSummaryCommand(){
+       try {
+               globaldata = GlobalData::getInstance();
+               
+               if(globaldata->getFastaFile() != "")            {       readSeqs = new ReadFasta(globaldata->inputFileName);    }
+               else if(globaldata->getNexusFile() != "")       {       readSeqs = new ReadNexus(globaldata->inputFileName);    }
+               else if(globaldata->getClustalFile() != "") {   readSeqs = new ReadClustal(globaldata->inputFileName);  }
+               else if(globaldata->getPhylipFile() != "")      {       readSeqs = new ReadPhylip(globaldata->inputFileName);   }
+               
+               readSeqs->read();
+               db = readSeqs->getDB();
+               numSeqs = db->size();
+       }
+       catch(exception& e) {
+               cout << "Standard Error: " << e.what() << " has occurred in the SeqCoordCommand class Function SeqCoordCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+               exit(1);
+       }
+       catch(...) {
+               cout << "An unknown error has occurred in the SeqCoordCommand class function SeqCoordCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+               exit(1);
+       }       
+}
+
+//***************************************************************************************************************
+
+SeqSummaryCommand::~SeqSummaryCommand(){
+}
+
+//***************************************************************************************************************
+
+int SeqSummaryCommand::execute(){
+       try{
+               
+               ofstream outfile;
+               string summaryFile = getRootName(globaldata->inputFileName) + "summary";
+               openOutputFile(summaryFile, outfile);
+
+               vector<int> startPosition(numSeqs, 0);
+               vector<int> endPosition(numSeqs, 0);
+               vector<int> seqLength(numSeqs, 0);
+               vector<int> ambigBases(numSeqs, 0);
+               vector<int> longHomoPolymer(numSeqs, 0);
+               
+               if(db->get(0).getIsAligned() == 1){
+                       outfile << "seqname\tstart\tend\tlength\tambiguities\tlonghomopolymer" << endl;                 
+                       for(int i = 0; i < numSeqs; i++) {
+                               Sequence current = db->get(i);
+                               startPosition[i] = current.getStartPos();
+                               endPosition[i] = current.getEndPos();
+                               seqLength[i] = current.getNumBases();
+                               ambigBases[i] = current.getAmbigBases();
+                               longHomoPolymer[i] = current.getLongHomoPolymer();
+                               outfile << current.getName() << '\t' << startPosition[i] << '\t' << endPosition[i] << '\t' << seqLength[i] << '\t' << ambigBases[i] << '\t' << longHomoPolymer[i] << endl;
+                       }
+               }
+               else{
+                       outfile << "seqname\tlength\tambiguities\tlonghomopolymer" << endl;
+                       for(int i=0;i<numSeqs;i++){
+                               Sequence current = db->get(i);
+                               seqLength[i] = current.getNumBases();
+                               ambigBases[i] = current.getAmbigBases();
+                               longHomoPolymer[i] = current.getLongHomoPolymer();
+                               outfile << current.getName() << '\t' << seqLength[i] << '\t' << ambigBases[i] << '\t' << longHomoPolymer[i] << endl;
+                       }
+               }
+               
+               sort(seqLength.begin(), seqLength.end());
+               sort(ambigBases.begin(), ambigBases.end());
+               sort(longHomoPolymer.begin(), longHomoPolymer.end());
+               
+               int median                      = int(numSeqs * 0.500);
+               int lowestPtile         = int(numSeqs * 0.025);
+               int lowPtile            = int(numSeqs * 0.250);
+               int highPtile           = int(numSeqs * 0.750);
+               int highestPtile        = int(numSeqs * 0.975);
+               int max                         = numSeqs - 1;
+               
+               cout << endl;
+               if(db->get(0).getIsAligned() == 1){
+                       sort(startPosition.begin(), startPosition.end());
+                       sort(endPosition.begin(), endPosition.end());
+                                       
+                       cout << "\t\tStart\tEnd\tLength\tN's\tPolymer" << endl;
+                       cout << "Minimum:\t" << startPosition[0] << '\t' << endPosition[0] << '\t' << seqLength[0] << '\t' << ambigBases[0] << '\t' << longHomoPolymer[0] << endl;
+                       cout << "2.5%-tile:\t" << startPosition[lowestPtile] << '\t' << endPosition[lowestPtile] << '\t' << seqLength[lowestPtile] << '\t' << ambigBases[lowestPtile] << '\t' << longHomoPolymer[lowestPtile] << endl;
+                       cout << "25%-tile:\t" << startPosition[lowPtile] << '\t' << endPosition[lowPtile] << '\t' << seqLength[lowPtile] << '\t' << ambigBases[lowPtile] << '\t' << longHomoPolymer[lowPtile] << endl;
+                       cout << "Median: \t" << startPosition[median] << '\t' << endPosition[median] << '\t' << seqLength[median] << '\t' << ambigBases[median] << '\t' << longHomoPolymer[median] << endl;
+                       cout << "75%-tile:\t" << startPosition[highPtile] << '\t' << endPosition[highPtile] << '\t' << seqLength[highPtile] << '\t' << ambigBases[highPtile] << '\t' << longHomoPolymer[highPtile] << endl;
+                       cout << "97.5%-tile:\t" << startPosition[highestPtile] << '\t' << endPosition[highestPtile] << '\t' << seqLength[highestPtile] << '\t' << ambigBases[highestPtile] << '\t' << longHomoPolymer[highestPtile] << endl;
+                       cout << "Maximum:\t" << startPosition[max] << '\t' << endPosition[max] << '\t' << seqLength[max] << '\t' << ambigBases[max] << '\t' << longHomoPolymer[max] << endl;
+               }
+               else{
+                       cout << "\t\tLength\tN's\tPolymer" << endl;
+                       cout << "Minimum:\t" << seqLength[0] << '\t' << ambigBases[0] << '\t' << longHomoPolymer[0] << endl;
+                       cout << "2.5%-tile:\t" << seqLength[lowestPtile] << '\t' << ambigBases[lowestPtile] << '\t' << longHomoPolymer[lowestPtile] << endl;
+                       cout << "25%-tile:\t" << seqLength[lowPtile] << '\t' << ambigBases[lowPtile] << '\t' << longHomoPolymer[lowPtile] << endl;
+                       cout << "Median: \t" << seqLength[median] << '\t' << ambigBases[median] << '\t' << longHomoPolymer[median] << endl;
+                       cout << "75%-tile:\t"<< seqLength[highPtile] << '\t' << ambigBases[highPtile] << '\t' << longHomoPolymer[highPtile] << endl;
+                       cout << "97.5%-tile:\t"<< seqLength[highestPtile] << '\t' << ambigBases[highestPtile] << '\t' << longHomoPolymer[highestPtile] << endl;
+                       cout << "Maximum:\t" << seqLength[max] << '\t' << ambigBases[max] << '\t' << longHomoPolymer[max] << endl;
+               }
+               cout << "# of Seqs:\t" << numSeqs << endl;
+               
+               return 0;
+       }
+       catch(exception& e) {
+               cout << "Standard Error: " << e.what() << " has occurred in the FilterSeqsCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+               exit(1);
+       }
+       catch(...) {
+               cout << "An unknown error has occurred in the FilterSeqsCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+               exit(1);
+       }
+       
+}
+
+//***************************************************************************************************************
+
+
diff --git a/seqsummarycommand.h b/seqsummarycommand.h
new file mode 100644 (file)
index 0000000..02103e7
--- /dev/null
@@ -0,0 +1,36 @@
+#ifndef SEQSUMMARYCOMMAND_H
+#define SEQSUMMARYCOMMAND_H
+
+/*
+ *  seqcoordcommand.h
+ *  Mothur
+ *
+ *  Created by Pat Schloss on 5/30/09.
+ *  Copyright 2009 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+#include "command.hpp"
+#include "globaldata.hpp"
+#include "readfasta.h"
+#include "readnexus.h"
+#include "readclustal.h"
+#include "readseqsphylip.h"
+
+using namespace std;
+
+class SeqSummaryCommand : public Command {
+public:
+       SeqSummaryCommand();
+       ~SeqSummaryCommand();
+       int execute();
+       
+private:
+       int numSeqs;    
+       GlobalData* globaldata; 
+       ReadSeqs* readSeqs;
+       SequenceDB* db;
+};
+
+#endif
index 2f167c52bb225edce6a31cbc623b8bcfd3d4c61c..423b9058b23b0815326465f86b9916777c0359ce 100644 (file)
@@ -13,16 +13,22 @@ using namespace std;
 
 /***********************************************************************/
 
-Sequence::Sequence()  {}
+Sequence::Sequence(){
+       initialize();
+}
 
 /***********************************************************************/
 
 Sequence::Sequence(string newName, string sequence) {
+
+       initialize();   
        name = newName;
        if(sequence.find_first_of('-') != string::npos) {
                setAligned(sequence);
+               isAligned = 1;
        }
        setUnaligned(sequence);
+       
 }
 //********************************************************************************************************************
 
@@ -57,22 +63,22 @@ Sequence::Sequence(ifstream& fastaFile){
 
 //********************************************************************************************************************
 
-string Sequence::convert2ints() {
+void Sequence::initialize(){
        
-       if(unaligned == "")     {       /* need to throw an error */    }
+       name = "";
+       unaligned = "";
+       aligned = "";
+       pairwise = "";
        
-       string processed;
+       numBases = 0;
+       alignmentLength = 0;
+       isAligned = 0;
+       startPos = -1;
+       endPos = -1;
+       longHomoPolymer = 0;
+       ambigBases = 0;
        
-       for(int i=0;i<unaligned.length();i++) {
-               if(toupper(unaligned[i]) == 'A')                {       processed += '0';       }
-               else if(toupper(unaligned[i]) == 'C')   {       processed += '1';       }
-               else if(toupper(unaligned[i]) == 'G')   {       processed += '2';       }
-               else if(toupper(unaligned[i]) == 'T')   {       processed += '3';       }
-               else if(toupper(unaligned[i]) == 'U')   {       processed += '3';       }
-               else                                                                    {       processed += '4';       }
-       }
-       return processed;
-}
+}      
 
 //********************************************************************************************************************
 
@@ -95,6 +101,7 @@ void Sequence::setUnaligned(string sequence){
        else {
                unaligned = sequence;
        }
+       numBases = unaligned.length();
        
 }
 
@@ -103,26 +110,28 @@ void Sequence::setUnaligned(string sequence){
 void Sequence::setAligned(string sequence){
        
        //if the alignment starts or ends with a gap, replace it with a period to indicate missing data
-       
-       if(sequence[0] == '-'){
-               for(int i=0;i<sequence.length();i++){
-                       if(sequence[i] == '-'){
-                               sequence[i] = '.';
+       aligned = sequence;
+       alignmentLength = aligned.length();
+
+       if(aligned[0] == '-'){
+               for(int i=0;i<alignmentLength;i++){
+                       if(aligned[i] == '-'){
+                               aligned[i] = '.';
                        }
                        else{
                                break;
                        }
                }
-               for(int i=sequence.length()-1;i>=0;i--){
-                       if(sequence[i] == '-'){
-                               sequence[i] = '.';
+               for(int i=alignmentLength-1;i>=0;i--){
+                       if(aligned[i] == '-'){
+                               aligned[i] = '.';
                        }
                        else{
                                break;
                        }
                }
        }
-       aligned = sequence;
+       
 }
 
 //********************************************************************************************************************
@@ -133,6 +142,25 @@ void Sequence::setPairwise(string sequence){
 
 //********************************************************************************************************************
 
+string Sequence::convert2ints() {
+       
+       if(unaligned == "")     {       /* need to throw an error */    }
+       
+       string processed;
+       
+       for(int i=0;i<unaligned.length();i++) {
+               if(toupper(unaligned[i]) == 'A')                {       processed += '0';       }
+               else if(toupper(unaligned[i]) == 'C')   {       processed += '1';       }
+               else if(toupper(unaligned[i]) == 'G')   {       processed += '2';       }
+               else if(toupper(unaligned[i]) == 'T')   {       processed += '3';       }
+               else if(toupper(unaligned[i]) == 'U')   {       processed += '3';       }
+               else                                                                    {       processed += '4';       }
+       }
+       return processed;
+}
+
+//********************************************************************************************************************
+
 string Sequence::getName(){
        return name;
 }
@@ -157,34 +185,96 @@ string Sequence::getUnaligned(){
 
 //********************************************************************************************************************
 
-int Sequence::getLength(){
-       if(unaligned.length() > aligned.length())
-               return unaligned.length();
-       return aligned.length();
+int Sequence::getNumBases(){
+       return numBases;
 }
 
 //********************************************************************************************************************
 
 void Sequence::printSequence(ostream& out){
-       string toPrint = unaligned;
-       if(aligned.length() > unaligned.length())
-               toPrint = aligned;
-       out << ">" << name << "\n" << toPrint << "\n";
+
+       out << ">" << name << endl;
+       if(isAligned){
+               out << aligned << endl;
+       }
+       else{
+               out << unaligned << endl;
+       }
 }
 
 //********************************************************************************************************************
 
-int Sequence::getUnalignLength(){
-       return unaligned.length();
+int Sequence::getAlignLength(){
+       return alignmentLength;
 }
 
 //********************************************************************************************************************
 
-int Sequence::getAlignLength(){
-       return aligned.length();
+int Sequence::getAmbigBases(){
+       if(ambigBases == -1){
+       
+               for(int j=0;j<numBases;j++){
+                       if(unaligned[j] != 'A' && unaligned[j] != 'T' && unaligned[j] != 'G' && unaligned[j] != 'C'){
+                               ambigBases++;
+                       }
+               }
+       }       
+       
+       return ambigBases;
 }
 
 //********************************************************************************************************************
 
+int Sequence::getLongHomoPolymer(){
+       if(longHomoPolymer == 0){
 
+               int homoPolymer = 1;
+               for(int j=1;j<numBases;j++){
+                       if(unaligned[j] == unaligned[j-1]){
+                               homoPolymer++;
+                       }
+                       else{
+                               if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
+                               homoPolymer = 1;
+                       }
+               }
+               if(homoPolymer > longHomoPolymer){      longHomoPolymer = homoPolymer;  }
+       }
+       return longHomoPolymer;
+}
+
+//********************************************************************************************************************
 
+int Sequence::getStartPos(){
+       if(endPos == -1){
+               for(int j = 0; j < alignmentLength; j++) {
+                       if(aligned[j] != '.'){
+                               startPos = j;
+                               break;
+                       }
+               }
+       }       
+       return startPos;
+}
+
+//********************************************************************************************************************
+
+int Sequence::getEndPos(){
+       if(endPos == -1){
+               for(int j=alignmentLength-1;j>=0;j--){
+                       if(aligned[j] != '.'){
+                               endPos = j;
+                               break;
+                       }
+               }
+       }
+       return endPos;
+}
+
+//********************************************************************************************************************
+
+bool Sequence::getIsAligned(){
+       return isAligned;
+}
+
+//********************************************************************************************************************
index 37ca87cf019ead5dd230bb8226d09761e7d16880..929ca781ba9906c736bc83e2a79b556b583651f2 100644 (file)
@@ -37,18 +37,27 @@ public:
        string getAligned();
        string getPairwise();
        string getUnaligned();
-       int getLength();  //the greater of the lengths of unaligned and aligned
-       int getUnalignLength();
+       int getNumBases();
+       int getStartPos();
+       int getEndPos();
        int getAlignLength();
+       int getAmbigBases();
+       int getLongHomoPolymer();
+       bool getIsAligned();
        void printSequence(ostream&);
        
 private:
+       void initialize();
        string name;
        string unaligned;
        string aligned;
        string pairwise;
-       int length;
-       int lengthAligned;
+       int numBases;
+       int alignmentLength;
+       bool isAligned;
+       int longHomoPolymer;
+       int ambigBases;
+       int startPos, endPos;
 };
 
 /**************************************************************************************************/
index 6f2790d50b34fa8ed9bc86c6ac3d31a85f136046..544933c804abcf2d1aa372936352d9c476aac838 100644 (file)
@@ -44,6 +44,7 @@ ValidCommands::ValidCommands() {
                commands["help"]                                = "help"; 
                commands["filter.seqs"]                 = "filter.seqs";
                commands["align.seqs"]                  = "align.seqs";
+               commands["summary.seqs"]                = "summary.seqs";
                commands["quit"]                                = "quit"; 
 
                                
index 4aad1c22c037515fcb5d6f166aae8af4842d8fd4..6069a4fc387521a327b21b5c257bf76a7d170c36 100644 (file)
@@ -270,6 +270,9 @@ void ValidParameters::initCommandParameters() {
                string filterseqsArray[] =  {"fasta","phylip","clustal","nexus", "trump", "soft", "hard", "vertical"};
                commandParameters["filter.seqs"] = addParameters(filterseqsArray, sizeof(filterseqsArray)/sizeof(string));
 
+               string summaryseqsArray[] =  {"fasta","phylip","clustal","nexus"};
+               commandParameters["summary.seqs"] = addParameters(summaryseqsArray, sizeof(summaryseqsArray)/sizeof(string));
+
                string vennArray[] =  {"groups","line","label","calc"};
                commandParameters["venn"] = addParameters(vennArray, sizeof(vennArray)/sizeof(string));