]> git.donarmstrong.com Git - mothur.git/commitdiff
pat's edits of screen.seqs and summary.seqs
authorpschloss <pschloss>
Thu, 4 Jun 2009 13:59:25 +0000 (13:59 +0000)
committerpschloss <pschloss>
Thu, 4 Jun 2009 13:59:25 +0000 (13:59 +0000)
15 files changed:
aligncommand.cpp
aligncommand.h
database.cpp
filterseqscommand.cpp
filterseqscommand.h
globaldata.cpp
kmerdb.cpp
mothur.h
screenseqscommand.cpp
screenseqscommand.h
seqsummarycommand.cpp
seqsummarycommand.h
sequence.cpp
sequence.hpp
validparameter.cpp

index 242e9a8fe0e3ba7a7f199846ae0e80cba10f0edb..08c291d2bd28e1f74efdb0562c2c1ccc329d9a25 100644 (file)
 AlignCommand::AlignCommand(){
        try {
                globaldata = GlobalData::getInstance();
-               candidateFileName = globaldata->getCandidateFile();
-               templateFileName = globaldata->getFastaFile();
-               openInputFile(candidateFileName, in);
+               if(globaldata->getFastaFile() == "" && globaldata->getPhylipFile() == "" && globaldata->getNexusFile() == "" && globaldata->getClustalFile() == ""){
+                       cout << "you forgot a template file" << endl;
+               }
+               openInputFile(globaldata->getCandidateFile(), in);
+               
                convert(globaldata->getKSize(), kmerSize);
                convert(globaldata->getMatch(), match);
                convert(globaldata->getMismatch(), misMatch);
@@ -73,28 +75,33 @@ int AlignCommand::execute(){
                srand( (unsigned)time( NULL ) );  //needed to assign names to temporary files
                
                Database* templateDB;
-               if(globaldata->getSearch() == "kmer")                   {       templateDB = new KmerDB(templateFileName, kmerSize);                                                            }
-               else if(globaldata->getSearch() == "suffix")    {       templateDB = new SuffixDB(templateFileName);                                                                            }
-               else if(globaldata->getSearch() == "blast")             {       templateDB = new BlastDB(templateFileName, gapOpen, gapExtend, match, misMatch);        }
-               else {  cout << globaldata->getSearch() << " is not a valid search option. I will run the command using suffix." << endl;
-                               templateDB = new SuffixDB(templateFileName);            }
+               if(globaldata->getSearch() == "kmer")                   {       templateDB = new KmerDB(globaldata->getFastaFile() , kmerSize); }
+               else if(globaldata->getSearch() == "suffix")    {       templateDB = new SuffixDB(globaldata->getFastaFile());                  }
+               else if(globaldata->getSearch() == "blast")             {       templateDB = new BlastDB(globaldata->getFastaFile(), gapOpen, gapExtend, match, misMatch);      }
+               else {
+                       cout << globaldata->getSearch() << " is not a valid search option. I will run the command using kmer, ksize=8." << endl;
+                       templateDB = new KmerDB(globaldata->getFastaFile(), kmerSize);
+               }
        
                Alignment* alignment;
-               if(globaldata->getAlign() == "gotoh")                                   {       alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, 3000);        }
-               else if(globaldata->getAlign() == "needleman")                  {       alignment = new NeedlemanOverlap(gapOpen, match, misMatch, 3000);                       }
-               else if(globaldata->getAlign() == "blast")                              {       alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch);            }
-               else if(globaldata->getAlign() == "noalign")                    {       alignment = new NoAlign();                                                                                                      }
-               else {  cout << globaldata->getAlign() << " is not a valid alignment option. I will run the command using blast." << endl;
-                               alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch);    }
+               if(globaldata->getAlign() == "gotoh")                   {       alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, 3000);        }
+               else if(globaldata->getAlign() == "needleman")  {       alignment = new NeedlemanOverlap(gapOpen, match, misMatch, 3000);                       }
+               else if(globaldata->getAlign() == "blast")              {       alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch);            }
+               else if(globaldata->getAlign() == "noalign")    {       alignment = new NoAlign();                                                                                                      }
+               else {
+                       cout << globaldata->getAlign() << " is not a valid alignment option. I will run the command using needleman." << endl;
+                       alignment = new NeedlemanOverlap(gapOpen, match, misMatch, 3000);
+               }
                                
                int numFastaSeqs=count(istreambuf_iterator<char>(in),istreambuf_iterator<char>(), '>');
                in.seekg(0);
        
-               string candidateAligngmentFName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + globaldata->getSearch() + '.' + globaldata->getAlign() + ".nast.align";
+               candidateFileName = globaldata->getCandidateFile();
+               string candidateAligngmentFName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + "align";
                ofstream candidateAlignmentFile;
                openOutputFile(candidateAligngmentFName, candidateAlignmentFile);
 
-               string candidateReportFName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + globaldata->getSearch() + '.' + globaldata->getAlign() + ".nast.report";
+               string candidateReportFName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + "align.report";
                NastReport report(candidateReportFName);
 
                cout << "We are going to align the " << numFastaSeqs << " sequences in " << candidateFileName << "..." << endl;
@@ -110,6 +117,7 @@ int AlignCommand::execute(){
                        Sequence* templateSeq = templateDB->findClosestSequence(candidateSeq);
                        report.setTemplate(templateSeq);
                        report.setSearchParameters(globaldata->getSearch(), templateDB->getSearchScore());
+                       
                                
                        Nast nast(alignment, candidateSeq, templateSeq);
                        report.setAlignmentParameters(globaldata->getAlign(), alignment);
index cf30a6b456dfcf803b137a2500a10dd36f7a5f25..b5ff7825c7ae06fb8351a2ed9b398379d6abbc9a 100644 (file)
 
 class AlignCommand : public Command {
        
-       public:
-               AlignCommand(); 
-               ~AlignCommand();
-               int execute();  
-       
-       private:
-               GlobalData* globaldata;
-               string candidateFileName, templateFileName, distanceFileName;
-               int kmerSize;
-               float match, misMatch, gapOpen, gapExtend;
-               ofstream out;
-               ifstream in;
+public:
+       AlignCommand(); 
+       ~AlignCommand();
+       int execute();  
+
+private:
+       GlobalData* globaldata;
+       string candidateFileName, templateFileName, distanceFileName;
+       int kmerSize;
+       float match, misMatch, gapOpen, gapExtend;
+       ofstream out;
+       ifstream in;
 
 };
 
index 5b93321a4c8b0f25915d479407f5152918d12f6a..44e663bde3c9a9d9bd40895ef1a600779594ce21 100644 (file)
@@ -23,6 +23,7 @@ Database::Database(string fastaFileName){             //      This assumes that the template dat
 
        cout << endl << "Reading in the " << fastaFileName << " template sequences...\t";       cout.flush();
 
+       //all of this is elsewhere already!
        numSeqs=count(istreambuf_iterator<char>(fastaFile),istreambuf_iterator<char>(), '>');   //      count the number of
        fastaFile.seekg(0);                                                                                                                                             //      sequences
        
@@ -30,11 +31,8 @@ Database::Database(string fastaFileName){            //      This assumes that the template dat
        
        string seqName, sequence;
        for(int i=0;i<numSeqs;i++){
-//             templateSequences[i] = new Sequence();          //      We're storing the aligned template sequences as a vector of
-                                                                                                       //      pointers to Sequence objects
                fastaFile >> seqName;
-//             templateSequences[i]->setName(seqName);
-               
+               seqName = seqName.substr(1);
                char letter;
                string aligned;
                
@@ -46,12 +44,12 @@ Database::Database(string fastaFileName){           //      This assumes that the template dat
                        }
                }
                templateSequences[i] = new Sequence(seqName, aligned);
-//             templateSequences[i]->setAligned(aligned);      //      Obviously, we need the fully aligned template sequence
-//             templateSequences[i]->setUnaligned(aligned);//  We will also need the unaligned sequence for pairwise alignments
-               fastaFile.putback(letter);                                      //      and database searches
+               fastaFile.putback(letter);
        }
        
        fastaFile.close();
+       //all of this is elsewhere already!
+       
        cout << "DONE." << endl;        cout.flush();
 
 }
index a6bd54982e9ca509463f5d40822d1ae232e83bb1..8864bb3c57d32052220013ebf88267f865ccdbd1 100644 (file)
 FilterSeqsCommand::FilterSeqsCommand(){
        globaldata = GlobalData::getInstance();
        
-       if(globaldata->getFastaFile() != "")            {       readSeqs =  new ReadFasta(globaldata->inputFileName);   }
-       else if(globaldata->getNexusFile() != "")       {       readSeqs = new ReadNexus(globaldata->inputFileName);    }
-       else if(globaldata->getClustalFile() != "") {   readSeqs = new ReadClustal(globaldata->inputFileName);  }
-       else if(globaldata->getPhylipFile() != "")      {       readSeqs = new ReadPhylip(globaldata->inputFileName);   }
-       
-       readSeqs->read();
-       db = readSeqs->getDB();
-       numSeqs = db->size();
-       
-       alignmentLength = db->get(0).getAlignLength();
-
-       filter = string(alignmentLength, '1');
+       if(globaldata->getFastaFile() == "")            {       cout << "You must enter a fasta formatted file" << endl;        }
+       trump = globaldata->getTrump()[0];
+       vertical = 
+//     readSeqs->read();
+//     db = readSeqs->getDB();
+//     numSeqs = db->size();
+//     
+//     alignmentLength = db->get(0).getAlignLength();
+//
+//     filter = string(alignmentLength, '1');
 }
 
 /**************************************************************************************/
 
 void FilterSeqsCommand::doHard() {
        
-       string hardName = globaldata->getHard();
-       string hardFilter = "";
-               
-       ifstream fileHandle;
-       openInputFile(hardName, fileHandle);
-       
-       fileHandle >> hardFilter;
-       
-       if(hardFilter.length() != filter.length()){
-               cout << "The hard filter is not the same length as the alignment: Hard filter will not be applied." << endl;
-       }
-       else{
-               filter = hardFilter;
-       }
-       
+//     string hardName = globaldata->getHard();
+//     string hardFilter = "";
+//             
+//     ifstream fileHandle;
+//     openInputFile(hardName, fileHandle);
+//     
+//     fileHandle >> hardFilter;
+//     
+//     if(hardFilter.length() != filter.length()){
+//             cout << "The hard filter is not the same length as the alignment: Hard filter will not be applied." << endl;
+//     }
+//     else{
+//             filter = hardFilter;
+//     }
+
 }
 
 /**************************************************************************************/
 
 void FilterSeqsCommand::doTrump() {
 
-       char trump = globaldata->getTrump()[0];
        
        for(int i = 0; i < numSeqs; i++) {
                string curAligned = db->get(i).getAligned();;
@@ -71,62 +68,77 @@ void FilterSeqsCommand::doTrump() {
 
 void FilterSeqsCommand::doVertical() {
 
-       vector<int> counts(alignmentLength, 0);
-       
-       for(int i = 0; i < numSeqs; i++) {
-               string curAligned = db->get(i).getAligned();;
-               
-               for(int j = 0; j < alignmentLength; j++) {
-                       if(curAligned[j] == '-' || curAligned[j] == '.'){
-                               counts[j]++;
-                       }
-               }
-       }
-       for(int i=0;i<alignmentLength;i++){
-               if(counts[i] == numSeqs)        {       filter[i] = '0';                }
-       }
+//     vector<int> counts(alignmentLength, 0);
+//     
+//     for(int i = 0; i < numSeqs; i++) {
+//             string curAligned = db->get(i).getAligned();;
+//             
+//             for(int j = 0; j < alignmentLength; j++) {
+//                     if(curAligned[j] == '-' || curAligned[j] == '.'){
+//                             counts[j]++;
+//                     }
+//             }
+//     }
+//     for(int i=0;i<alignmentLength;i++){
+//             if(counts[i] == numSeqs)        {       filter[i] = '0';                }
+//     }
 }
 
 /**************************************************************************************/
 
 void FilterSeqsCommand::doSoft() {
 
-       int softThreshold = numSeqs * (float)atoi(globaldata->getSoft().c_str()) / 100.0;
-
-       vector<int> a(alignmentLength, 0);
-       vector<int> t(alignmentLength, 0);
-       vector<int> g(alignmentLength, 0);
-       vector<int> c(alignmentLength, 0);
-       vector<int> x(alignmentLength, 0);
-       
-       for(int i=0;i<numSeqs;i++){
-               string curAligned = db->get(i).getAligned();;
-
-               for(int j=0;j<alignmentLength;j++){
-                       if(toupper(curAligned[j]) == 'A')                                                                               {       a[j]++; }
-                       else if(toupper(curAligned[j]) == 'T' || toupper(curAligned[i]) == 'U') {       t[j]++; }
-                       else if(toupper(curAligned[j]) == 'G')                                                                  {       g[j]++; }
-                       else if(toupper(curAligned[j]) == 'C')                                                                  {       c[j]++; }
-               }
-       }
-
-       for(int i=0;i<alignmentLength;i++){
-               if(a[i] < softThreshold && t[i] < softThreshold && g[i] < softThreshold && c[i] < softThreshold){
-                       filter[i] = '0';                        
-               }
-       }
+//     int softThreshold = numSeqs * (float)atoi(globaldata->getSoft().c_str()) / 100.0;
+//
+//     vector<int> a(alignmentLength, 0);
+//     vector<int> t(alignmentLength, 0);
+//     vector<int> g(alignmentLength, 0);
+//     vector<int> c(alignmentLength, 0);
+//     vector<int> x(alignmentLength, 0);
+//     
+//     for(int i=0;i<numSeqs;i++){
+//             string curAligned = db->get(i).getAligned();;
+//
+//             for(int j=0;j<alignmentLength;j++){
+//                     if(toupper(curAligned[j]) == 'A')                                                                               {       a[j]++; }
+//                     else if(toupper(curAligned[j]) == 'T' || toupper(curAligned[i]) == 'U') {       t[j]++; }
+//                     else if(toupper(curAligned[j]) == 'G')                                                                  {       g[j]++; }
+//                     else if(toupper(curAligned[j]) == 'C')                                                                  {       c[j]++; }
+//             }
+//     }
+//
+//     for(int i=0;i<alignmentLength;i++){
+//             if(a[i] < softThreshold && t[i] < softThreshold && g[i] < softThreshold && c[i] < softThreshold){
+//                     filter[i] = '0';                        
+//             }
+//     }
 }
 
 /**************************************************************************************/
 
 int FilterSeqsCommand::execute() {     
        try {
-                                               
-               if(globaldata->getHard().compare("") != 0)              {       doHard();               }       //      has to be applied first!
-               if(globaldata->getTrump().compare("") != 0)             {       doTrump();              }
-               if(globaldata->getVertical() == "T")                    {       doVertical();   }
-               if(globaldata->getSoft().compare("") != 0)              {       doSoft();               }
+               
+               ifstream inFASTA;
+               openInputFile(globaldata->getFastaFile(), inFASTA);
+               
+               Sequence currSequence(inFASTA);
+               alignmentLength = currSequence.getAlignLength();
+               
+               //while
+               
+               
+               if(globaldata->getHard().compare("") != 0)      {       doHard();               }       //      has to be applied first!
+               if(globaldata->getTrump().compare("") != 0)     {       doTrump();              }
+               if(isTrue(globaldata->getVertical()))           {       doVertical();   }
+               if(globaldata->getSoft().compare("") != 0)      {       doSoft();               }
 
+               
+               
+               
+               
+               
+               
                ofstream outfile;
                string filterFile = getRootName(globaldata->inputFileName) + "filter";
                openOutputFile(filterFile, outfile);
index af4c03a950a0f42ad4fd9d7453ad698423381bd0..b59d0a4b4cfa1ce7f7d86deb48b90515f5cc965a 100644 (file)
@@ -32,15 +32,16 @@ private:
        void doSoft();
        void doHard();
        void doVertical();
-       
+       string filter;  
        int alignmentLength;
-       int numSeqs;
+
+       char trump;
+       bool vertical;
        
        GlobalData* globaldata; 
-       ReadSeqs* readSeqs;
-       SequenceDB* db;
+//     ReadSeqs* readSeqs;
+//     SequenceDB* db;
        
-       string filter;
 
 };
 
index ec937eac09d96bc04bdce2f1f0f2fcf9c08bc3ce..62641beab3e0603715ff4ff57c6a2d7326a22421 100644 (file)
@@ -141,7 +141,7 @@ void GlobalData::parseGlobalData(string commandString, string optionText){
                        if (key == "nexus" )    { nexusfile = value; inputFileName = value; fileroot = value; format = "nexus";         }
                        if (key == "clustal" )  { clustalfile = value; inputFileName = value; fileroot = value; format = "clustal"; } 
                        if (key == "tree" )             { treefile = value; inputFileName = value; fileroot = value; format = "tree";           } 
-                       if (key == "shared" )   { sharedfile = value; inputFileName = value; fileroot = value; format = "sharedfile";   } 
+                       if (key == "shared" )   { sharedfile = value; inputFileName = value; fileroot = value; format = "sharedfile";   }
                        if (key == "name" )             { namefile = value;             }
                        if (key == "order" )    { orderfile = value;    }
                        if (key == "group" )    { groupfile = value;    }
@@ -413,7 +413,7 @@ void GlobalData::clear() {
        processors              =       "1";
        size            =   "0";
        search                  =       "kmer";
-       ksize                   =       "7";
+       ksize                   =       "8";
        align                   =       "needleman";
        match                   =       "1.0";
        mismatch                =       "-1.0";
@@ -449,7 +449,7 @@ void GlobalData::reset() {
        processors              =       "1";
        size            =   "0";
        search                  =       "kmer";
-       ksize                   =       "7";
+       ksize                   =       "8";
        align                   =       "needleman";
        match                   =       "1.0";
        mismatch                =       "-1.0";
index 27e6997d27878577e9121e6ecfb8950cee02e0c7..9652ac472fe38e14bff9707afdaba95526c35bff 100644 (file)
@@ -57,20 +57,19 @@ KmerDB::KmerDB(string fastaFileName, int kSize) : Database(fastaFileName), kmerS
 /**************************************************************************************************/
 
 Sequence* KmerDB::findClosestSequence(Sequence* candidateSeq){
-
-       vector<int> matches(numSeqs, 0);                                                //      a record of the sequences with shared kmers
-       vector<int> timesKmerFound(kmerLocations.size()+1, 0);  //      a record of the kmers that we have already found
+       
+       Kmer kmer(kmerSize);
        
        searchScore = 0;
        int maxSequence = 0;
-       
-       string query = candidateSeq->getUnaligned();                    //      we want to search using an unaligned dna sequence
 
-       int numKmers = query.length() - kmerSize + 1;   
-       Kmer kmer(kmerSize);
-       
+       vector<int> matches(numSeqs, 0);                                                //      a record of the sequences with shared kmers
+       vector<int> timesKmerFound(kmerLocations.size()+1, 0);  //      a record of the kmers that we have already found
+
+       int numKmers = candidateSeq->getNumBases() - kmerSize + 1;      
+
        for(int i=0;i<numKmers;i++){
-               int kmerNumber = kmer.getKmerNumber(query, i);          //      go through the query sequence and get a kmer number
+               int kmerNumber = kmer.getKmerNumber(candidateSeq->getUnaligned(), i);           //      go through the query sequence and get a kmer number
                if(timesKmerFound[kmerNumber] == 0){                            //      if we haven't seen it before...
                        for(int j=0;j<kmerLocations[kmerNumber].size();j++){//increase the count for each sequence that also has
                                matches[kmerLocations[kmerNumber][j]]++;        //      that kmer
@@ -80,14 +79,14 @@ Sequence* KmerDB::findClosestSequence(Sequence* candidateSeq){
        }
 
        for(int i=0;i<numSeqs;i++){                                                             //      find the db sequence that shares the most kmers with
-               if(matches[i] > searchScore){                                           //      the query sequence
+               if(matches[i] > searchScore){                                   //      the query sequence
                        searchScore = matches[i];
                        maxSequence = i;
                }
        }
-       searchScore = 100 * searchScore / (float)numKmers;
-       return templateSequences[maxSequence];                                  //      return the Sequence object corresponding to the db
-                                                                                                                       //      sequence with the most shared kmers.
+
+       searchScore = 100 * searchScore / (float) numKmers;             //      return the Sequence object corresponding to the db
+       return templateSequences[maxSequence];                                  //      sequence with the most shared kmers.
 }
 
 /**************************************************************************************************/
index 670a32c8326a76da2cf894cac0d7ad10a9a10825..f07f343b05d91148c64ebccc79b7d165bf51c8f7 100644 (file)
--- a/mothur.h
+++ b/mothur.h
@@ -316,6 +316,16 @@ inline int openOutputFileAppend(string fileName, ofstream& fileHandle){
 }
 
 
+/***********************************************************************/
+
+inline int getNumSeqs(ifstream& file){
+       
+       int numSeqs = count(istreambuf_iterator<char>(file),istreambuf_iterator<char>(), '>');
+       file.seekg(0);
+       return numSeqs;
+
+}
+
 /***********************************************************************/
 
 //This function parses the estimator options and puts them in a vector
index 94094c7088cce89e1ffba059d815bd06583bb418..586e436982c3629b842e0b9ddb0da73a9e3d5ebb 100644 (file)
 ScreenSeqsCommand::ScreenSeqsCommand(){
        try {
                globaldata = GlobalData::getInstance();
-               
-               if(globaldata->getFastaFile() != "")            {       readSeqs = new ReadFasta(globaldata->inputFileName);    }
-               else if(globaldata->getNexusFile() != "")       {       readSeqs = new ReadNexus(globaldata->inputFileName);    }
-               else if(globaldata->getClustalFile() != "") {   readSeqs = new ReadClustal(globaldata->inputFileName);  }
-               else if(globaldata->getPhylipFile() != "")      {       readSeqs = new ReadPhylip(globaldata->inputFileName);   }
-               
-               readSeqs->read();
-               db = readSeqs->getDB();
-               numSeqs = db->size();
+               if(globaldata->getFastaFile() == "")            {       cout << "you must provide a fasta formatted file" << endl;      }
        }
        catch(exception& e) {
                cout << "Standard Error: " << e.what() << " has occurred in the SeqCoordCommand class Function SeqCoordCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
@@ -36,9 +28,7 @@ ScreenSeqsCommand::ScreenSeqsCommand(){
 
 //***************************************************************************************************************
 
-ScreenSeqsCommand::~ScreenSeqsCommand(){
-       delete readSeqs;
-}
+ScreenSeqsCommand::~ScreenSeqsCommand(){       /*      do nothing      */      }
 
 //***************************************************************************************************************
 
@@ -52,6 +42,9 @@ int ScreenSeqsCommand::execute(){
                convert(globaldata->getMinLength(), minLength);
                convert(globaldata->getMaxLength(), maxLength);
                
+               ifstream inFASTA;
+               openInputFile(globaldata->getFastaFile(), inFASTA);
+               
                set<string> badSeqNames;
                
                string goodSeqFile = getRootName(globaldata->inputFileName) + "good" + getExtension(globaldata->inputFileName);
@@ -59,16 +52,16 @@ int ScreenSeqsCommand::execute(){
                
                ofstream goodSeqOut;    openOutputFile(goodSeqFile, goodSeqOut);
                ofstream badSeqOut;             openOutputFile(badSeqFile, badSeqOut);          
-
-               for(int i=0;i<numSeqs;i++){
-                       Sequence currSeq = db->get(i);
+               
+               while(!inFASTA.eof()){
+                       Sequence currSeq(inFASTA);
                        bool goodSeq = 1;               //      innocent until proven guilty
-                       if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos())          {       goodSeq = 0;    }
-                       if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos())                        {       goodSeq = 0;    }
-                       if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases())        {       goodSeq = 0;    }
-                       if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()){  goodSeq = 0;    }
-                       if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases())        {       goodSeq = 0;    }
-                       if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases())        {       goodSeq = 0;    }
+                       if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos())                  {       goodSeq = 0;    }
+                       if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos())                                {       goodSeq = 0;    }
+                       if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases())                {       goodSeq = 0;    }
+                       if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer())   {       goodSeq = 0;    }
+                       if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases())                {       goodSeq = 0;    }
+                       if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases())                {       goodSeq = 0;    }
                        
                        if(goodSeq == 1){
                                currSeq.printSequence(goodSeqOut);      
@@ -77,12 +70,8 @@ int ScreenSeqsCommand::execute(){
                                currSeq.printSequence(badSeqOut);       
                                badSeqNames.insert(currSeq.getName());
                        }
-               }
-               
-               if(globaldata->getNameFile() != ""){
-                       screenNameGroupFile(badSeqNames);
-                       
-               }
+                       gobble(inFASTA);
+               }       
                
                return 0;
        }
@@ -173,4 +162,38 @@ void ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
 
 //***************************************************************************************************************
 
+void ScreenSeqsCommand::screenGroupFile(set<string> badSeqNames){
+
+       ifstream inputGroups;
+       openInputFile(globaldata->getGroupFile(), inputGroups);
+       string seqName, group;
+       set<string>::iterator it;
+       
+       string goodGroupFile = getRootName(globaldata->getGroupFile()) + "good" + getExtension(globaldata->getGroupFile());
+       string badGroupFile = getRootName(globaldata->getGroupFile()) + "bad" + getExtension(globaldata->getGroupFile());
+       
+       ofstream goodGroupOut;  openOutputFile(goodGroupFile, goodGroupOut);
+       ofstream badGroupOut;   openOutputFile(badGroupFile, badGroupOut);              
+       
+       while(!inputGroups.eof()){
+               inputGroups >> seqName >> group;
+               it = badSeqNames.find(seqName);
+               
+               if(it != badSeqNames.end()){
+                       badSeqNames.erase(it);
+                       badGroupOut << seqName << '\t' << group << endl;
+               }
+               else{
+                       goodGroupOut << seqName << '\t' << group << endl;
+               }
+               gobble(inputGroups);
+       }
+       inputGroups.close();
+       goodGroupOut.close();
+       badGroupOut.close();
+       
+}
+
+//***************************************************************************************************************
+
 
index f88fe2c442cf1d8ea6b0457a45390383c0c45698..56c9d82f1625a2a9481a203b7b217a9bc3b66bc4 100644 (file)
@@ -16,7 +16,6 @@
 #include "readnexus.h"
 #include "readclustal.h"
 #include "readseqsphylip.h"
-#include <set>
 
 using namespace std;
 
@@ -28,10 +27,9 @@ public:
        int execute();
 private:
        void screenNameGroupFile(set<string>);
-       int numSeqs;    
+       void screenGroupFile(set<string>);
+
        GlobalData* globaldata; 
-       ReadSeqs* readSeqs;
-       SequenceDB* db;
 };
 
 #endif
index 2bffecb30c1e3c7af065ffee6add227e70e6124c..18eddec2469cc6ea34ddc7663620f6ead6021006 100644 (file)
 SeqSummaryCommand::SeqSummaryCommand(){
        try {
                globaldata = GlobalData::getInstance();
-               
-               if(globaldata->getFastaFile() != "")            {       readSeqs = new ReadFasta(globaldata->inputFileName);    }
-               else if(globaldata->getNexusFile() != "")       {       readSeqs = new ReadNexus(globaldata->inputFileName);    }
-               else if(globaldata->getClustalFile() != "") {   readSeqs = new ReadClustal(globaldata->inputFileName);  }
-               else if(globaldata->getPhylipFile() != "")      {       readSeqs = new ReadPhylip(globaldata->inputFileName);   }
-               
-               readSeqs->read();
-               db = readSeqs->getDB();
-               numSeqs = db->size();
+               if(globaldata->getFastaFile() == "")            {       cout << "you need to at least enter a fasta file name" << endl; }
        }
        catch(exception& e) {
                cout << "Standard Error: " << e.what() << " has occurred in the SeqCoordCommand class Function SeqCoordCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
@@ -36,83 +28,69 @@ SeqSummaryCommand::SeqSummaryCommand(){
 
 //***************************************************************************************************************
 
-SeqSummaryCommand::~SeqSummaryCommand(){
-       delete readSeqs;
-}
+SeqSummaryCommand::~SeqSummaryCommand(){       /*      do nothing      */      }
 
 //***************************************************************************************************************
 
 int SeqSummaryCommand::execute(){
        try{
-               
-               ofstream outfile;
-               string summaryFile = getRootName(globaldata->inputFileName) + "summary";
-               openOutputFile(summaryFile, outfile);
 
-               vector<int> startPosition(numSeqs, 0);
-               vector<int> endPosition(numSeqs, 0);
-               vector<int> seqLength(numSeqs, 0);
-               vector<int> ambigBases(numSeqs, 0);
-               vector<int> longHomoPolymer(numSeqs, 0);
+               ifstream inFASTA;
+               openInputFile(globaldata->getFastaFile(), inFASTA);
+               int numSeqs = 0;
+
+               ofstream outSummary;
+               string summaryFile = globaldata->getFastaFile() + ".summary";
+               openOutputFile(summaryFile, outSummary);
                
-               if(db->get(0).getIsAligned() == 1){
-                       outfile << "seqname\tstart\tend\tlength\tambiguities\tlonghomopolymer" << endl;                 
-                       for(int i = 0; i < numSeqs; i++) {
-                               Sequence current = db->get(i);
-                               startPosition[i] = current.getStartPos();
-                               endPosition[i] = current.getEndPos();
-                               seqLength[i] = current.getNumBases();
-                               ambigBases[i] = current.getAmbigBases();
-                               longHomoPolymer[i] = current.getLongHomoPolymer();
-                               outfile << current.getName() << '\t' << startPosition[i] << '\t' << endPosition[i] << '\t' << seqLength[i] << '\t' << ambigBases[i] << '\t' << longHomoPolymer[i] << endl;
-                       }
-               }
-               else{
-                       outfile << "seqname\tlength\tambiguities\tlonghomopolymer" << endl;
-                       for(int i=0;i<numSeqs;i++){
-                               Sequence current = db->get(i);
-                               seqLength[i] = current.getNumBases();
-                               ambigBases[i] = current.getAmbigBases();
-                               longHomoPolymer[i] = current.getLongHomoPolymer();
-                               outfile << current.getName() << '\t' << seqLength[i] << '\t' << ambigBases[i] << '\t' << longHomoPolymer[i] << endl;
-                       }
+               vector<int> startPosition;
+               vector<int> endPosition;
+               vector<int> seqLength;
+               vector<int> ambigBases;
+               vector<int> longHomoPolymer;
+               
+               outSummary << "seqname\tstart\tend\tnbases\tambigs\tpolymer" << endl;                   
+
+               while(!inFASTA.eof()){
+                       Sequence current(inFASTA);
+                       startPosition.push_back(current.getStartPos());
+                       endPosition.push_back(current.getEndPos());
+                       seqLength.push_back(current.getNumBases());
+                       ambigBases.push_back(current.getAmbigBases());
+                       longHomoPolymer.push_back(current.getLongHomoPolymer());
+
+                       outSummary << current.getName() << '\t';
+                       outSummary << current.getStartPos() << '\t' << current.getEndPos() << '\t';
+                       outSummary << current.getNumBases() << '\t' << current.getAmbigBases() << '\t';
+                       outSummary << current.getLongHomoPolymer() << endl;
+                       
+                       numSeqs++;
+                       gobble(inFASTA);
                }
+               inFASTA.close();
                
+               sort(startPosition.begin(), startPosition.end());
+               sort(endPosition.begin(), endPosition.end());
                sort(seqLength.begin(), seqLength.end());
                sort(ambigBases.begin(), ambigBases.end());
                sort(longHomoPolymer.begin(), longHomoPolymer.end());
                
-               int median                      = int(numSeqs * 0.500);
-               int lowestPtile         = int(numSeqs * 0.025);
-               int lowPtile            = int(numSeqs * 0.250);
-               int highPtile           = int(numSeqs * 0.750);
-               int highestPtile        = int(numSeqs * 0.975);
-               int max                         = numSeqs - 1;
+               int ptile0_25   = int(numSeqs * 0.025);
+               int ptile25             = int(numSeqs * 0.250);
+               int ptile50             = int(numSeqs * 0.500);
+               int ptile75             = int(numSeqs * 0.750);
+               int ptile97_5   = int(numSeqs * 0.975);
+               int ptile100    = numSeqs - 1;
                
                cout << endl;
-               if(db->get(0).getIsAligned() == 1){
-                       sort(startPosition.begin(), startPosition.end());
-                       sort(endPosition.begin(), endPosition.end());
-                                       
-                       cout << "\t\tStart\tEnd\tLength\tN's\tPolymer" << endl;
-                       cout << "Minimum:\t" << startPosition[0] << '\t' << endPosition[0] << '\t' << seqLength[0] << '\t' << ambigBases[0] << '\t' << longHomoPolymer[0] << endl;
-                       cout << "2.5%-tile:\t" << startPosition[lowestPtile] << '\t' << endPosition[lowestPtile] << '\t' << seqLength[lowestPtile] << '\t' << ambigBases[lowestPtile] << '\t' << longHomoPolymer[lowestPtile] << endl;
-                       cout << "25%-tile:\t" << startPosition[lowPtile] << '\t' << endPosition[lowPtile] << '\t' << seqLength[lowPtile] << '\t' << ambigBases[lowPtile] << '\t' << longHomoPolymer[lowPtile] << endl;
-                       cout << "Median: \t" << startPosition[median] << '\t' << endPosition[median] << '\t' << seqLength[median] << '\t' << ambigBases[median] << '\t' << longHomoPolymer[median] << endl;
-                       cout << "75%-tile:\t" << startPosition[highPtile] << '\t' << endPosition[highPtile] << '\t' << seqLength[highPtile] << '\t' << ambigBases[highPtile] << '\t' << longHomoPolymer[highPtile] << endl;
-                       cout << "97.5%-tile:\t" << startPosition[highestPtile] << '\t' << endPosition[highestPtile] << '\t' << seqLength[highestPtile] << '\t' << ambigBases[highestPtile] << '\t' << longHomoPolymer[highestPtile] << endl;
-                       cout << "Maximum:\t" << startPosition[max] << '\t' << endPosition[max] << '\t' << seqLength[max] << '\t' << ambigBases[max] << '\t' << longHomoPolymer[max] << endl;
-               }
-               else{
-                       cout << "\t\tLength\tN's\tPolymer" << endl;
-                       cout << "Minimum:\t" << seqLength[0] << '\t' << ambigBases[0] << '\t' << longHomoPolymer[0] << endl;
-                       cout << "2.5%-tile:\t" << seqLength[lowestPtile] << '\t' << ambigBases[lowestPtile] << '\t' << longHomoPolymer[lowestPtile] << endl;
-                       cout << "25%-tile:\t" << seqLength[lowPtile] << '\t' << ambigBases[lowPtile] << '\t' << longHomoPolymer[lowPtile] << endl;
-                       cout << "Median: \t" << seqLength[median] << '\t' << ambigBases[median] << '\t' << longHomoPolymer[median] << endl;
-                       cout << "75%-tile:\t"<< seqLength[highPtile] << '\t' << ambigBases[highPtile] << '\t' << longHomoPolymer[highPtile] << endl;
-                       cout << "97.5%-tile:\t"<< seqLength[highestPtile] << '\t' << ambigBases[highestPtile] << '\t' << longHomoPolymer[highestPtile] << endl;
-                       cout << "Maximum:\t" << seqLength[max] << '\t' << ambigBases[max] << '\t' << longHomoPolymer[max] << endl;
-               }
+               cout << "\t\tStart\tEnd\tNBases\tAmbigs\tPolymer" << endl;
+               cout << "Minimum:\t" << startPosition[0] << '\t' << endPosition[0] << '\t' << seqLength[0] << '\t' << ambigBases[0] << '\t' << longHomoPolymer[0] << endl;
+               cout << "2.5%-tile:\t" << startPosition[ptile0_25] << '\t' << endPosition[ptile0_25] << '\t' << seqLength[ptile0_25] << '\t' << ambigBases[ptile0_25] << '\t' << longHomoPolymer[ptile0_25] << endl;
+               cout << "25%-tile:\t" << startPosition[ptile25] << '\t' << endPosition[ptile25] << '\t' << seqLength[ptile25] << '\t' << ambigBases[ptile25] << '\t' << longHomoPolymer[ptile25] << endl;
+               cout << "Median: \t" << startPosition[ptile50] << '\t' << endPosition[ptile50] << '\t' << seqLength[ptile50] << '\t' << ambigBases[ptile50] << '\t' << longHomoPolymer[ptile50] << endl;
+               cout << "75%-tile:\t" << startPosition[ptile75] << '\t' << endPosition[ptile75] << '\t' << seqLength[ptile75] << '\t' << ambigBases[ptile75] << '\t' << longHomoPolymer[ptile75] << endl;
+               cout << "97.5%-tile:\t" << startPosition[ptile97_5] << '\t' << endPosition[ptile97_5] << '\t' << seqLength[ptile97_5] << '\t' << ambigBases[ptile97_5] << '\t' << longHomoPolymer[ptile97_5] << endl;
+               cout << "Maximum:\t" << startPosition[ptile100] << '\t' << endPosition[ptile100] << '\t' << seqLength[ptile100] << '\t' << ambigBases[ptile100] << '\t' << longHomoPolymer[ptile100] << endl;
                cout << "# of Seqs:\t" << numSeqs << endl;
                
                return 0;
index 02103e77e023fff8e60b1bc71cdd52e47bf09470..9b2be27ef8337e694484927e33298d8ec9841359 100644 (file)
@@ -27,10 +27,8 @@ public:
        int execute();
        
 private:
-       int numSeqs;    
        GlobalData* globaldata; 
-       ReadSeqs* readSeqs;
-       SequenceDB* db;
+
 };
 
 #endif
index e9487c519e0d3e0378266eb537d15e28979546dc..98aa7b01862c494018c710ec607f5132d8ce2db1 100644 (file)
@@ -25,7 +25,6 @@ Sequence::Sequence(string newName, string sequence) {
        name = newName;
        if(sequence.find_first_of('-') != string::npos) {
                setAligned(sequence);
-               isAligned = 1;
        }
        setUnaligned(sequence);
        
@@ -33,6 +32,7 @@ Sequence::Sequence(string newName, string sequence) {
 //********************************************************************************************************************
 
 Sequence::Sequence(ifstream& fastaFile){
+       initialize();
        
        string accession;                               //      provided a file handle to a fasta-formatted sequence file, read in the next
        fastaFile >> accession;                 //      accession number and sequence we find...
@@ -131,7 +131,7 @@ void Sequence::setAligned(string sequence){
                        }
                }
        }
-       
+       isAligned = 1;  
 }
 
 //********************************************************************************************************************
@@ -249,11 +249,13 @@ int Sequence::getStartPos(){
        if(endPos == -1){
                for(int j = 0; j < alignmentLength; j++) {
                        if(aligned[j] != '.'){
-                               startPos = j;
+                               startPos = j + 1;
                                break;
                        }
                }
-       }       
+       }
+       if(isAligned == 0){     startPos = 1;   }
+
        return startPos;
 }
 
@@ -263,11 +265,13 @@ int Sequence::getEndPos(){
        if(endPos == -1){
                for(int j=alignmentLength-1;j>=0;j--){
                        if(aligned[j] != '.'){
-                               endPos = j;
+                               endPos = j + 1;
                                break;
                        }
                }
        }
+       if(isAligned == 0){     endPos = numBases;      }
+       
        return endPos;
 }
 
@@ -278,3 +282,19 @@ bool Sequence::getIsAligned(){
 }
 
 //********************************************************************************************************************
+
+void Sequence::reverseComplement(){
+
+       string temp;
+       for(int i=numBases-1;i>=0;i--){
+               if(unaligned[i] == 'A')         {       temp += 'T';    }
+               else if(unaligned[i] == 'T'){   temp += 'A';    }
+               else if(unaligned[i] == 'G'){   temp += 'C';    }
+               else if(unaligned[i] == 'C'){   temp += 'G';    }
+               else                                            {       temp += 'N';    }
+       }
+       unaligned = temp;
+       
+}
+
+//********************************************************************************************************************
index 929ca781ba9906c736bc83e2a79b556b583651f2..904e25aeb7a5b4d3476df576217d12820f95f358 100644 (file)
@@ -31,6 +31,7 @@ public:
        void setPairwise(string);
        void setAligned(string);
        void setLength();
+       void reverseComplement();
        
        string convert2ints();
        string getName();
index b67ee806116ae4b1cc8b568bb3c0d74bfcfccae3..cd3b05474e18fe4eb49c3173408f5b9f0b2daba7 100644 (file)
@@ -279,7 +279,7 @@ void ValidParameters::initCommandParameters() {
                string summaryseqsArray[] =  {"fasta","phylip","clustal","nexus"};
                commandParameters["summary.seqs"] = addParameters(summaryseqsArray, sizeof(summaryseqsArray)/sizeof(string));
 
-               string screenseqsArray[] =  {"fasta","phylip","clustal","nexus", "start", "end", "maxambig", "maxhomop", "minlength", "maxlength", "name", "group"};
+               string screenseqsArray[] =  {"fasta", "start", "end", "maxambig", "maxhomop", "minlength", "maxlength", "name", "group"};
                commandParameters["screen.seqs"] = addParameters(screenseqsArray, sizeof(screenseqsArray)/sizeof(string));
 
                string vennArray[] =  {"groups","line","label","calc"};