]> git.donarmstrong.com Git - mothur.git/commitdiff
added screen.seqs command - pds
authorpschloss <pschloss>
Wed, 3 Jun 2009 18:21:39 +0000 (18:21 +0000)
committerpschloss <pschloss>
Wed, 3 Jun 2009 18:21:39 +0000 (18:21 +0000)
commandfactory.cpp
fastamap.cpp
globaldata.cpp
globaldata.hpp
mothur.h
screenseqscommand.cpp [new file with mode: 0644]
screenseqscommand.h [new file with mode: 0644]
seqsummarycommand.cpp
validcommands.cpp
validparameter.cpp

index 9b6411855b19a8cee7a9f5a4489d8fc522a689c8..7b3a5c024fa31655a22946f3b5b2a86af6ae5b79 100644 (file)
@@ -46,6 +46,7 @@
 #include "getsabundcommand.h"
 #include "getrabundcommand.h"
 #include "seqsummarycommand.h"
+#include "screenseqscommand.h"
 
 
 /***********************************************************/
@@ -69,7 +70,7 @@ Command* CommandFactory::getCommand(string commandName){
        try {
                delete command;   //delete the old command
 
-                        if(commandName == "read.dist")                         {       command = new ReadDistCommand();                        }
+               if(commandName == "read.dist")                                  {       command = new ReadDistCommand();                        }
                else if(commandName == "read.otu")                              {       command = new ReadOtuCommand();                         }
                else if(commandName == "read.tree")                             {       command = new ReadTreeCommand();                        }
                else if(commandName == "cluster")                               {       command = new ClusterCommand();                         }
@@ -103,6 +104,7 @@ Command* CommandFactory::getCommand(string commandName){
                else if(commandName == "dist.seqs")                             {   command = new DistanceCommand();                    }
                else if(commandName == "align.seqs")                    {   command = new AlignCommand();                               }
                else if(commandName == "summary.seqs")                  {       command = new SeqSummaryCommand();                      }
+               else if(commandName == "screen.seqs")                   {       command = new ScreenSeqsCommand();                      }
                else                                                                                    {       command = new NoCommand();                                      }
 
                return command;
index ebfecedb860bafb6385ca2f75a3752702354498b..0848363be7cf4099cbb50cb58a3d11623eecc78f 100644 (file)
@@ -21,29 +21,39 @@ void FastaMap::readFastaFile(ifstream& in) {
                //read through file
                while (!in.eof()) {
                        in >> line;
-                       if (line != "") {
-                               if (isalnum(line.at(0))) {  //if it's a sequence line
-                                       sequence += line;
-                               }
-                               else{
-                               //input sequence info into map
-                                       seqmap[name] = sequence;  
-                                       it = data.find(sequence);
-                                       if (it == data.end()) {         //it's unique.
-                                               data[sequence].groupname = name;  //group name will be the name of the first duplicate sequence found.
-                                               data[sequence].groupnumber = 1;
-                                               data[sequence].names = name;
-                                       }else { // its a duplicate.
-                                               data[sequence].names += "," + name;
-                                               data[sequence].groupnumber++;
-                                       }
-                                       name = (line.substr(1, (line.npos))); //The line you just read is a new name so rip off '>'
-                                       sequence = "";
+
+                       if (line[0] != '>') {  //if it's a sequence line
+                               sequence += line;
+                       }
+                       else{
+                       //input sequence info into map
+                               seqmap[name] = sequence;  
+
+                               it = data.find(sequence);
+                               if (it == data.end()) {         //it's unique.
+                                       data[sequence].groupname = name;  //group name will be the name of the first duplicate sequence found.
+                                       data[sequence].groupnumber = 1;
+                                       data[sequence].names = name;
+                               }else { // its a duplicate.
+                                       data[sequence].names += "," + name;
+                                       data[sequence].groupnumber++;
                                }
+                               name = (line.substr(1, (line.npos))); //The line you just read is a new name so rip off '>'
+                               sequence = "";
                        }
+                       
                        gobble(in);
                }
-       
+               it = data.find(sequence);
+               if (it == data.end()) {         //it's unique.
+                       data[sequence].groupname = name;  //group name will be the name of the first duplicate sequence found.
+                       data[sequence].groupnumber = 1;
+                       data[sequence].names = name;
+               }else { // its a duplicate.
+                       data[sequence].names += "," + name;
+                       data[sequence].groupnumber++;
+               }
+               
                        
        }
        catch(exception& e) {
index a8e88c52ffbd9d9fe9bb711d4be2cf0c361c9212..ec937eac09d96bc04bdce2f1f0f2fcf9c08bc3ce 100644 (file)
@@ -96,6 +96,12 @@ void GlobalData::parseGlobalData(string commandString, string optionText){
                                if (key == "mismatch")          { mismatch = value;         }
                                if (key == "gapopen")           { gapopen = value;              }
                                if (key == "gapextend" )        { gapextend = value;    }
+                               if (key == "start" )            { startPos = value;     }
+                               if (key == "end" )                      { endPos = value;       }
+                               if (key == "maxambig" )         { maxAmbig = value;     }
+                               if (key == "maxhomop" )         { maxHomoPolymer = value;       }
+                               if (key == "minlength" )        { minLength = value;    }
+                               if (key == "maxlength" )        { maxLength = value;    }
                                
                                if (key == "line") {//stores lines to be used in a vector
                                        lines.clear();
@@ -168,6 +174,13 @@ void GlobalData::parseGlobalData(string commandString, string optionText){
                        if (key == "mismatch")          { mismatch = value;         }
                        if (key == "gapopen")           { gapopen = value;              }
                        if (key == "gapextend" )        { gapextend = value;    }
+                       if (key == "start" )            { startPos = value;     }
+                       if (key == "end" )                      { endPos = value;       }
+                       if (key == "maxambig" )         { maxAmbig = value;     }
+                       if (key == "maxhomop" )         { maxHomoPolymer = value;       }
+                       if (key == "minlength" )        { minLength = value;    }
+                       if (key == "maxlength" )        { maxLength = value;    }
+
 
                        if (key == "line") {//stores lines to be used in a vector
                                lines.clear();
@@ -211,7 +224,6 @@ void GlobalData::parseGlobalData(string commandString, string optionText){
                        splitAtDash(calc, Estimators); 
                }
                if (commandName == "collect.shared") {
-
                        if ((calc == "default") || (calc == "")) { calc = "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan"; }
                        Estimators.clear();
                        splitAtDash(calc, Estimators); 
@@ -323,6 +335,12 @@ string GlobalData::getMatch()                      {       return match;           }
 string GlobalData::getMismatch()               {       return mismatch;        }
 string GlobalData::getGapopen()                        {       return gapopen;         }
 string GlobalData::getGapextend()              {       return gapextend;       }
+string GlobalData::getStartPos()               {       return startPos;        }
+string GlobalData::getEndPos()                 {       return endPos;          }
+string GlobalData::getMaxAmbig()               {       return maxAmbig;        }
+string GlobalData::getMaxHomoPolymer() {       return maxHomoPolymer;  }
+string GlobalData::getMinLength()              {       return minLength;       }
+string GlobalData::getMaxLength()              {       return maxLength;       }
 
 
 void GlobalData::setListFile(string file)              {       listfile = file;        inputFileName = file;                                   }
@@ -401,6 +419,13 @@ void GlobalData::clear() {
        mismatch                =       "-1.0";
        gapopen                 =       "-1.0";
        gapextend               =       "-2.0";
+       startPos                =       "-1";
+       endPos                  =       "-1";
+       maxAmbig                =       "-1";
+       maxHomoPolymer  =       "-1";
+       minLength               =       "-1";
+       maxLength               =       "-1";
+       
 }
 
 //*******************************************************/
@@ -434,7 +459,13 @@ void GlobalData::reset() {
        trump           =   "";         
        hard                    =   "";         
        soft            =   ""; 
-
+       startPos                =       "-1";
+       endPos                  =       "-1";
+       maxAmbig                =       "-1";
+       maxHomoPolymer  =       "-1";
+       minLength               =       "-1";
+       maxLength               =       "-1";
+       
 }
 /*******************************************************/
 
@@ -449,6 +480,8 @@ GlobalData::~GlobalData() {
 
 /*******************************************************/
 void GlobalData::parseTreeFile() {
+       //Why is THIS in GlobalData??? - PDS
+       
        //only takes names from the first tree and assumes that all trees use the same names.
        try {
                string filename = treefile;
index 6d1b3f3c9dd9f2eb73f5abfd23829e63388f7e6e..f97d98b311ff46f21d5479c15795eaa3e69fd892 100644 (file)
@@ -91,7 +91,12 @@ public:
        string getSoft();
        string getHard();
        string getScale();
-
+       string getStartPos();
+       string getEndPos();
+       string getMaxAmbig();
+       string getMaxHomoPolymer();
+       string getMinLength();
+       string getMaxLength();
 
        void setListFile(string);
        void setGroupFile(string file); 
@@ -114,7 +119,7 @@ public:
        
        void parseGlobalData(string, string);
        
-       void parseTreeFile();           //parses through tree file to find names of nodes and number of them
+       void parseTreeFile();   //parses through tree file to find names of nodes and number of them
                                                        //this is required in case user has sequences in the names file that are
                                                        //not included in the tree. 
                                                        //only takes names from the first tree in the tree file and assumes that all trees use the same names.
@@ -122,9 +127,7 @@ public:
                
 private:
 
-       string phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, orderfile, fastafile, nexusfile, clustalfile, treefile, sharedfile, line, label, randomtree, groups;
-       string cutoff, format, precision, method, fileroot, iters, jumble, freq, calc, abund, step, form, sorted, trump, soft, hard, scale, countends, processors, candidatefile, search, ksize, align, match, size;
-       string mismatch, gapopen, gapextend;
+       string phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, orderfile, fastafile, nexusfile, clustalfile, treefile, sharedfile, line, label, randomtree, groups, cutoff, format, precision, method, fileroot, iters, jumble, freq, calc, abund, step, form, sorted, trump, soft, hard, scale, countends, processors, candidatefile, search, ksize, align, match, size, mismatch, gapopen, gapextend, minLength, maxLength, startPos, endPos, maxAmbig, maxHomoPolymer;
 
 
        static GlobalData* _uniqueInstance;
index 07bb971347d974066ab5449614172c5e68d4c2a6..670a32c8326a76da2cf894cac0d7ad10a9a10825 100644 (file)
--- a/mothur.h
+++ b/mothur.h
@@ -249,7 +249,7 @@ inline string getPathName(string longName){
  
        string rootPathName = longName;
        
-       if(longName.find_last_of("/") != longName.npos){
+       if(longName.find_last_of('/') != longName.npos){
                int pos = longName.find_last_of('/')+1;
                rootPathName = longName.substr(0, pos);
        }
@@ -259,6 +259,20 @@ inline string getPathName(string longName){
 
 /***********************************************************************/
 
+inline string getExtension(string longName){
+       
+       string extension = longName;
+       
+       if(longName.find_last_of('.') != longName.npos){
+               int pos = longName.find_last_of('.');
+               extension = longName.substr(pos, longName.length());
+       }
+       
+       return extension;
+}
+
+/***********************************************************************/
+
 inline int openInputFile(string fileName, ifstream& fileHandle){
 
        fileHandle.open(fileName.c_str());
diff --git a/screenseqscommand.cpp b/screenseqscommand.cpp
new file mode 100644 (file)
index 0000000..94094c7
--- /dev/null
@@ -0,0 +1,176 @@
+/*
+ *  screenseqscommand.cpp
+ *  Mothur
+ *
+ *  Created by Pat Schloss on 6/3/09.
+ *  Copyright 2009 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "screenseqscommand.h"
+
+//***************************************************************************************************************
+
+ScreenSeqsCommand::ScreenSeqsCommand(){
+       try {
+               globaldata = GlobalData::getInstance();
+               
+               if(globaldata->getFastaFile() != "")            {       readSeqs = new ReadFasta(globaldata->inputFileName);    }
+               else if(globaldata->getNexusFile() != "")       {       readSeqs = new ReadNexus(globaldata->inputFileName);    }
+               else if(globaldata->getClustalFile() != "") {   readSeqs = new ReadClustal(globaldata->inputFileName);  }
+               else if(globaldata->getPhylipFile() != "")      {       readSeqs = new ReadPhylip(globaldata->inputFileName);   }
+               
+               readSeqs->read();
+               db = readSeqs->getDB();
+               numSeqs = db->size();
+       }
+       catch(exception& e) {
+               cout << "Standard Error: " << e.what() << " has occurred in the SeqCoordCommand class Function SeqCoordCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+               exit(1);
+       }
+       catch(...) {
+               cout << "An unknown error has occurred in the SeqCoordCommand class function SeqCoordCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+               exit(1);
+       }       
+}
+
+//***************************************************************************************************************
+
+ScreenSeqsCommand::~ScreenSeqsCommand(){
+       delete readSeqs;
+}
+
+//***************************************************************************************************************
+
+int ScreenSeqsCommand::execute(){
+       try{
+               int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength;
+               convert(globaldata->getStartPos(), startPos);
+               convert(globaldata->getEndPos(), endPos);
+               convert(globaldata->getMaxAmbig(), maxAmbig);
+               convert(globaldata->getMaxHomoPolymer(), maxHomoP);
+               convert(globaldata->getMinLength(), minLength);
+               convert(globaldata->getMaxLength(), maxLength);
+               
+               set<string> badSeqNames;
+               
+               string goodSeqFile = getRootName(globaldata->inputFileName) + "good" + getExtension(globaldata->inputFileName);
+               string badSeqFile = getRootName(globaldata->inputFileName) + "bad" + getExtension(globaldata->inputFileName);
+               
+               ofstream goodSeqOut;    openOutputFile(goodSeqFile, goodSeqOut);
+               ofstream badSeqOut;             openOutputFile(badSeqFile, badSeqOut);          
+
+               for(int i=0;i<numSeqs;i++){
+                       Sequence currSeq = db->get(i);
+                       bool goodSeq = 1;               //      innocent until proven guilty
+                       if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos())          {       goodSeq = 0;    }
+                       if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos())                        {       goodSeq = 0;    }
+                       if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases())        {       goodSeq = 0;    }
+                       if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()){  goodSeq = 0;    }
+                       if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases())        {       goodSeq = 0;    }
+                       if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases())        {       goodSeq = 0;    }
+                       
+                       if(goodSeq == 1){
+                               currSeq.printSequence(goodSeqOut);      
+                       }
+                       else{
+                               currSeq.printSequence(badSeqOut);       
+                               badSeqNames.insert(currSeq.getName());
+                       }
+               }
+               
+               if(globaldata->getNameFile() != ""){
+                       screenNameGroupFile(badSeqNames);
+                       
+               }
+               
+               return 0;
+       }
+       catch(exception& e) {
+               cout << "Standard Error: " << e.what() << " has occurred in the FilterSeqsCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+               exit(1);
+       }
+       catch(...) {
+               cout << "An unknown error has occurred in the FilterSeqsCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+               exit(1);
+       }
+       
+}
+
+//***************************************************************************************************************
+
+void ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
+
+       ifstream inputNames;
+       openInputFile(globaldata->getNameFile(), inputNames);
+       set<string> badSeqGroups;
+       string seqName, seqList, group;
+       set<string>::iterator it;
+
+       string goodNameFile = getRootName(globaldata->getNameFile()) + "good" + getExtension(globaldata->getNameFile());
+       string badNameFile = getRootName(globaldata->getNameFile()) + "bad" + getExtension(globaldata->getNameFile());
+       
+       ofstream goodNameOut;   openOutputFile(goodNameFile, goodNameOut);
+       ofstream badNameOut;    openOutputFile(badNameFile, badNameOut);                
+       
+       while(!inputNames.eof()){
+               inputNames >> seqName >> seqList;
+               it = badSeqNames.find(seqName);
+               
+               if(it != badSeqNames.end()){
+                       badSeqNames.erase(it);
+                       badNameOut << seqName << '\t' << seqList << endl;
+                       if(globaldata->getNameFile() != ""){
+                               int start = 0;
+                               for(int i=0;i<seqList.length();i++){
+                                       if(seqList[i] == ','){
+                                               badSeqGroups.insert(seqList.substr(start,i-start));
+                                               start = i+1;
+                                       }                                       
+                               }
+                               badSeqGroups.insert(seqList.substr(start,seqList.length()-start));
+                       }
+               }
+               else{
+                       goodNameOut << seqName << '\t' << seqList << endl;
+               }
+               gobble(inputNames);
+       }
+       inputNames.close();
+       goodNameOut.close();
+       badNameOut.close();
+       
+       if(globaldata->getGroupFile() != ""){
+               
+               ifstream inputGroups;
+               openInputFile(globaldata->getGroupFile(), inputGroups);
+
+               string goodGroupFile = getRootName(globaldata->getGroupFile()) + "good" + getExtension(globaldata->getGroupFile());
+               string badGroupFile = getRootName(globaldata->getGroupFile()) + "bad" + getExtension(globaldata->getGroupFile());
+               
+               ofstream goodGroupOut;  openOutputFile(goodGroupFile, goodGroupOut);
+               ofstream badGroupOut;   openOutputFile(badGroupFile, badGroupOut);              
+               
+               while(!inputGroups.eof()){
+                       inputGroups >> seqName >> group;
+
+                       it = badSeqGroups.find(seqName);
+                       
+                       if(it != badSeqGroups.end()){
+                               badSeqGroups.erase(it);
+                               badGroupOut << seqName << '\t' << group << endl;
+                       }
+                       else{
+                               goodGroupOut << seqName << '\t' << group << endl;
+                       }
+                       gobble(inputGroups);
+               }
+               inputGroups.close();
+               goodGroupOut.close();
+               badGroupOut.close();
+       }
+}
+
+//***************************************************************************************************************
+
+
diff --git a/screenseqscommand.h b/screenseqscommand.h
new file mode 100644 (file)
index 0000000..f88fe2c
--- /dev/null
@@ -0,0 +1,37 @@
+#ifndef SCREENSEQSCOMMAND_H
+#define SCREENSEQSCOMMAND_H
+
+/*
+ *  screenseqscommand.h
+ *  Mothur
+ *
+ *  Created by Pat Schloss on 6/3/09.
+ *  Copyright 2009 Patrick D. Schloss. All rights reserved.
+ *
+ */
+#include "mothur.h"
+#include "command.hpp"
+#include "globaldata.hpp"
+#include "readfasta.h"
+#include "readnexus.h"
+#include "readclustal.h"
+#include "readseqsphylip.h"
+#include <set>
+
+using namespace std;
+
+class ScreenSeqsCommand : public Command {
+       
+public:
+       ScreenSeqsCommand();
+       ~ScreenSeqsCommand();
+       int execute();
+private:
+       void screenNameGroupFile(set<string>);
+       int numSeqs;    
+       GlobalData* globaldata; 
+       ReadSeqs* readSeqs;
+       SequenceDB* db;
+};
+
+#endif
index e71c2aff6a4d8c60a3cf8f0648ecfdd2804d8544..2bffecb30c1e3c7af065ffee6add227e70e6124c 100644 (file)
@@ -37,6 +37,7 @@ SeqSummaryCommand::SeqSummaryCommand(){
 //***************************************************************************************************************
 
 SeqSummaryCommand::~SeqSummaryCommand(){
+       delete readSeqs;
 }
 
 //***************************************************************************************************************
index 6439e9bfca12a19751b44cd0a570cdd266d81105..a4b17838a1422324dd6cca1420d53991b023a5bb 100644 (file)
@@ -47,6 +47,7 @@ ValidCommands::ValidCommands() {
                commands["filter.seqs"]                 = "filter.seqs";
                commands["align.seqs"]                  = "align.seqs";
                commands["summary.seqs"]                = "summary.seqs";
+               commands["screen.seqs"]                 = "screen.seqs";
                commands["quit"]                                = "quit"; 
 
                                
index 547d4dcd22c944443552ccc11edd7bea4b6e5124..b67ee806116ae4b1cc8b568bb3c0d74bfcfccae3 100644 (file)
@@ -279,6 +279,9 @@ void ValidParameters::initCommandParameters() {
                string summaryseqsArray[] =  {"fasta","phylip","clustal","nexus"};
                commandParameters["summary.seqs"] = addParameters(summaryseqsArray, sizeof(summaryseqsArray)/sizeof(string));
 
+               string screenseqsArray[] =  {"fasta","phylip","clustal","nexus", "start", "end", "maxambig", "maxhomop", "minlength", "maxlength", "name", "group"};
+               commandParameters["screen.seqs"] = addParameters(screenseqsArray, sizeof(screenseqsArray)/sizeof(string));
+
                string vennArray[] =  {"groups","line","label","calc"};
                commandParameters["venn"] = addParameters(vennArray, sizeof(vennArray)/sizeof(string));