From dc874a77f24b3808775e2ce7e39595c647a07f82 Mon Sep 17 00:00:00 2001 From: pschloss Date: Wed, 3 Jun 2009 18:21:39 +0000 Subject: [PATCH] added screen.seqs command - pds --- commandfactory.cpp | 4 +- fastamap.cpp | 48 +++++++----- globaldata.cpp | 37 ++++++++- globaldata.hpp | 13 ++-- mothur.h | 16 +++- screenseqscommand.cpp | 176 ++++++++++++++++++++++++++++++++++++++++++ screenseqscommand.h | 37 +++++++++ seqsummarycommand.cpp | 1 + validcommands.cpp | 1 + validparameter.cpp | 3 + 10 files changed, 308 insertions(+), 28 deletions(-) create mode 100644 screenseqscommand.cpp create mode 100644 screenseqscommand.h diff --git a/commandfactory.cpp b/commandfactory.cpp index 9b64118..7b3a5c0 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -46,6 +46,7 @@ #include "getsabundcommand.h" #include "getrabundcommand.h" #include "seqsummarycommand.h" +#include "screenseqscommand.h" /***********************************************************/ @@ -69,7 +70,7 @@ Command* CommandFactory::getCommand(string commandName){ try { delete command; //delete the old command - if(commandName == "read.dist") { command = new ReadDistCommand(); } + if(commandName == "read.dist") { command = new ReadDistCommand(); } else if(commandName == "read.otu") { command = new ReadOtuCommand(); } else if(commandName == "read.tree") { command = new ReadTreeCommand(); } else if(commandName == "cluster") { command = new ClusterCommand(); } @@ -103,6 +104,7 @@ Command* CommandFactory::getCommand(string commandName){ else if(commandName == "dist.seqs") { command = new DistanceCommand(); } else if(commandName == "align.seqs") { command = new AlignCommand(); } else if(commandName == "summary.seqs") { command = new SeqSummaryCommand(); } + else if(commandName == "screen.seqs") { command = new ScreenSeqsCommand(); } else { command = new NoCommand(); } return command; diff --git a/fastamap.cpp b/fastamap.cpp index ebfeced..0848363 100644 --- a/fastamap.cpp +++ b/fastamap.cpp @@ -21,29 +21,39 @@ void FastaMap::readFastaFile(ifstream& in) { //read through file while (!in.eof()) { in >> line; - if (line != "") { - if (isalnum(line.at(0))) { //if it's a sequence line - sequence += line; - } - else{ - //input sequence info into map - seqmap[name] = sequence; - it = data.find(sequence); - if (it == data.end()) { //it's unique. - data[sequence].groupname = name; //group name will be the name of the first duplicate sequence found. - data[sequence].groupnumber = 1; - data[sequence].names = name; - }else { // its a duplicate. - data[sequence].names += "," + name; - data[sequence].groupnumber++; - } - name = (line.substr(1, (line.npos))); //The line you just read is a new name so rip off '>' - sequence = ""; + + if (line[0] != '>') { //if it's a sequence line + sequence += line; + } + else{ + //input sequence info into map + seqmap[name] = sequence; + + it = data.find(sequence); + if (it == data.end()) { //it's unique. + data[sequence].groupname = name; //group name will be the name of the first duplicate sequence found. + data[sequence].groupnumber = 1; + data[sequence].names = name; + }else { // its a duplicate. + data[sequence].names += "," + name; + data[sequence].groupnumber++; } + name = (line.substr(1, (line.npos))); //The line you just read is a new name so rip off '>' + sequence = ""; } + gobble(in); } - + it = data.find(sequence); + if (it == data.end()) { //it's unique. + data[sequence].groupname = name; //group name will be the name of the first duplicate sequence found. + data[sequence].groupnumber = 1; + data[sequence].names = name; + }else { // its a duplicate. + data[sequence].names += "," + name; + data[sequence].groupnumber++; + } + } catch(exception& e) { diff --git a/globaldata.cpp b/globaldata.cpp index a8e88c5..ec937ea 100644 --- a/globaldata.cpp +++ b/globaldata.cpp @@ -96,6 +96,12 @@ void GlobalData::parseGlobalData(string commandString, string optionText){ if (key == "mismatch") { mismatch = value; } if (key == "gapopen") { gapopen = value; } if (key == "gapextend" ) { gapextend = value; } + if (key == "start" ) { startPos = value; } + if (key == "end" ) { endPos = value; } + if (key == "maxambig" ) { maxAmbig = value; } + if (key == "maxhomop" ) { maxHomoPolymer = value; } + if (key == "minlength" ) { minLength = value; } + if (key == "maxlength" ) { maxLength = value; } if (key == "line") {//stores lines to be used in a vector lines.clear(); @@ -168,6 +174,13 @@ void GlobalData::parseGlobalData(string commandString, string optionText){ if (key == "mismatch") { mismatch = value; } if (key == "gapopen") { gapopen = value; } if (key == "gapextend" ) { gapextend = value; } + if (key == "start" ) { startPos = value; } + if (key == "end" ) { endPos = value; } + if (key == "maxambig" ) { maxAmbig = value; } + if (key == "maxhomop" ) { maxHomoPolymer = value; } + if (key == "minlength" ) { minLength = value; } + if (key == "maxlength" ) { maxLength = value; } + if (key == "line") {//stores lines to be used in a vector lines.clear(); @@ -211,7 +224,6 @@ void GlobalData::parseGlobalData(string commandString, string optionText){ splitAtDash(calc, Estimators); } if (commandName == "collect.shared") { - if ((calc == "default") || (calc == "")) { calc = "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan"; } Estimators.clear(); splitAtDash(calc, Estimators); @@ -323,6 +335,12 @@ string GlobalData::getMatch() { return match; } string GlobalData::getMismatch() { return mismatch; } string GlobalData::getGapopen() { return gapopen; } string GlobalData::getGapextend() { return gapextend; } +string GlobalData::getStartPos() { return startPos; } +string GlobalData::getEndPos() { return endPos; } +string GlobalData::getMaxAmbig() { return maxAmbig; } +string GlobalData::getMaxHomoPolymer() { return maxHomoPolymer; } +string GlobalData::getMinLength() { return minLength; } +string GlobalData::getMaxLength() { return maxLength; } void GlobalData::setListFile(string file) { listfile = file; inputFileName = file; } @@ -401,6 +419,13 @@ void GlobalData::clear() { mismatch = "-1.0"; gapopen = "-1.0"; gapextend = "-2.0"; + startPos = "-1"; + endPos = "-1"; + maxAmbig = "-1"; + maxHomoPolymer = "-1"; + minLength = "-1"; + maxLength = "-1"; + } //*******************************************************/ @@ -434,7 +459,13 @@ void GlobalData::reset() { trump = ""; hard = ""; soft = ""; - + startPos = "-1"; + endPos = "-1"; + maxAmbig = "-1"; + maxHomoPolymer = "-1"; + minLength = "-1"; + maxLength = "-1"; + } /*******************************************************/ @@ -449,6 +480,8 @@ GlobalData::~GlobalData() { /*******************************************************/ void GlobalData::parseTreeFile() { + //Why is THIS in GlobalData??? - PDS + //only takes names from the first tree and assumes that all trees use the same names. try { string filename = treefile; diff --git a/globaldata.hpp b/globaldata.hpp index 6d1b3f3..f97d98b 100644 --- a/globaldata.hpp +++ b/globaldata.hpp @@ -91,7 +91,12 @@ public: string getSoft(); string getHard(); string getScale(); - + string getStartPos(); + string getEndPos(); + string getMaxAmbig(); + string getMaxHomoPolymer(); + string getMinLength(); + string getMaxLength(); void setListFile(string); void setGroupFile(string file); @@ -114,7 +119,7 @@ public: void parseGlobalData(string, string); - void parseTreeFile(); //parses through tree file to find names of nodes and number of them + void parseTreeFile(); //parses through tree file to find names of nodes and number of them //this is required in case user has sequences in the names file that are //not included in the tree. //only takes names from the first tree in the tree file and assumes that all trees use the same names. @@ -122,9 +127,7 @@ public: private: - string phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, orderfile, fastafile, nexusfile, clustalfile, treefile, sharedfile, line, label, randomtree, groups; - string cutoff, format, precision, method, fileroot, iters, jumble, freq, calc, abund, step, form, sorted, trump, soft, hard, scale, countends, processors, candidatefile, search, ksize, align, match, size; - string mismatch, gapopen, gapextend; + string phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, orderfile, fastafile, nexusfile, clustalfile, treefile, sharedfile, line, label, randomtree, groups, cutoff, format, precision, method, fileroot, iters, jumble, freq, calc, abund, step, form, sorted, trump, soft, hard, scale, countends, processors, candidatefile, search, ksize, align, match, size, mismatch, gapopen, gapextend, minLength, maxLength, startPos, endPos, maxAmbig, maxHomoPolymer; static GlobalData* _uniqueInstance; diff --git a/mothur.h b/mothur.h index 07bb971..670a32c 100644 --- a/mothur.h +++ b/mothur.h @@ -249,7 +249,7 @@ inline string getPathName(string longName){ string rootPathName = longName; - if(longName.find_last_of("/") != longName.npos){ + if(longName.find_last_of('/') != longName.npos){ int pos = longName.find_last_of('/')+1; rootPathName = longName.substr(0, pos); } @@ -259,6 +259,20 @@ inline string getPathName(string longName){ /***********************************************************************/ +inline string getExtension(string longName){ + + string extension = longName; + + if(longName.find_last_of('.') != longName.npos){ + int pos = longName.find_last_of('.'); + extension = longName.substr(pos, longName.length()); + } + + return extension; +} + +/***********************************************************************/ + inline int openInputFile(string fileName, ifstream& fileHandle){ fileHandle.open(fileName.c_str()); diff --git a/screenseqscommand.cpp b/screenseqscommand.cpp new file mode 100644 index 0000000..94094c7 --- /dev/null +++ b/screenseqscommand.cpp @@ -0,0 +1,176 @@ +/* + * screenseqscommand.cpp + * Mothur + * + * Created by Pat Schloss on 6/3/09. + * Copyright 2009 Patrick D. Schloss. All rights reserved. + * + */ + +#include "screenseqscommand.h" + +//*************************************************************************************************************** + +ScreenSeqsCommand::ScreenSeqsCommand(){ + try { + globaldata = GlobalData::getInstance(); + + if(globaldata->getFastaFile() != "") { readSeqs = new ReadFasta(globaldata->inputFileName); } + else if(globaldata->getNexusFile() != "") { readSeqs = new ReadNexus(globaldata->inputFileName); } + else if(globaldata->getClustalFile() != "") { readSeqs = new ReadClustal(globaldata->inputFileName); } + else if(globaldata->getPhylipFile() != "") { readSeqs = new ReadPhylip(globaldata->inputFileName); } + + readSeqs->read(); + db = readSeqs->getDB(); + numSeqs = db->size(); + } + catch(exception& e) { + cout << "Standard Error: " << e.what() << " has occurred in the SeqCoordCommand class Function SeqCoordCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + catch(...) { + cout << "An unknown error has occurred in the SeqCoordCommand class function SeqCoordCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } +} + +//*************************************************************************************************************** + +ScreenSeqsCommand::~ScreenSeqsCommand(){ + delete readSeqs; +} + +//*************************************************************************************************************** + +int ScreenSeqsCommand::execute(){ + try{ + int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength; + convert(globaldata->getStartPos(), startPos); + convert(globaldata->getEndPos(), endPos); + convert(globaldata->getMaxAmbig(), maxAmbig); + convert(globaldata->getMaxHomoPolymer(), maxHomoP); + convert(globaldata->getMinLength(), minLength); + convert(globaldata->getMaxLength(), maxLength); + + set badSeqNames; + + string goodSeqFile = getRootName(globaldata->inputFileName) + "good" + getExtension(globaldata->inputFileName); + string badSeqFile = getRootName(globaldata->inputFileName) + "bad" + getExtension(globaldata->inputFileName); + + ofstream goodSeqOut; openOutputFile(goodSeqFile, goodSeqOut); + ofstream badSeqOut; openOutputFile(badSeqFile, badSeqOut); + + for(int i=0;iget(i); + bool goodSeq = 1; // innocent until proven guilty + if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; } + if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; } + if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; } + if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()){ goodSeq = 0; } + if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; } + if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; } + + if(goodSeq == 1){ + currSeq.printSequence(goodSeqOut); + } + else{ + currSeq.printSequence(badSeqOut); + badSeqNames.insert(currSeq.getName()); + } + } + + if(globaldata->getNameFile() != ""){ + screenNameGroupFile(badSeqNames); + + } + + return 0; + } + catch(exception& e) { + cout << "Standard Error: " << e.what() << " has occurred in the FilterSeqsCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + catch(...) { + cout << "An unknown error has occurred in the FilterSeqsCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + +} + +//*************************************************************************************************************** + +void ScreenSeqsCommand::screenNameGroupFile(set badSeqNames){ + + ifstream inputNames; + openInputFile(globaldata->getNameFile(), inputNames); + set badSeqGroups; + string seqName, seqList, group; + set::iterator it; + + string goodNameFile = getRootName(globaldata->getNameFile()) + "good" + getExtension(globaldata->getNameFile()); + string badNameFile = getRootName(globaldata->getNameFile()) + "bad" + getExtension(globaldata->getNameFile()); + + ofstream goodNameOut; openOutputFile(goodNameFile, goodNameOut); + ofstream badNameOut; openOutputFile(badNameFile, badNameOut); + + while(!inputNames.eof()){ + inputNames >> seqName >> seqList; + it = badSeqNames.find(seqName); + + if(it != badSeqNames.end()){ + badSeqNames.erase(it); + badNameOut << seqName << '\t' << seqList << endl; + if(globaldata->getNameFile() != ""){ + int start = 0; + for(int i=0;igetGroupFile() != ""){ + + ifstream inputGroups; + openInputFile(globaldata->getGroupFile(), inputGroups); + + string goodGroupFile = getRootName(globaldata->getGroupFile()) + "good" + getExtension(globaldata->getGroupFile()); + string badGroupFile = getRootName(globaldata->getGroupFile()) + "bad" + getExtension(globaldata->getGroupFile()); + + ofstream goodGroupOut; openOutputFile(goodGroupFile, goodGroupOut); + ofstream badGroupOut; openOutputFile(badGroupFile, badGroupOut); + + while(!inputGroups.eof()){ + inputGroups >> seqName >> group; + + it = badSeqGroups.find(seqName); + + if(it != badSeqGroups.end()){ + badSeqGroups.erase(it); + badGroupOut << seqName << '\t' << group << endl; + } + else{ + goodGroupOut << seqName << '\t' << group << endl; + } + gobble(inputGroups); + } + inputGroups.close(); + goodGroupOut.close(); + badGroupOut.close(); + } +} + +//*************************************************************************************************************** + + diff --git a/screenseqscommand.h b/screenseqscommand.h new file mode 100644 index 0000000..f88fe2c --- /dev/null +++ b/screenseqscommand.h @@ -0,0 +1,37 @@ +#ifndef SCREENSEQSCOMMAND_H +#define SCREENSEQSCOMMAND_H + +/* + * screenseqscommand.h + * Mothur + * + * Created by Pat Schloss on 6/3/09. + * Copyright 2009 Patrick D. Schloss. All rights reserved. + * + */ +#include "mothur.h" +#include "command.hpp" +#include "globaldata.hpp" +#include "readfasta.h" +#include "readnexus.h" +#include "readclustal.h" +#include "readseqsphylip.h" +#include + +using namespace std; + +class ScreenSeqsCommand : public Command { + +public: + ScreenSeqsCommand(); + ~ScreenSeqsCommand(); + int execute(); +private: + void screenNameGroupFile(set); + int numSeqs; + GlobalData* globaldata; + ReadSeqs* readSeqs; + SequenceDB* db; +}; + +#endif diff --git a/seqsummarycommand.cpp b/seqsummarycommand.cpp index e71c2af..2bffecb 100644 --- a/seqsummarycommand.cpp +++ b/seqsummarycommand.cpp @@ -37,6 +37,7 @@ SeqSummaryCommand::SeqSummaryCommand(){ //*************************************************************************************************************** SeqSummaryCommand::~SeqSummaryCommand(){ + delete readSeqs; } //*************************************************************************************************************** diff --git a/validcommands.cpp b/validcommands.cpp index 6439e9b..a4b1783 100644 --- a/validcommands.cpp +++ b/validcommands.cpp @@ -47,6 +47,7 @@ ValidCommands::ValidCommands() { commands["filter.seqs"] = "filter.seqs"; commands["align.seqs"] = "align.seqs"; commands["summary.seqs"] = "summary.seqs"; + commands["screen.seqs"] = "screen.seqs"; commands["quit"] = "quit"; diff --git a/validparameter.cpp b/validparameter.cpp index 547d4dc..b67ee80 100644 --- a/validparameter.cpp +++ b/validparameter.cpp @@ -279,6 +279,9 @@ void ValidParameters::initCommandParameters() { string summaryseqsArray[] = {"fasta","phylip","clustal","nexus"}; commandParameters["summary.seqs"] = addParameters(summaryseqsArray, sizeof(summaryseqsArray)/sizeof(string)); + string screenseqsArray[] = {"fasta","phylip","clustal","nexus", "start", "end", "maxambig", "maxhomop", "minlength", "maxlength", "name", "group"}; + commandParameters["screen.seqs"] = addParameters(screenseqsArray, sizeof(screenseqsArray)/sizeof(string)); + string vennArray[] = {"groups","line","label","calc"}; commandParameters["venn"] = addParameters(vennArray, sizeof(vennArray)/sizeof(string)); -- 2.39.2