#include "getsabundcommand.h"
#include "getrabundcommand.h"
#include "seqsummarycommand.h"
+#include "screenseqscommand.h"
/***********************************************************/
try {
delete command; //delete the old command
- if(commandName == "read.dist") { command = new ReadDistCommand(); }
+ if(commandName == "read.dist") { command = new ReadDistCommand(); }
else if(commandName == "read.otu") { command = new ReadOtuCommand(); }
else if(commandName == "read.tree") { command = new ReadTreeCommand(); }
else if(commandName == "cluster") { command = new ClusterCommand(); }
else if(commandName == "dist.seqs") { command = new DistanceCommand(); }
else if(commandName == "align.seqs") { command = new AlignCommand(); }
else if(commandName == "summary.seqs") { command = new SeqSummaryCommand(); }
+ else if(commandName == "screen.seqs") { command = new ScreenSeqsCommand(); }
else { command = new NoCommand(); }
return command;
//read through file
while (!in.eof()) {
in >> line;
- if (line != "") {
- if (isalnum(line.at(0))) { //if it's a sequence line
- sequence += line;
- }
- else{
- //input sequence info into map
- seqmap[name] = sequence;
- it = data.find(sequence);
- if (it == data.end()) { //it's unique.
- data[sequence].groupname = name; //group name will be the name of the first duplicate sequence found.
- data[sequence].groupnumber = 1;
- data[sequence].names = name;
- }else { // its a duplicate.
- data[sequence].names += "," + name;
- data[sequence].groupnumber++;
- }
- name = (line.substr(1, (line.npos))); //The line you just read is a new name so rip off '>'
- sequence = "";
+
+ if (line[0] != '>') { //if it's a sequence line
+ sequence += line;
+ }
+ else{
+ //input sequence info into map
+ seqmap[name] = sequence;
+
+ it = data.find(sequence);
+ if (it == data.end()) { //it's unique.
+ data[sequence].groupname = name; //group name will be the name of the first duplicate sequence found.
+ data[sequence].groupnumber = 1;
+ data[sequence].names = name;
+ }else { // its a duplicate.
+ data[sequence].names += "," + name;
+ data[sequence].groupnumber++;
}
+ name = (line.substr(1, (line.npos))); //The line you just read is a new name so rip off '>'
+ sequence = "";
}
+
gobble(in);
}
-
+ it = data.find(sequence);
+ if (it == data.end()) { //it's unique.
+ data[sequence].groupname = name; //group name will be the name of the first duplicate sequence found.
+ data[sequence].groupnumber = 1;
+ data[sequence].names = name;
+ }else { // its a duplicate.
+ data[sequence].names += "," + name;
+ data[sequence].groupnumber++;
+ }
+
}
catch(exception& e) {
if (key == "mismatch") { mismatch = value; }
if (key == "gapopen") { gapopen = value; }
if (key == "gapextend" ) { gapextend = value; }
+ if (key == "start" ) { startPos = value; }
+ if (key == "end" ) { endPos = value; }
+ if (key == "maxambig" ) { maxAmbig = value; }
+ if (key == "maxhomop" ) { maxHomoPolymer = value; }
+ if (key == "minlength" ) { minLength = value; }
+ if (key == "maxlength" ) { maxLength = value; }
if (key == "line") {//stores lines to be used in a vector
lines.clear();
if (key == "mismatch") { mismatch = value; }
if (key == "gapopen") { gapopen = value; }
if (key == "gapextend" ) { gapextend = value; }
+ if (key == "start" ) { startPos = value; }
+ if (key == "end" ) { endPos = value; }
+ if (key == "maxambig" ) { maxAmbig = value; }
+ if (key == "maxhomop" ) { maxHomoPolymer = value; }
+ if (key == "minlength" ) { minLength = value; }
+ if (key == "maxlength" ) { maxLength = value; }
+
if (key == "line") {//stores lines to be used in a vector
lines.clear();
splitAtDash(calc, Estimators);
}
if (commandName == "collect.shared") {
-
if ((calc == "default") || (calc == "")) { calc = "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan"; }
Estimators.clear();
splitAtDash(calc, Estimators);
string GlobalData::getMismatch() { return mismatch; }
string GlobalData::getGapopen() { return gapopen; }
string GlobalData::getGapextend() { return gapextend; }
+string GlobalData::getStartPos() { return startPos; }
+string GlobalData::getEndPos() { return endPos; }
+string GlobalData::getMaxAmbig() { return maxAmbig; }
+string GlobalData::getMaxHomoPolymer() { return maxHomoPolymer; }
+string GlobalData::getMinLength() { return minLength; }
+string GlobalData::getMaxLength() { return maxLength; }
void GlobalData::setListFile(string file) { listfile = file; inputFileName = file; }
mismatch = "-1.0";
gapopen = "-1.0";
gapextend = "-2.0";
+ startPos = "-1";
+ endPos = "-1";
+ maxAmbig = "-1";
+ maxHomoPolymer = "-1";
+ minLength = "-1";
+ maxLength = "-1";
+
}
//*******************************************************/
trump = "";
hard = "";
soft = "";
-
+ startPos = "-1";
+ endPos = "-1";
+ maxAmbig = "-1";
+ maxHomoPolymer = "-1";
+ minLength = "-1";
+ maxLength = "-1";
+
}
/*******************************************************/
/*******************************************************/
void GlobalData::parseTreeFile() {
+ //Why is THIS in GlobalData??? - PDS
+
//only takes names from the first tree and assumes that all trees use the same names.
try {
string filename = treefile;
string getSoft();
string getHard();
string getScale();
-
+ string getStartPos();
+ string getEndPos();
+ string getMaxAmbig();
+ string getMaxHomoPolymer();
+ string getMinLength();
+ string getMaxLength();
void setListFile(string);
void setGroupFile(string file);
void parseGlobalData(string, string);
- void parseTreeFile(); //parses through tree file to find names of nodes and number of them
+ void parseTreeFile(); //parses through tree file to find names of nodes and number of them
//this is required in case user has sequences in the names file that are
//not included in the tree.
//only takes names from the first tree in the tree file and assumes that all trees use the same names.
private:
- string phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, orderfile, fastafile, nexusfile, clustalfile, treefile, sharedfile, line, label, randomtree, groups;
- string cutoff, format, precision, method, fileroot, iters, jumble, freq, calc, abund, step, form, sorted, trump, soft, hard, scale, countends, processors, candidatefile, search, ksize, align, match, size;
- string mismatch, gapopen, gapextend;
+ string phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, orderfile, fastafile, nexusfile, clustalfile, treefile, sharedfile, line, label, randomtree, groups, cutoff, format, precision, method, fileroot, iters, jumble, freq, calc, abund, step, form, sorted, trump, soft, hard, scale, countends, processors, candidatefile, search, ksize, align, match, size, mismatch, gapopen, gapextend, minLength, maxLength, startPos, endPos, maxAmbig, maxHomoPolymer;
static GlobalData* _uniqueInstance;
string rootPathName = longName;
- if(longName.find_last_of("/") != longName.npos){
+ if(longName.find_last_of('/') != longName.npos){
int pos = longName.find_last_of('/')+1;
rootPathName = longName.substr(0, pos);
}
/***********************************************************************/
+inline string getExtension(string longName){
+
+ string extension = longName;
+
+ if(longName.find_last_of('.') != longName.npos){
+ int pos = longName.find_last_of('.');
+ extension = longName.substr(pos, longName.length());
+ }
+
+ return extension;
+}
+
+/***********************************************************************/
+
inline int openInputFile(string fileName, ifstream& fileHandle){
fileHandle.open(fileName.c_str());
--- /dev/null
+/*
+ * screenseqscommand.cpp
+ * Mothur
+ *
+ * Created by Pat Schloss on 6/3/09.
+ * Copyright 2009 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "screenseqscommand.h"
+
+//***************************************************************************************************************
+
+ScreenSeqsCommand::ScreenSeqsCommand(){
+ try {
+ globaldata = GlobalData::getInstance();
+
+ if(globaldata->getFastaFile() != "") { readSeqs = new ReadFasta(globaldata->inputFileName); }
+ else if(globaldata->getNexusFile() != "") { readSeqs = new ReadNexus(globaldata->inputFileName); }
+ else if(globaldata->getClustalFile() != "") { readSeqs = new ReadClustal(globaldata->inputFileName); }
+ else if(globaldata->getPhylipFile() != "") { readSeqs = new ReadPhylip(globaldata->inputFileName); }
+
+ readSeqs->read();
+ db = readSeqs->getDB();
+ numSeqs = db->size();
+ }
+ catch(exception& e) {
+ cout << "Standard Error: " << e.what() << " has occurred in the SeqCoordCommand class Function SeqCoordCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+ exit(1);
+ }
+ catch(...) {
+ cout << "An unknown error has occurred in the SeqCoordCommand class function SeqCoordCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+ exit(1);
+ }
+}
+
+//***************************************************************************************************************
+
+ScreenSeqsCommand::~ScreenSeqsCommand(){
+ delete readSeqs;
+}
+
+//***************************************************************************************************************
+
+int ScreenSeqsCommand::execute(){
+ try{
+ int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength;
+ convert(globaldata->getStartPos(), startPos);
+ convert(globaldata->getEndPos(), endPos);
+ convert(globaldata->getMaxAmbig(), maxAmbig);
+ convert(globaldata->getMaxHomoPolymer(), maxHomoP);
+ convert(globaldata->getMinLength(), minLength);
+ convert(globaldata->getMaxLength(), maxLength);
+
+ set<string> badSeqNames;
+
+ string goodSeqFile = getRootName(globaldata->inputFileName) + "good" + getExtension(globaldata->inputFileName);
+ string badSeqFile = getRootName(globaldata->inputFileName) + "bad" + getExtension(globaldata->inputFileName);
+
+ ofstream goodSeqOut; openOutputFile(goodSeqFile, goodSeqOut);
+ ofstream badSeqOut; openOutputFile(badSeqFile, badSeqOut);
+
+ for(int i=0;i<numSeqs;i++){
+ Sequence currSeq = db->get(i);
+ bool goodSeq = 1; // innocent until proven guilty
+ if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; }
+ if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; }
+ if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; }
+ if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()){ goodSeq = 0; }
+ if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; }
+ if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; }
+
+ if(goodSeq == 1){
+ currSeq.printSequence(goodSeqOut);
+ }
+ else{
+ currSeq.printSequence(badSeqOut);
+ badSeqNames.insert(currSeq.getName());
+ }
+ }
+
+ if(globaldata->getNameFile() != ""){
+ screenNameGroupFile(badSeqNames);
+
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ cout << "Standard Error: " << e.what() << " has occurred in the FilterSeqsCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+ exit(1);
+ }
+ catch(...) {
+ cout << "An unknown error has occurred in the FilterSeqsCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+ exit(1);
+ }
+
+}
+
+//***************************************************************************************************************
+
+void ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
+
+ ifstream inputNames;
+ openInputFile(globaldata->getNameFile(), inputNames);
+ set<string> badSeqGroups;
+ string seqName, seqList, group;
+ set<string>::iterator it;
+
+ string goodNameFile = getRootName(globaldata->getNameFile()) + "good" + getExtension(globaldata->getNameFile());
+ string badNameFile = getRootName(globaldata->getNameFile()) + "bad" + getExtension(globaldata->getNameFile());
+
+ ofstream goodNameOut; openOutputFile(goodNameFile, goodNameOut);
+ ofstream badNameOut; openOutputFile(badNameFile, badNameOut);
+
+ while(!inputNames.eof()){
+ inputNames >> seqName >> seqList;
+ it = badSeqNames.find(seqName);
+
+ if(it != badSeqNames.end()){
+ badSeqNames.erase(it);
+ badNameOut << seqName << '\t' << seqList << endl;
+ if(globaldata->getNameFile() != ""){
+ int start = 0;
+ for(int i=0;i<seqList.length();i++){
+ if(seqList[i] == ','){
+ badSeqGroups.insert(seqList.substr(start,i-start));
+ start = i+1;
+ }
+ }
+ badSeqGroups.insert(seqList.substr(start,seqList.length()-start));
+ }
+ }
+ else{
+ goodNameOut << seqName << '\t' << seqList << endl;
+ }
+ gobble(inputNames);
+ }
+ inputNames.close();
+ goodNameOut.close();
+ badNameOut.close();
+
+ if(globaldata->getGroupFile() != ""){
+
+ ifstream inputGroups;
+ openInputFile(globaldata->getGroupFile(), inputGroups);
+
+ string goodGroupFile = getRootName(globaldata->getGroupFile()) + "good" + getExtension(globaldata->getGroupFile());
+ string badGroupFile = getRootName(globaldata->getGroupFile()) + "bad" + getExtension(globaldata->getGroupFile());
+
+ ofstream goodGroupOut; openOutputFile(goodGroupFile, goodGroupOut);
+ ofstream badGroupOut; openOutputFile(badGroupFile, badGroupOut);
+
+ while(!inputGroups.eof()){
+ inputGroups >> seqName >> group;
+
+ it = badSeqGroups.find(seqName);
+
+ if(it != badSeqGroups.end()){
+ badSeqGroups.erase(it);
+ badGroupOut << seqName << '\t' << group << endl;
+ }
+ else{
+ goodGroupOut << seqName << '\t' << group << endl;
+ }
+ gobble(inputGroups);
+ }
+ inputGroups.close();
+ goodGroupOut.close();
+ badGroupOut.close();
+ }
+}
+
+//***************************************************************************************************************
+
+
--- /dev/null
+#ifndef SCREENSEQSCOMMAND_H
+#define SCREENSEQSCOMMAND_H
+
+/*
+ * screenseqscommand.h
+ * Mothur
+ *
+ * Created by Pat Schloss on 6/3/09.
+ * Copyright 2009 Patrick D. Schloss. All rights reserved.
+ *
+ */
+#include "mothur.h"
+#include "command.hpp"
+#include "globaldata.hpp"
+#include "readfasta.h"
+#include "readnexus.h"
+#include "readclustal.h"
+#include "readseqsphylip.h"
+#include <set>
+
+using namespace std;
+
+class ScreenSeqsCommand : public Command {
+
+public:
+ ScreenSeqsCommand();
+ ~ScreenSeqsCommand();
+ int execute();
+private:
+ void screenNameGroupFile(set<string>);
+ int numSeqs;
+ GlobalData* globaldata;
+ ReadSeqs* readSeqs;
+ SequenceDB* db;
+};
+
+#endif
//***************************************************************************************************************
SeqSummaryCommand::~SeqSummaryCommand(){
+ delete readSeqs;
}
//***************************************************************************************************************
commands["filter.seqs"] = "filter.seqs";
commands["align.seqs"] = "align.seqs";
commands["summary.seqs"] = "summary.seqs";
+ commands["screen.seqs"] = "screen.seqs";
commands["quit"] = "quit";
string summaryseqsArray[] = {"fasta","phylip","clustal","nexus"};
commandParameters["summary.seqs"] = addParameters(summaryseqsArray, sizeof(summaryseqsArray)/sizeof(string));
+ string screenseqsArray[] = {"fasta","phylip","clustal","nexus", "start", "end", "maxambig", "maxhomop", "minlength", "maxlength", "name", "group"};
+ commandParameters["screen.seqs"] = addParameters(screenseqsArray, sizeof(screenseqsArray)/sizeof(string));
+
string vennArray[] = {"groups","line","label","calc"};
commandParameters["venn"] = addParameters(vennArray, sizeof(vennArray)/sizeof(string));