From 648ec37228eb16075ace911dd5a5773cdfe683da Mon Sep 17 00:00:00 2001 From: westcott Date: Wed, 2 Dec 2009 20:11:35 +0000 Subject: [PATCH] modified sequence class to read fasta files with comments. this required modifications to several other files that use sequence. also added flip and threshold parameter to align.seqs command --- aligncommand.cpp | 169 +++++++++++++++++++++++++++------- aligncommand.h | 8 +- alignmentdb.cpp | 13 ++- chimera.cpp | 6 +- chimeraseqscommand.cpp | 6 +- classifyseqscommand.cpp | 15 +-- commandfactory.cpp | 17 +++- commandfactory.hpp | 1 + engine.cpp | 73 +++++++++++---- engine.hpp | 7 +- fastamap.cpp | 58 ++++++------ filterseqscommand.cpp | 30 +++--- getseqscommand.cpp | 17 ++-- getsharedotucommand.cpp | 2 +- listseqscommand.cpp | 2 +- mothur.cpp | 11 ++- mothur.h | 20 +++- nast.cpp | 6 +- nast.hpp | 1 + overlap.cpp | 2 +- removeseqscommand.cpp | 15 +-- reversecommand.cpp | 8 +- screenseqscommand.cpp | 30 +++--- secondarystructurecommand.cpp | 12 ++- seqsummarycommand.cpp | 26 +++--- sequence.cpp | 81 +++++++++++++--- sequence.hpp | 2 + sequencedb.cpp | 3 +- trimseqscommand.cpp | 102 ++++++++++---------- 29 files changed, 500 insertions(+), 243 deletions(-) diff --git a/aligncommand.cpp b/aligncommand.cpp index 57eebb0..55f0b82 100644 --- a/aligncommand.cpp +++ b/aligncommand.cpp @@ -39,7 +39,7 @@ AlignCommand::AlignCommand(string option){ else { //valid paramters for this command - string AlignArray[] = {"template","candidate","search","ksize","align","match","mismatch","gapopen","gapextend", "processors"}; + string AlignArray[] = {"template","candidate","search","ksize","align","match","mismatch","gapopen","gapextend", "processors","flip","threshold"}; vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); OptionParser parser(option); @@ -90,6 +90,12 @@ AlignCommand::AlignCommand(string option){ temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; } convert(temp, processors); + temp = validParameter.validFile(parameters, "flip", false); if (temp == "not found"){ temp = "f"; } + flip = isTrue(temp); + + temp = validParameter.validFile(parameters, "threshold", false); if (temp == "not found"){ temp = "0.80"; } + convert(temp, threshold); + search = validParameter.validFile(parameters, "search", false); if (search == "not found"){ search = "kmer"; } align = validParameter.validFile(parameters, "align", false); if (align == "not found"){ align = "needleman"; } @@ -127,6 +133,10 @@ void AlignCommand::help(){ mothurOut("The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.\n"); mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -1.0.\n"); mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -2.0.\n"); + mothurOut("The flip parameter is used to specify whether or not you want mothur to try the reverse compement if a sequence falls below the threshold. The default is false.\n"); + mothurOut("The threshold is used to specify a cutoff at which an alignment is deemed 'bad' and the reverse complement may be tried. \n"); + mothurOut("If the flip parameter is set to true the reverse complement of the sequence is aligned and the better alignment is reported.\n"); + mothurOut("The default for the threshold parameter is 0.80, meaning at least 80% of the bases must remain or the sequence is reported as potentially reversed.\n"); mothurOut("The align.seqs command should be in the following format: \n"); mothurOut("align.seqs(template=yourTemplateFile, candidate=yourCandidateFile, align=yourAlignmentMethod, search=yourSearchmethod, ksize=yourKmerSize, match=yourMatchBonus, mismatch=yourMismatchpenalty, gapopen=yourGapopenPenalty, gapextend=yourGapExtendPenalty) \n"); mothurOut("Example align.seqs(candidate=candidate.fasta, template=core.filtered, align=kmer, search=gotoh, ksize=8, match=2.0, mismatch=3.0, gapopen=-2.0, gapextend=-1.0)\n"); @@ -162,6 +172,7 @@ int AlignCommand::execute(){ string alignFileName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + "align"; string reportFileName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + "align.report"; + string accnosFileName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + "flip.accnos"; int numFastaSeqs = 0; int start = time(NULL); @@ -175,8 +186,17 @@ int AlignCommand::execute(){ lines.push_back(new linePair(0, numFastaSeqs)); - driver(lines[0], alignFileName, reportFileName); + driver(lines[0], alignFileName, reportFileName, accnosFileName); + //delete accnos file if its blank else report to user + if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); } + else { + mothurOut("Some of you sequences generated alignments that eliminated too many bases, a list is provided in " + accnosFileName + "."); + if (!flip) { + mothurOut(" If you set the flip parameter to true mothur will try aligning the reverse compliment as well."); + }else{ mothurOut(" If the reverse compliment proved to be better it was reported."); } + mothurOutEndLine(); + } } else{ vector positions; @@ -205,11 +225,35 @@ int AlignCommand::execute(){ } lines.push_back(new linePair(startPos, numSeqsPerProcessor)); } - createProcesses(alignFileName, reportFileName); + createProcesses(alignFileName, reportFileName, accnosFileName); rename((alignFileName + toString(processIDS[0]) + ".temp").c_str(), alignFileName.c_str()); rename((reportFileName + toString(processIDS[0]) + ".temp").c_str(), reportFileName.c_str()); + vector nonBlankAccnosFiles; + //delete blank accnos files generated with multiple processes + for(int i=0;inumSeqs;i++){ Sequence* candidateSeq = new Sequence(inFASTA); - - if (candidateSeq->getUnaligned().length() > alignment->getnRows()) { - alignment->resize(candidateSeq->getUnaligned().length()+1); - } - - report.setCandidate(candidateSeq); - - Sequence temp = templateDB->findClosestSequence(candidateSeq); - - Sequence* templateSeq = &temp; - - report.setTemplate(templateSeq); - report.setSearchParameters(search, templateDB->getSearchScore()); + int origNumBases = candidateSeq->getNumBases(); + string originalUnaligned = candidateSeq->getUnaligned(); + int numBasesNeeded = origNumBases * threshold; + + if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file + if (candidateSeq->getUnaligned().length() > alignment->getnRows()) { + alignment->resize(candidateSeq->getUnaligned().length()+1); + } + + Sequence temp = templateDB->findClosestSequence(candidateSeq); + Sequence* templateSeq = &temp; - Nast nast(alignment, candidateSeq, templateSeq); - - report.setAlignmentParameters(align, alignment); - - report.setNastParameters(nast); - - alignmentFile << '>' << candidateSeq->getName() << '\n' << candidateSeq->getAligned() << endl; + float searchScore = templateDB->getSearchScore(); + + Nast* nast = new Nast(alignment, candidateSeq, templateSeq); + Sequence* copy; - report.print(); + Nast* nast2; + bool needToDeleteCopy = false; //this is needed in case you have you enter the ifs below + //since nast does not make a copy of hte sequence passed, and it is used by the reporter below + //you can't delete the copy sequence til after you report, but you may choose not to create it in the first place + //so this bool tells you if you need to delete it + + //if there is a possibility that this sequence should be reversed + if (candidateSeq->getNumBases() < numBasesNeeded) { + + string wasBetter = ""; + //if the user wants you to try the reverse + if (flip) { + //get reverse compliment + copy = new Sequence(candidateSeq->getName(), originalUnaligned); + copy->reverseComplement(); + + //rerun alignment + Sequence temp2 = templateDB->findClosestSequence(copy); + Sequence* templateSeq2 = &temp2; + + searchScore = templateDB->getSearchScore(); + + nast2 = new Nast(alignment, copy, templateSeq2); + //check if any better + if (copy->getNumBases() > candidateSeq->getNumBases()) { + candidateSeq->setAligned(copy->getAligned()); //use reverse compliments alignment since its better + templateSeq = templateSeq2; + delete nast; + nast = nast2; + needToDeleteCopy = true; + }else{ + wasBetter = "\treverse complement did NOT produce a better alignment, please check sequence."; + delete nast2; + delete copy; + } + } + + //create accnos file with names + accnosFile << candidateSeq->getName() << wasBetter << endl; + } + + report.setCandidate(candidateSeq); + report.setTemplate(templateSeq); + report.setSearchParameters(search, searchScore); + report.setAlignmentParameters(align, alignment); + report.setNastParameters(*nast); + + alignmentFile << '>' << candidateSeq->getName() << '\n' << candidateSeq->getAligned() << endl; + + report.print(); + delete nast; + if (needToDeleteCopy) { delete copy; } + } delete candidateSeq; - } alignmentFile.close(); inFASTA.close(); + accnosFile.close(); return 1; } @@ -299,7 +407,7 @@ int AlignCommand::driver(linePair* line, string alignFName, string reportFName){ /**************************************************************************************************/ -void AlignCommand::createProcesses(string alignFileName, string reportFileName) { +void AlignCommand::createProcesses(string alignFileName, string reportFileName, string accnosFName) { try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) int process = 0; @@ -313,7 +421,7 @@ void AlignCommand::createProcesses(string alignFileName, string reportFileName) processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - driver(lines[process], alignFileName + toString(getpid()) + ".temp", reportFileName + toString(getpid()) + ".temp"); + driver(lines[process], alignFileName + toString(getpid()) + ".temp", reportFileName + toString(getpid()) + ".temp", accnosFName + toString(getpid()) + ".temp"); exit(0); }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } } @@ -354,8 +462,7 @@ void AlignCommand::appendAlignFiles(string temp, string filename) { exit(1); } } - -/**************************************************************************************************/ +//********************************************************************************************************************** void AlignCommand::appendReportFiles(string temp, string filename) { try{ diff --git a/aligncommand.h b/aligncommand.h index c3c2441..f7d59b0 100644 --- a/aligncommand.h +++ b/aligncommand.h @@ -36,16 +36,16 @@ private: AlignmentDB* templateDB; Alignment* alignment; - int driver(linePair*, string, string); - void createProcesses(string, string); + int driver(linePair*, string, string, string); + void createProcesses(string, string, string); void appendAlignFiles(string, string); void appendReportFiles(string, string); string candidateFileName, templateFileName, distanceFileName, search, align; - float match, misMatch, gapOpen, gapExtend; + float match, misMatch, gapOpen, gapExtend, threshold; int processors, kmerSize; - bool abort; + bool abort, flip; }; #endif diff --git a/alignmentdb.cpp b/alignmentdb.cpp index 6b05989..e450952 100644 --- a/alignmentdb.cpp +++ b/alignmentdb.cpp @@ -25,14 +25,13 @@ AlignmentDB::AlignmentDB(string fastaFileName, string method, int kmerSize, floa mothurOut("Reading in the " + fastaFileName + " template sequences...\t"); cout.flush(); while (!fastaFile.eof()) { - Sequence temp(fastaFile); + Sequence temp(fastaFile); gobble(fastaFile); - templateSequences.push_back(temp); - - //save longest base - if (temp.getUnaligned().length() > longest) { longest = temp.getUnaligned().length(); } - - gobble(fastaFile); + if (temp.getName() != "") { + templateSequences.push_back(temp); + //save longest base + if (temp.getUnaligned().length() > longest) { longest = temp.getUnaligned().length(); } + } } numSeqs = templateSequences.size(); diff --git a/chimera.cpp b/chimera.cpp index 8851b75..db153dd 100644 --- a/chimera.cpp +++ b/chimera.cpp @@ -94,9 +94,9 @@ vector Chimera::readSeqs(string file) { //read in seqs and store in vector while(!in.eof()){ - Sequence* current = new Sequence(in); - container.push_back(current); - gobble(in); + Sequence* current = new Sequence(in); gobble(in); + + if (current->getName() != "") { container.push_back(current); } } in.close(); diff --git a/chimeraseqscommand.cpp b/chimeraseqscommand.cpp index 51fc0e7..bf0ffa6 100644 --- a/chimeraseqscommand.cpp +++ b/chimeraseqscommand.cpp @@ -156,7 +156,7 @@ void ChimeraSeqsCommand::help(){ mothurOut("The ksize parameter allows you to input kmersize. \n"); mothurOut("The svg parameter allows you to specify whether or not you would like a svg file outputted for each query sequence.\n"); mothurOut("The name parameter allows you to enter a file containing names of sequences you would like .svg files for.\n"); - mothurOut("The iters parameter allows you to specify the number of bootstrap iters to do with the chimeraslayer method.\n"); + //mothurOut("The iters parameter allows you to specify the number of bootstrap iters to do with the chimeraslayer method.\n"); mothurOut("NOT ALL PARAMETERS ARE USED BY ALL METHODS. Please look below for method specifics.\n\n"); mothurOut("Details for each method: \n"); mothurOut("\tpintail: \n"); @@ -168,8 +168,8 @@ void ChimeraSeqsCommand::help(){ mothurOut("\t\tparameters: fasta=required, template=required, filter=F, mask=no mask, processors=1, window=10% of length, numwanted=20\n"); mothurOut("\tchimeracheck: \n"); mothurOut("\t\tparameters: fasta=required, template=required, processors=1, increment=10, ksize=7, svg=F, name=none\n\n"); - mothurOut("\tchimeraslayer: \n"); - mothurOut("\t\tparameters: fasta=required, template=required, processors=1, increment=10, mask=no mask, numwanted=10, match=5, mismatch=-4, divergence=1.0, minsim=90, parents=5, iters=1000, window=100. \n\n"); + //mothurOut("\tchimeraslayer: \n"); + //mothurOut("\t\tparameters: fasta=required, template=required, processors=1, increment=10, mask=no mask, numwanted=10, match=5, mismatch=-4, divergence=1.0, minsim=90, parents=5, iters=1000, window=100. \n\n"); mothurOut("The chimera.seqs command should be in the following format: \n"); mothurOut("chimera.seqs(fasta=yourFastaFile, filter=yourFilter, correction=yourCorrection, processors=yourProcessors, method=bellerophon) \n"); mothurOut("Example: chimera.seqs(fasta=AD.align, filter=True, correction=true, method=bellerophon, window=200) \n"); diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index d2f8fc7..2b76e50 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -349,14 +349,15 @@ int ClassifySeqsCommand::driver(linePair* line, string taxFName, string tempTFNa for(int i=0;inumSeqs;i++){ Sequence* candidateSeq = new Sequence(inFASTA); - - taxonomy = classify->getTaxonomy(candidateSeq); - if (taxonomy != "bad seq") { - outTax << candidateSeq->getName() << '\t' << taxonomy << endl; - outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl; - } - + if (candidateSeq->getName() != "") { + taxonomy = classify->getTaxonomy(candidateSeq); + + if (taxonomy != "bad seq") { + outTax << candidateSeq->getName() << '\t' << taxonomy << endl; + outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl; + } + } delete candidateSeq; if((i+1) % 100 == 0){ diff --git a/commandfactory.cpp b/commandfactory.cpp index 635cff0..a385863 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -181,7 +181,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "merge.files") { command = new MergeFileCommand(optionString); } else if(commandName == "system") { command = new SystemCommand(optionString); } else if(commandName == "align.check") { command = new AlignCheckCommand(optionString); } - else if(commandName == "get.sharedseqs") { command = new GetSharedOTUCommand(optionString); } + else if(commandName == "get.sharedseqs") { command = new GetSharedOTUCommand(optionString); } else if(commandName == "get.otulist") { command = new GetListCountCommand(optionString); } else if(commandName == "hcluster") { command = new HClusterCommand(optionString); } else if(commandName == "classify.seqs") { command = new ClassifySeqsCommand(optionString); } @@ -195,7 +195,22 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ exit(1); } } +/***********************************************************/ +//This function is used to interrupt a command +Command* CommandFactory::getCommand(){ + try { + delete command; //delete the old command + string s = ""; + command = new NoCommand(s); + + return command; + } + catch(exception& e) { + errorOut(e, "CommandFactory", "getCommand"); + exit(1); + } +} /***********************************************************************/ bool CommandFactory::isValidCommand(string command) { try { diff --git a/commandfactory.hpp b/commandfactory.hpp index 0cede38..55f70ea 100644 --- a/commandfactory.hpp +++ b/commandfactory.hpp @@ -19,6 +19,7 @@ public: CommandFactory(); ~CommandFactory(); Command* getCommand(string, string); + Command* getCommand(); bool isValidCommand(string); void printCommands(ostream&); diff --git a/engine.cpp b/engine.cpp index 0a0f5a0..371bda8 100644 --- a/engine.cpp +++ b/engine.cpp @@ -20,14 +20,11 @@ InteractEngine::InteractEngine(string path){ globaldata = GlobalData::getInstance(); globaldata->argv = path; - - } /***********************************************************************/ -InteractEngine::~InteractEngine(){ - } +InteractEngine::~InteractEngine(){} /***********************************************************************/ //This function allows the user to input commands one line at a time until they quit. @@ -44,9 +41,47 @@ bool InteractEngine::getInput(){ mothurOutEndLine(); mothurOut("mothur > "); + + //get input char by char so you can check for use of arrow keys + //if (_kbhit()){ + // _getch(); // edit : if you want to check the arrow-keys you must call getch twice because special-keys have two values + // return _getch(); + //} + //return 0; // if no key is pressed + //setbuf(stdin, NULL); //no buffering +/*if(ch==0) +{ +ch=getch(); +if(ch==72) cout<<"up"; +else if(ch==75) cout<<"left"; +else if(ch==77) cout<<"right"; +else if(ch==80) cout<<"down"; +cout<getCommand(commandName, options); quitCommandCalled = command->execute(); }else { @@ -76,7 +111,17 @@ bool InteractEngine::getInput(){ exit(1); } } - +/***********************************************************************/ +void Engine::terminateCommand(int dummy) { + try { + mothurOut("Stopping command."); mothurOutEndLine(); + cFactory->getCommand(); //terminates command + } + catch(exception& e) { + errorOut(e, "InteractEngine", "terminateCommand"); + exit(1); + } +} /***********************************************************************/ //This function opens the batchfile to be used by BatchEngine::getInput. BatchEngine::BatchEngine(string path, string batchFileName){ @@ -95,8 +140,7 @@ BatchEngine::BatchEngine(string path, string batchFileName){ /***********************************************************************/ -BatchEngine::~BatchEngine(){ - } +BatchEngine::~BatchEngine(){ } /***********************************************************************/ //This Function allows the user to run a batchfile containing several commands on Dotur @@ -138,8 +182,7 @@ bool BatchEngine::getInput(){ if (commandName != "") { //executes valid command - CommandFactory cFactory; - Command* command = cFactory.getCommand(commandName, options); + Command* command = cFactory->getCommand(commandName, options); quitCommandCalled = command->execute(); }else { mothurOut("Invalid."); @@ -172,8 +215,6 @@ ScriptEngine::ScriptEngine(string path, string commandString){ globaldata->argv = path; - - } catch(exception& e) { errorOut(e, "ScriptEngine", "ScriptEngine"); @@ -183,8 +224,7 @@ ScriptEngine::ScriptEngine(string path, string commandString){ /***********************************************************************/ -ScriptEngine::~ScriptEngine(){ - } +ScriptEngine::~ScriptEngine(){ } /***********************************************************************/ //This Function allows the user to run a batchfile containing several commands on mothur @@ -221,8 +261,7 @@ bool ScriptEngine::getInput(){ if (commandName != "") { //executes valid command - CommandFactory cFactory; - Command* command = cFactory.getCommand(commandName, options); + Command* command = cFactory->getCommand(commandName, options); quitCommandCalled = command->execute(); }else { mothurOut("Invalid."); diff --git a/engine.hpp b/engine.hpp index 603ff0a..d4ec3b6 100644 --- a/engine.hpp +++ b/engine.hpp @@ -22,13 +22,16 @@ class GlobalData; class Engine { public: - virtual ~Engine(){}; + Engine() { cFactory = new CommandFactory(); } + virtual ~Engine(){ delete cFactory; } virtual bool getInput() = 0; // string getCommand() { return command; } vector getOptions() { return options; } + virtual void terminateCommand(int); protected: // string command; vector options; + CommandFactory* cFactory; }; @@ -54,6 +57,8 @@ public: virtual bool getInput(); private: GlobalData* globaldata; + vector previousInputs; //this is used to make the arrow keys work + }; diff --git a/fastamap.cpp b/fastamap.cpp index 5775466..f5ac122 100644 --- a/fastamap.cpp +++ b/fastamap.cpp @@ -24,20 +24,21 @@ void FastaMap::readFastaFile(string inFileName) { Sequence currSeq(in); name = currSeq.getName(); - if(currSeq.getIsAligned()) { sequence = currSeq.getAligned(); } - else { sequence = currSeq.getUnaligned(); } - - seqmap[name] = sequence; - map::iterator it = data.find(sequence); - if (it == data.end()) { //it's unique. - data[sequence].groupname = name; //group name will be the name of the first duplicate sequence found. -// data[sequence].groupnumber = 1; - data[sequence].names = name; - }else { // its a duplicate. - data[sequence].names += "," + name; -// data[sequence].groupnumber++; - } - + if (name != "") { + if(currSeq.getIsAligned()) { sequence = currSeq.getAligned(); } + else { sequence = currSeq.getUnaligned(); } + + seqmap[name] = sequence; + map::iterator it = data.find(sequence); + if (it == data.end()) { //it's unique. + data[sequence].groupname = name; //group name will be the name of the first duplicate sequence found. + // data[sequence].groupnumber = 1; + data[sequence].names = name; + }else { // its a duplicate. + data[sequence].names += "," + name; + // data[sequence].groupnumber++; + } + } gobble(in); } in.close(); @@ -71,20 +72,21 @@ void FastaMap::readFastaFile(string inFastaFile, string oldNameFileName){ //prin Sequence currSeq(inFASTA); name = currSeq.getName(); - if(currSeq.getIsAligned()) { sequence = currSeq.getAligned(); } - else { sequence = currSeq.getUnaligned(); } - - seqmap[name] = sequence; - map::iterator it = data.find(sequence); - if (it == data.end()) { //it's unique. - data[sequence].groupname = name; //group name will be the name of the first duplicate sequence found. -// data[sequence].groupnumber = 1; - data[sequence].names = oldNameMap[name]; - }else { // its a duplicate. - data[sequence].names += "," + oldNameMap[name]; -// data[sequence].groupnumber++; - } - + if (name != "") { + if(currSeq.getIsAligned()) { sequence = currSeq.getAligned(); } + else { sequence = currSeq.getUnaligned(); } + + seqmap[name] = sequence; + map::iterator it = data.find(sequence); + if (it == data.end()) { //it's unique. + data[sequence].groupname = name; //group name will be the name of the first duplicate sequence found. + // data[sequence].groupnumber = 1; + data[sequence].names = oldNameMap[name]; + }else { // its a duplicate. + data[sequence].names += "," + oldNameMap[name]; + // data[sequence].groupnumber++; + } + } gobble(inFASTA); } diff --git a/filterseqscommand.cpp b/filterseqscommand.cpp index 68450fb..a689704 100644 --- a/filterseqscommand.cpp +++ b/filterseqscommand.cpp @@ -121,10 +121,12 @@ int FilterSeqsCommand::execute() { if(trump != '*' || isTrue(vertical) || soft != 0){ while(!inFASTA.eof()){ //read through and create the filter... Sequence seq(inFASTA); - if(trump != '*'){ F.doTrump(seq); } - if(isTrue(vertical) || soft != 0){ F.getFreqs(seq); } - numSeqs++; - cout.flush(); + if (seq.getName() != "") { + if(trump != '*'){ F.doTrump(seq); } + if(isTrue(vertical) || soft != 0){ F.getFreqs(seq); } + numSeqs++; + cout.flush(); + } } } @@ -152,17 +154,19 @@ int FilterSeqsCommand::execute() { numSeqs = 0; while(!inFasta2.eof()){ Sequence seq(inFasta2); - string align = seq.getAligned(); - string filterSeq = ""; - - for(int j=0;j' << seq.getName() << endl << filterSeq << endl; + numSeqs++; } - - outFASTA << '>' << seq.getName() << endl << filterSeq << endl; - numSeqs++; gobble(inFasta2); } outFASTA.close(); diff --git a/getseqscommand.cpp b/getseqscommand.cpp index 04b2545..315417a 100644 --- a/getseqscommand.cpp +++ b/getseqscommand.cpp @@ -125,15 +125,16 @@ void GetSeqsCommand::readFasta(){ Sequence currSeq(in); name = currSeq.getName(); - //if this name is in the accnos file - if (names.count(name) == 1) { - wroteSomething = true; - - currSeq.printSequence(out); - - names.erase(name); + if (name != "") { + //if this name is in the accnos file + if (names.count(name) == 1) { + wroteSomething = true; + + currSeq.printSequence(out); + + names.erase(name); + } } - gobble(in); } in.close(); diff --git a/getsharedotucommand.cpp b/getsharedotucommand.cpp index 76e5725..742377f 100644 --- a/getsharedotucommand.cpp +++ b/getsharedotucommand.cpp @@ -133,7 +133,7 @@ int GetSharedOTUCommand::execute(){ while(!inFasta.eof()) { Sequence seq(inFasta); - seqs.push_back(seq); + if (seq.getName() != "") { seqs.push_back(seq); } } inFasta.close(); } diff --git a/listseqscommand.cpp b/listseqscommand.cpp index efc88e9..3087b48 100644 --- a/listseqscommand.cpp +++ b/listseqscommand.cpp @@ -125,7 +125,7 @@ void ListSeqsCommand::readFasta(){ Sequence currSeq(in); name = currSeq.getName(); - names.push_back(name); + if (name != "") { names.push_back(name); } gobble(in); } diff --git a/mothur.cpp b/mothur.cpp index 1c0c98f..4e064b1 100644 --- a/mothur.cpp +++ b/mothur.cpp @@ -77,7 +77,7 @@ int main(int argc, char *argv[]){ //srand(54321); srand( (unsigned)time( NULL ) ); - + Engine* mothur; bool bail = 0; string input; @@ -92,9 +92,16 @@ int main(int argc, char *argv[]){ } } else{ - mothur = new InteractEngine(argv[0]); + mothur = new InteractEngine(argv[0]); } + + //used to intercept the terminate signal, so instead of terminating mothur it will end a command + //void (*prev_fn)(int); + //prev_fn = signal(SIGTERM, mothur->terminateCommand(0)); + + //if (prev_fn==SIG_IGN) signal (SIGTERM,SIG_IGN); + while(bail == 0) { bail = mothur->getInput(); } delete mothur; diff --git a/mothur.h b/mothur.h index ba88f71..5c1880d 100644 --- a/mothur.h +++ b/mothur.h @@ -21,6 +21,7 @@ #include #include #include +#include //exception #include @@ -283,9 +284,6 @@ inline void errorOut(exception& e, string object, string function) { } - - - /***********************************************************************/ inline bool isTrue(string f){ @@ -425,7 +423,21 @@ inline string getExtension(string longName){ return extension; } - +/***********************************************************************/ +inline bool isBlank(string fileName){ + + ifstream fileHandle; + fileHandle.open(fileName.c_str()); + if(!fileHandle) { + mothurOut("Error: Could not open " + fileName); mothurOutEndLine(); + return false; + }else { + //check for blank file + gobble(fileHandle); + if (fileHandle.eof()) { fileHandle.close(); return true; } + } + return false; +} /***********************************************************************/ inline int openInputFile(string fileName, ifstream& fileHandle){ diff --git a/nast.cpp b/nast.cpp index 4848eb2..6a4ae3d 100644 --- a/nast.cpp +++ b/nast.cpp @@ -32,7 +32,6 @@ Nast::Nast(Alignment* method, Sequence* cand, Sequence* temp) : alignment(method errorOut(e, "Nast", "Nast"); exit(1); } - } /**************************************************************************************************/ @@ -42,10 +41,10 @@ void Nast::pairwiseAlignSeqs(){ // Here we call one of the pairwise alignment me try { alignment->align(candidateSeq->getUnaligned(), templateSeq->getUnaligned()); - + string candAln = alignment->getSeqAAln(); string tempAln = alignment->getSeqBAln(); - + if(candAln == ""){ candidateSeq->setPairwise(""); @@ -78,7 +77,6 @@ void Nast::pairwiseAlignSeqs(){ // Here we call one of the pairwise alignment me candidateSeq->setPairwise(candAln); // set the pairwise sequences in the Sequence objects for templateSeq->setPairwise(tempAln); // the candidate and template sequences - } catch(exception& e) { errorOut(e, "Nast", "pairwiseAlignSeqs"); diff --git a/nast.hpp b/nast.hpp index 4cf1b85..bc2467f 100644 --- a/nast.hpp +++ b/nast.hpp @@ -29,6 +29,7 @@ class Nast { public: Nast(Alignment*, Sequence*, Sequence*); + ~Nast(){}; float getSimilarityScore(); int getMaxInsertLength(); diff --git a/overlap.cpp b/overlap.cpp index e30b51d..40b49f8 100644 --- a/overlap.cpp +++ b/overlap.cpp @@ -60,7 +60,7 @@ void Overlap::setOverlap(vector >& alignment, const int nA int rowIndex = maxRow(alignment, band); // get the index for the row with the highest right hand side score int colIndex = maxColumn(alignment, band); // get the index for the column with the highest bottom row score - + int row = lB-1; int column = lA-1; diff --git a/removeseqscommand.cpp b/removeseqscommand.cpp index 1b9edb0..28f8cf7 100644 --- a/removeseqscommand.cpp +++ b/removeseqscommand.cpp @@ -125,13 +125,14 @@ void RemoveSeqsCommand::readFasta(){ Sequence currSeq(in); name = currSeq.getName(); - //if this name is in the accnos file - if (names.count(name) == 0) { - wroteSomething = true; - - currSeq.printSequence(out); - }else { names.erase(name); } - + if (name != "") { + //if this name is in the accnos file + if (names.count(name) == 0) { + wroteSomething = true; + + currSeq.printSequence(out); + }else { names.erase(name); } + } gobble(in); } in.close(); diff --git a/reversecommand.cpp b/reversecommand.cpp index 7a87a99..37e9568 100644 --- a/reversecommand.cpp +++ b/reversecommand.cpp @@ -82,9 +82,11 @@ int ReverseSeqsCommand::execute(){ openOutputFile(reverseFile, outFASTA); while(!inFASTA.eof()){ - Sequence currSeq(inFASTA); - currSeq.reverseComplement(); - currSeq.printSequence(outFASTA); + Sequence currSeq(inFASTA); gobble(inFASTA); + if (currSeq.getName() != "") { + currSeq.reverseComplement(); + currSeq.printSequence(outFASTA); + } } inFASTA.close(); outFASTA.close(); diff --git a/screenseqscommand.cpp b/screenseqscommand.cpp index d5e917c..9225e99 100644 --- a/screenseqscommand.cpp +++ b/screenseqscommand.cpp @@ -131,20 +131,22 @@ int ScreenSeqsCommand::execute(){ while(!inFASTA.eof()){ Sequence currSeq(inFASTA); - bool goodSeq = 1; // innocent until proven guilty - if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; } - if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; } - if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; } - if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; } - if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; } - if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; } - - if(goodSeq == 1){ - currSeq.printSequence(goodSeqOut); - } - else{ - currSeq.printSequence(badSeqOut); - badSeqNames.insert(currSeq.getName()); + if (currSeq.getName() != "") { + bool goodSeq = 1; // innocent until proven guilty + if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; } + if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; } + if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; } + if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; } + if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; } + if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; } + + if(goodSeq == 1){ + currSeq.printSequence(goodSeqOut); + } + else{ + currSeq.printSequence(badSeqOut); + badSeqNames.insert(currSeq.getName()); + } } gobble(inFASTA); } diff --git a/secondarystructurecommand.cpp b/secondarystructurecommand.cpp index 7156082..c2c104e 100644 --- a/secondarystructurecommand.cpp +++ b/secondarystructurecommand.cpp @@ -91,11 +91,13 @@ int AlignCheckCommand::execute(){ while(!in.eof()){ - Sequence seq(in); - statData data = getStats(seq.getAligned()); - - out << seq.getName() << '\t' << data.pound << '\t' << data.dash << '\t' << data.plus << '\t' << data.equal << '\t'; - out << data.loop << '\t' << data.tilde << '\t' << data.total << endl; + Sequence seq(in); gobble(in); + if (seq.getName() != "") { + statData data = getStats(seq.getAligned()); + + out << seq.getName() << '\t' << data.pound << '\t' << data.dash << '\t' << data.plus << '\t' << data.equal << '\t'; + out << data.loop << '\t' << data.tilde << '\t' << data.total << endl; + } } in.close(); diff --git a/seqsummarycommand.cpp b/seqsummarycommand.cpp index 4eafaf8..9ab27f9 100644 --- a/seqsummarycommand.cpp +++ b/seqsummarycommand.cpp @@ -91,18 +91,20 @@ int SeqSummaryCommand::execute(){ while(!inFASTA.eof()){ Sequence current(inFASTA); - startPosition.push_back(current.getStartPos()); - endPosition.push_back(current.getEndPos()); - seqLength.push_back(current.getNumBases()); - ambigBases.push_back(current.getAmbigBases()); - longHomoPolymer.push_back(current.getLongHomoPolymer()); - - outSummary << current.getName() << '\t'; - outSummary << current.getStartPos() << '\t' << current.getEndPos() << '\t'; - outSummary << current.getNumBases() << '\t' << current.getAmbigBases() << '\t'; - outSummary << current.getLongHomoPolymer() << endl; - - numSeqs++; + if (current.getName() != "") { + startPosition.push_back(current.getStartPos()); + endPosition.push_back(current.getEndPos()); + seqLength.push_back(current.getNumBases()); + ambigBases.push_back(current.getAmbigBases()); + longHomoPolymer.push_back(current.getLongHomoPolymer()); + + outSummary << current.getName() << '\t'; + outSummary << current.getStartPos() << '\t' << current.getEndPos() << '\t'; + outSummary << current.getNumBases() << '\t' << current.getAmbigBases() << '\t'; + outSummary << current.getLongHomoPolymer() << endl; + + numSeqs++; + } gobble(inFASTA); } inFASTA.close(); diff --git a/sequence.cpp b/sequence.cpp index b2c2b53..752e081 100644 --- a/sequence.cpp +++ b/sequence.cpp @@ -28,35 +28,85 @@ Sequence::Sequence(string newName, string sequence) { } //******************************************************************************************************************** - +//this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq Sequence::Sequence(ifstream& fastaFile){ initialize(); fastaFile >> name; name = name.substr(1); - - while (!fastaFile.eof()) { char c = fastaFile.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there - - char letter; string sequence; - while(fastaFile){ - letter= fastaFile.get(); - if(letter == '>'){ - fastaFile.putback(letter); + //read comments + while ((name[0] == '#') && fastaFile) { + while (!fastaFile.eof()) { char c = fastaFile.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there + sequence = getCommentString(fastaFile); + + if (fastaFile) { + fastaFile >> name; + name = name.substr(1); + }else { + name = ""; break; } - else if(isprint(letter)){ - letter = toupper(letter); - if(letter == 'U'){letter = 'T';} - sequence += letter; - } } - + + //read real sequence + while (!fastaFile.eof()) { char c = fastaFile.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there + + sequence = getSequenceString(fastaFile); + setAligned(sequence); //setUnaligned removes any gap characters for us setUnaligned(sequence); } +//******************************************************************************************************************** +string Sequence::getSequenceString(ifstream& fastaFile) { + try { + char letter; + string sequence = ""; + + while(fastaFile){ + letter= fastaFile.get(); + if(letter == '>'){ + fastaFile.putback(letter); + break; + } + else if(isprint(letter)){ + letter = toupper(letter); + if(letter == 'U'){letter = 'T';} + sequence += letter; + } + } + + return sequence; + } + catch(exception& e) { + errorOut(e, "Sequence", "getSequenceString"); + exit(1); + } +} +//******************************************************************************************************************** +//comment can contain '>' so we need to account for that +string Sequence::getCommentString(ifstream& fastaFile) { + try { + char letter; + string sequence = ""; + + while(fastaFile){ + letter=fastaFile.get(); + if((letter == '\r') || (letter == '\n')){ + gobble(fastaFile); //in case its a \r\n situation + break; + } + } + + return sequence; + } + catch(exception& e) { + errorOut(e, "Sequence", "getCommentString"); + exit(1); + } +} //******************************************************************************************************************** @@ -109,6 +159,7 @@ void Sequence::setAligned(string sequence){ //if the alignment starts or ends with a gap, replace it with a period to indicate missing data aligned = sequence; alignmentLength = aligned.length(); + setUnaligned(sequence); if(aligned[0] == '-'){ for(int i=0;i 0 || maxLength > 0){ - success = cullLength(currSeq); - if ((currSeq.getUnaligned().length() > 300) && (success)) { cout << "too long " << currSeq.getUnaligned().length() << endl; } - if(!success){ trashCode += 'l'; } - } - if(maxHomoP > 0){ - success = cullHomoP(currSeq); - if(!success){ trashCode += 'h'; } - } - if(maxAmbig != -1){ - success = cullAmbigs(currSeq); - if(!success){ trashCode += 'n'; } - } - - if(flip){ currSeq.reverseComplement(); } // should go last - - if(trashCode.length() == 0){ - currSeq.setAligned(currSeq.getUnaligned()); //this is because of a modification we made to the sequence class to fix a bug. all seqs have an aligned version, which is the version that gets printed. - currSeq.printSequence(outFASTA); if(barcodes.size() != 0){ - outGroups << currSeq.getName() << '\t' << groupVector[group] << endl; - if(allFiles){ - currSeq.printSequence(*fastaFileNames[group]); + success = stripBarcode(currSeq, group); + if(!success){ trashCode += 'b'; } + } + if(numFPrimers != 0){ + success = stripForward(currSeq); + if(!success){ trashCode += 'f'; } + } + if(numRPrimers != 0){ + success = stripReverse(currSeq); + if(!success){ trashCode += 'r'; } + } + if(minLength > 0 || maxLength > 0){ + success = cullLength(currSeq); + if ((currSeq.getUnaligned().length() > 300) && (success)) { cout << "too long " << currSeq.getUnaligned().length() << endl; } + if(!success){ trashCode += 'l'; } + } + if(maxHomoP > 0){ + success = cullHomoP(currSeq); + if(!success){ trashCode += 'h'; } + } + if(maxAmbig != -1){ + success = cullAmbigs(currSeq); + if(!success){ trashCode += 'n'; } + } + + if(flip){ currSeq.reverseComplement(); } // should go last + + if(trashCode.length() == 0){ + currSeq.setAligned(currSeq.getUnaligned()); //this is because of a modification we made to the sequence class to fix a bug. all seqs have an aligned version, which is the version that gets printed. + currSeq.printSequence(outFASTA); + if(barcodes.size() != 0){ + outGroups << currSeq.getName() << '\t' << groupVector[group] << endl; + + if(allFiles){ + currSeq.printSequence(*fastaFileNames[group]); + } } } - } - else{ - currSeq.setName(currSeq.getName() + '|' + trashCode); - currSeq.setUnaligned(origSeq); - currSeq.printSequence(scrapFASTA); + else{ + currSeq.setName(currSeq.getName() + '|' + trashCode); + currSeq.setUnaligned(origSeq); + currSeq.printSequence(scrapFASTA); + } } gobble(inFASTA); } -- 2.39.2