From: westcott Date: Wed, 6 Jul 2011 14:43:57 +0000 (+0000) Subject: added save parameter to align.seqs, chimera commands, classify.seqs, and seq.error... X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=65b6a38d00b3a72021611211e7c25392022c69ed added save parameter to align.seqs, chimera commands, classify.seqs, and seq.error commands. added clear.memory command. added error message to chimera.slayer and chimera.uchime to avoid user entering both a namefile and reference file. --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index e920d62..c693c07 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -14,7 +14,9 @@ A713EBED12DC7C5E000092AC /* nmdscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A713EBEC12DC7C5E000092AC /* nmdscommand.cpp */; }; A71CB160130B04A2001E7287 /* anosimcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A71CB15E130B04A2001E7287 /* anosimcommand.cpp */; }; A71FE12C12EDF72400963CA7 /* mergegroupscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A71FE12B12EDF72400963CA7 /* mergegroupscommand.cpp */; }; + A721765713BB9F7D0014DAAE /* referencedb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721765613BB9F7D0014DAAE /* referencedb.cpp */; }; A727864412E9E28C00F86ABA /* removerarecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A727864312E9E28C00F86ABA /* removerarecommand.cpp */; }; + A73DDBBA13C4A0D1006AAE38 /* clearmemorycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73DDBB913C4A0D1006AAE38 /* clearmemorycommand.cpp */; }; A74D3687137DAB8300332B0C /* addtargets2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D3655137DAB8300332B0C /* addtargets2.cpp */; }; A74D3688137DAB8400332B0C /* alignchime.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D3656137DAB8300332B0C /* alignchime.cpp */; }; A74D3689137DAB8400332B0C /* alignchimel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D3657137DAB8300332B0C /* alignchimel.cpp */; }; @@ -346,8 +348,12 @@ A71CB15F130B04A2001E7287 /* anosimcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = anosimcommand.h; sourceTree = ""; }; A71FE12A12EDF72400963CA7 /* mergegroupscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mergegroupscommand.h; sourceTree = ""; }; A71FE12B12EDF72400963CA7 /* mergegroupscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mergegroupscommand.cpp; sourceTree = ""; }; + A721765513BB9F7D0014DAAE /* referencedb.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = referencedb.h; sourceTree = ""; }; + A721765613BB9F7D0014DAAE /* referencedb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = referencedb.cpp; sourceTree = ""; }; A727864212E9E28C00F86ABA /* removerarecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = removerarecommand.h; sourceTree = ""; }; A727864312E9E28C00F86ABA /* removerarecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = removerarecommand.cpp; sourceTree = ""; }; + A73DDBB813C4A0D1006AAE38 /* clearmemorycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = clearmemorycommand.h; sourceTree = ""; }; + A73DDBB913C4A0D1006AAE38 /* clearmemorycommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = clearmemorycommand.cpp; sourceTree = ""; }; A74D3655137DAB8300332B0C /* addtargets2.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = addtargets2.cpp; sourceTree = ""; }; A74D3656137DAB8300332B0C /* alignchime.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignchime.cpp; sourceTree = ""; }; A74D3657137DAB8300332B0C /* alignchimel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignchimel.cpp; sourceTree = ""; }; @@ -1206,6 +1212,8 @@ A7E9B69212D37EC400DA6239 /* classifyseqscommand.cpp */, A7E9B69712D37EC400DA6239 /* clearcutcommand.h */, A7E9B69612D37EC400DA6239 /* clearcutcommand.cpp */, + A73DDBB813C4A0D1006AAE38 /* clearmemorycommand.h */, + A73DDBB913C4A0D1006AAE38 /* clearmemorycommand.cpp */, A7E9B69D12D37EC400DA6239 /* clustercommand.h */, A7E9B69C12D37EC400DA6239 /* clustercommand.cpp */, A7E9B69F12D37EC400DA6239 /* clusterdoturcommand.h */, @@ -1602,6 +1610,8 @@ A7E9B7A012D37EC400DA6239 /* qualityscores.h */, A7E9B7A312D37EC400DA6239 /* rabundvector.cpp */, A7E9B7A412D37EC400DA6239 /* rabundvector.hpp */, + A721765513BB9F7D0014DAAE /* referencedb.h */, + A721765613BB9F7D0014DAAE /* referencedb.cpp */, A7E9B7CB12D37EC400DA6239 /* reportfile.cpp */, A7E9B7CC12D37EC400DA6239 /* reportfile.h */, A7E9B7CF12D37EC400DA6239 /* sabundvector.cpp */, @@ -2107,6 +2117,8 @@ A74D36B8137DAFAA00332B0C /* chimerauchimecommand.cpp in Sources */, A77A221F139001B600B0BE70 /* deuniquetreecommand.cpp in Sources */, A7730EFF13967241007433A3 /* countseqscommand.cpp in Sources */, + A721765713BB9F7D0014DAAE /* referencedb.cpp in Sources */, + A73DDBBA13C4A0D1006AAE38 /* clearmemorycommand.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -2153,8 +2165,8 @@ GCC_OPTIMIZATION_LEVEL = 0; GCC_PREPROCESSOR_DEFINITIONS = ( "MOTHUR_FILES=\"\\\"../release\\\"\"", - "VERSION=\"\\\"1.20.0\\\"\"", - "RELEASE_DATE=\"\\\"6/20/2011\\\"\"", + "VERSION=\"\\\"1.20.2\\\"\"", + "RELEASE_DATE=\"\\\"7/05/2011\\\"\"", ); GCC_WARN_ABOUT_MISSING_NEWLINE = YES; GCC_WARN_ABOUT_RETURN_TYPE = YES; diff --git a/aligncommand.cpp b/aligncommand.cpp index 5da0ee8..e3db3dd 100644 --- a/aligncommand.cpp +++ b/aligncommand.cpp @@ -24,6 +24,7 @@ #include "nast.hpp" #include "nastreport.hpp" +#include "referencedb.h" //********************************************************************************************************************** vector AlignCommand::setParameters(){ @@ -39,6 +40,7 @@ vector AlignCommand::setParameters(){ CommandParameter pgapextend("gapextend", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pgapextend); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip); + CommandParameter psave("save", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(psave); CommandParameter pthreshold("threshold", "Number", "", "0.50", "", "", "",false,false); parameters.push_back(pthreshold); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -69,6 +71,7 @@ string AlignCommand::getHelpString(){ helpString += "The flip parameter is used to specify whether or not you want mothur to try the reverse complement if a sequence falls below the threshold. The default is false."; helpString += "The threshold is used to specify a cutoff at which an alignment is deemed 'bad' and the reverse complement may be tried. The default threshold is 0.50, meaning 50% of the bases are removed in the alignment."; helpString += "If the flip parameter is set to true the reverse complement of the sequence is aligned and the better alignment is reported."; + helpString += "If the save parameter is set to true the reference sequences will be saved in memory, to clear them later you can use the clear.memory command. Default=f."; helpString += "The default for the threshold parameter is 0.50, meaning at least 50% of the bases must remain or the sequence is reported as potentially reversed."; helpString += "The align.seqs command should be in the following format:"; helpString += "align.seqs(reference=yourTemplateFile, fasta=yourCandidateFile, align=yourAlignmentMethod, search=yourSearchmethod, ksize=yourKmerSize, match=yourMatchBonus, mismatch=yourMismatchpenalty, gapopen=yourGapopenPenalty, gapextend=yourGapExtendPenalty)"; @@ -99,8 +102,9 @@ AlignCommand::AlignCommand(){ //********************************************************************************************************************** AlignCommand::AlignCommand(string option) { try { - abort = false; calledHelp = false; - + abort = false; calledHelp = false; + ReferenceDB* rdb = ReferenceDB::getInstance(); + //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true;} else if(option == "citation") { citation(); abort = true; calledHelp = true;} @@ -146,15 +150,6 @@ AlignCommand::AlignCommand(string option) { } } - //check for required parameters - templateFileName = validParameter.validFile(parameters, "reference", true); - - if (templateFileName == "not found") { - m->mothurOut("reference is a required parameter for the align.seqs command."); - m->mothurOutEndLine(); - abort = true; - }else if (templateFileName == "not open") { abort = true; } - candidateFileName = validParameter.validFile(parameters, "fasta", false); if (candidateFileName == "not found") { //if there is a current fasta file, use it @@ -259,6 +254,27 @@ AlignCommand::AlignCommand(string option) { temp = validParameter.validFile(parameters, "flip", false); if (temp == "not found"){ temp = "f"; } flip = m->isTrue(temp); + temp = validParameter.validFile(parameters, "save", false); if (temp == "not found"){ temp = "f"; } + save = m->isTrue(temp); + rdb->save = save; + if (save) { //clear out old references + rdb->clearMemory(); + } + + //this has to go after save so that if the user sets save=t and provides no reference we abort + templateFileName = validParameter.validFile(parameters, "reference", true); + if (templateFileName == "not found") { + //check for saved reference sequences + if (rdb->referenceSeqs.size() != 0) { + templateFileName = "saved"; + }else { + m->mothurOut("[ERROR]: You don't have any saved reference sequences and the reference parameter is a required for the align.seqs command."); + m->mothurOutEndLine(); + abort = true; + } + }else if (templateFileName == "not open") { abort = true; } + else { if (save) { rdb->setSavedReference(templateFileName); } } + temp = validParameter.validFile(parameters, "threshold", false); if (temp == "not found"){ temp = "0.50"; } convert(temp, threshold); diff --git a/aligncommand.h b/aligncommand.h index 952b936..75b9c2b 100644 --- a/aligncommand.h +++ b/aligncommand.h @@ -61,7 +61,7 @@ private: vector candidateFileNames; vector outputNames; - bool abort, flip, calledHelp; + bool abort, flip, calledHelp, save; }; diff --git a/alignmentdb.cpp b/alignmentdb.cpp index 6909f47..0fd2ac4 100644 --- a/alignmentdb.cpp +++ b/alignmentdb.cpp @@ -11,6 +11,7 @@ #include "kmerdb.hpp" #include "suffixdb.hpp" #include "blastdb.hpp" +#include "referencedb.h" /**************************************************************************************************/ @@ -20,91 +21,118 @@ AlignmentDB::AlignmentDB(string fastaFileName, string s, int kmerSize, float gap longest = 0; method = s; bool needToGenerate = true; + ReferenceDB* rdb = ReferenceDB::getInstance(); - m->mothurOutEndLine(); - m->mothurOut("Reading in the " + fastaFileName + " template sequences...\t"); cout.flush(); + if (fastaFileName == "saved") { + int start = time(NULL); + m->mothurOutEndLine(); m->mothurOut("Using sequences from " + rdb->getSavedReference() + " that are saved in memory."); m->mothurOutEndLine(); + + for (int i = 0; i < rdb->referenceSeqs.size(); i++) { + templateSequences.push_back(rdb->referenceSeqs[i]); + //save longest base + if (rdb->referenceSeqs[i].getUnaligned().length() >= longest) { longest = (rdb->referenceSeqs[i].getUnaligned().length()+1); } + } + fastaFileName = rdb->getSavedReference(); + + numSeqs = templateSequences.size(); + m->mothurOut("It took " + toString(time(NULL) - start) + " to load " + toString(rdb->referenceSeqs.size()) + " sequences.");m->mothurOutEndLine(); + + }else { + int start = time(NULL); + m->mothurOutEndLine(); + m->mothurOut("Reading in the " + fastaFileName + " template sequences...\t"); cout.flush(); + + #ifdef USE_MPI + int pid, processors; + vector positions; + + MPI_Status status; + MPI_File inMPI; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + MPI_Comm_size(MPI_COMM_WORLD, &processors); + int tag = 2001; - #ifdef USE_MPI - int pid, processors; - vector positions; + char inFileName[1024]; + strcpy(inFileName, fastaFileName.c_str()); - MPI_Status status; - MPI_File inMPI; - MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are - MPI_Comm_size(MPI_COMM_WORLD, &processors); - int tag = 2001; - - char inFileName[1024]; - strcpy(inFileName, fastaFileName.c_str()); - - MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + + if (pid == 0) { + positions = m->setFilePosFasta(fastaFileName, numSeqs); //fills MPIPos, returns numSeqs + + //send file positions to all processes + for(int i = 1; i < processors; i++) { + MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD); + MPI_Send(&positions[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD); + } + }else{ + MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); + positions.resize(numSeqs+1); + MPI_Recv(&positions[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status); + } + + //read file + for(int i=0;icontrol_pressed) { templateSequences.clear(); break; } + + //read next sequence + int length = positions[i+1] - positions[i]; + char* buf4 = new char[length]; + + MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status); - if (pid == 0) { - positions = m->setFilePosFasta(fastaFileName, numSeqs); //fills MPIPos, returns numSeqs + string tempBuf = buf4; + if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); } + delete buf4; - //send file positions to all processes - for(int i = 1; i < processors; i++) { - MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD); - MPI_Send(&positions[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD); + istringstream iss (tempBuf,istringstream::in); + + Sequence temp(iss); + if (temp.getName() != "") { + templateSequences.push_back(temp); + + if (rdb->save) { rdb->referenceSeqs.push_back(temp); } + + //save longest base + if (temp.getUnaligned().length() >= longest) { longest = temp.getUnaligned().length()+1; } + } } - }else{ - MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); - positions.resize(numSeqs+1); - MPI_Recv(&positions[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status); - } - - //read file - for(int i=0;icontrol_pressed) { templateSequences.clear(); break; } + MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case - //read next sequence - int length = positions[i+1] - positions[i]; - char* buf4 = new char[length]; + MPI_File_close(&inMPI); - MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status); - - string tempBuf = buf4; - if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); } - delete buf4; + #else + ifstream fastaFile; + m->openInputFile(fastaFileName, fastaFile); - istringstream iss (tempBuf,istringstream::in); - - Sequence temp(iss); + while (!fastaFile.eof()) { + Sequence temp(fastaFile); m->gobble(fastaFile); + + if (m->control_pressed) { templateSequences.clear(); break; } + if (temp.getName() != "") { templateSequences.push_back(temp); + + if (rdb->save) { rdb->referenceSeqs.push_back(temp); } + //save longest base - if (temp.getUnaligned().length() >= longest) { longest = temp.getUnaligned().length()+1; } + if (temp.getUnaligned().length() >= longest) { longest = (temp.getUnaligned().length()+1); } } } - - MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case - - MPI_File_close(&inMPI); + fastaFile.close(); + #endif - #else - ifstream fastaFile; - m->openInputFile(fastaFileName, fastaFile); - - while (!fastaFile.eof()) { - Sequence temp(fastaFile); m->gobble(fastaFile); - - if (m->control_pressed) { templateSequences.clear(); break; } + numSeqs = templateSequences.size(); + //all of this is elsewhere already! - if (temp.getName() != "") { - templateSequences.push_back(temp); - //save longest base - if (temp.getUnaligned().length() >= longest) { longest = (temp.getUnaligned().length()+1); } - } + m->mothurOut("DONE."); + m->mothurOutEndLine(); cout.flush(); + m->mothurOut("It took " + toString(time(NULL) - start) + " to read " + toString(rdb->referenceSeqs.size()) + " sequences."); m->mothurOutEndLine(); + } - fastaFile.close(); - #endif - - numSeqs = templateSequences.size(); - //all of this is elsewhere already! - m->mothurOut("DONE."); - m->mothurOutEndLine(); cout.flush(); //in case you delete the seqs and then ask for them emptySequence = Sequence(); @@ -156,6 +184,7 @@ AlignmentDB::AlignmentDB(string fastaFileName, string s, int kmerSize, float gap search->setNumSeqs(numSeqs); } + } catch(exception& e) { m->errorOut(e, "AlignmentDB", "AlignmentDB"); @@ -192,7 +221,7 @@ Sequence AlignmentDB::findClosestSequence(Sequence* seq) { try{ vector spot = search->findClosestSequences(seq, 1); - + if (spot.size() != 0) { return templateSequences[spot[0]]; } else { return emptySequence; } diff --git a/bayesian.cpp b/bayesian.cpp index cf55d5c..0b55efd 100644 --- a/bayesian.cpp +++ b/bayesian.cpp @@ -10,15 +10,22 @@ #include "bayesian.h" #include "kmer.hpp" #include "phylosummary.h" - +#include "referencedb.h" /**************************************************************************************************/ Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, int i) : Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { try { + ReferenceDB* rdb = ReferenceDB::getInstance(); + + string baseName = tempFile; + if (baseName == "saved") { baseName = rdb->getSavedReference(); } + + string baseTName = tfile; + if (baseTName == "saved") { baseTName = rdb->getSavedTaxonomy(); } /************calculate the probablity that each word will be in a specific taxonomy*************/ - string tfileroot = tfile.substr(0,tfile.find_last_of(".")+1); - string tempfileroot = m->getRootName(m->getSimpleName(tempFile)); + string tfileroot = baseTName.substr(0,baseTName.find_last_of(".")+1); + string tempfileroot = m->getRootName(m->getSimpleName(baseName)); string phyloTreeName = tfileroot + "tree.train"; string phyloTreeSumName = tfileroot + "tree.sum"; string probFileName = tfileroot + tempfileroot + char('0'+ kmerSize) + "mer.prob"; @@ -40,7 +47,23 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { FilesGood = checkReleaseDate(probFileTest, probFileTest2, phyloTreeTest, probFileTest3); } + //if you want to save, but you dont need to calculate then just read + if (rdb->save && probFileTest && probFileTest2 && phyloTreeTest && probFileTest3 && FilesGood) { + ifstream saveIn; + m->openInputFile(tempFile, saveIn); + + while (!saveIn.eof()) { + Sequence temp(saveIn); + m->gobble(saveIn); + + rdb->referenceSeqs.push_back(temp); + } + saveIn.close(); + } + if(probFileTest && probFileTest2 && phyloTreeTest && probFileTest3 && FilesGood){ + if (tempFile == "saved") { m->mothurOutEndLine(); m->mothurOut("Using sequences from " + rdb->getSavedReference() + " that are saved in memory."); m->mothurOutEndLine(); } + m->mothurOut("Reading template taxonomy... "); cout.flush(); phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName); @@ -49,10 +72,17 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { genusNodes = phyloTree->getGenusNodes(); genusTotals = phyloTree->getGenusTotals(); - - m->mothurOut("Reading template probabilities... "); cout.flush(); - readProbFile(probFileTest, probFileTest2, probFileName, probFileName2); + if (tfile == "saved") { + m->mothurOutEndLine(); m->mothurOut("Using probabilties from " + rdb->getSavedTaxonomy() + " that are saved in memory... "); cout.flush();; + wordGenusProb = rdb->wordGenusProb; + }else { + m->mothurOut("Reading template probabilities... "); cout.flush(); + readProbFile(probFileTest, probFileTest2, probFileName, probFileName2); + } + + //save probabilities + if (rdb->save) { rdb->wordGenusProb = wordGenusProb; } }else{ //create search database and names vector @@ -194,6 +224,9 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { delete phyloTree; phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName); + + //save probabilities + if (rdb->save) { rdb->wordGenusProb = wordGenusProb; } } } diff --git a/ccode.cpp b/ccode.cpp index ee88ba1..00cd3f1 100644 --- a/ccode.cpp +++ b/ccode.cpp @@ -11,12 +11,13 @@ #include "ignoregaps.h" #include "eachgapdist.h" - //*************************************************************************************************************** Ccode::Ccode(string filename, string temp, bool f, string mask, int win, int numW, string o) : Chimera() { + try { + fastafile = filename; outputDir = o; - templateFileName = temp; templateSeqs = readSeqs(temp); + templateFileName = temp; templateSeqs = readSeqs(temp); setMask(mask); filter = f; window = win; @@ -63,6 +64,11 @@ Ccode::Ccode(string filename, string temp, bool f, string mask, int win, int num out2 << "Place in masked, filtered and trimmed sequence\tPlace in original alignment" << endl; out2.close(); #endif + } + catch(exception& e) { + m->errorOut(e, "Ccode", "Ccode"); + exit(1); + } } //*************************************************************************************************************** Ccode::~Ccode() { diff --git a/chimera.cpp b/chimera.cpp index 5412487..ed84397 100644 --- a/chimera.cpp +++ b/chimera.cpp @@ -8,6 +8,7 @@ */ #include "chimera.h" +#include "referencedb.h" //*************************************************************************************************************** //this is a vertical soft filter @@ -94,99 +95,123 @@ map Chimera::runFilter(Sequence* seq) { //*************************************************************************************************************** vector Chimera::readSeqs(string file) { try { - + vector container; int count = 0; length = 0; unaligned = false; - - m->mothurOut("Reading sequences from " + file + "..."); cout.flush(); + ReferenceDB* rdb = ReferenceDB::getInstance(); - #ifdef USE_MPI - int pid, processors; - vector positions; - int numSeqs; - int tag = 2001; - - MPI_Status status; - MPI_File inMPI; - MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are - MPI_Comm_size(MPI_COMM_WORLD, &processors); - - //char* inFileName = new char[file.length()]; - //memcpy(inFileName, file.c_str(), file.length()); + if (file == "saved") { - char inFileName[1024]; - strcpy(inFileName, file.c_str()); - - MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer - //delete inFileName; - - if (pid == 0) { - positions = m->setFilePosFasta(file, numSeqs); //fills MPIPos, returns numSeqs - - //send file positions to all processes - for(int i = 1; i < processors; i++) { - MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD); - MPI_Send(&positions[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD); - } - }else{ - MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); - positions.resize(numSeqs+1); - MPI_Recv(&positions[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status); + + m->mothurOutEndLine(); m->mothurOut("Using sequences from " + rdb->getSavedReference() + " that are saved in memory."); m->mothurOutEndLine(); + + for (int i = 0; i < rdb->referenceSeqs.size(); i++) { + Sequence* temp = new Sequence(rdb->referenceSeqs[i].getName(), rdb->referenceSeqs[i].getAligned()); + + if (count == 0) { length = temp->getAligned().length(); count++; } //gets first seqs length + else if (length != temp->getAligned().length()) { unaligned = true; } + + if (temp->getName() != "") { container.push_back(temp); } } - //read file - for(int i=0;igetSavedReference(); - if (m->control_pressed) { MPI_File_close(&inMPI); return container; } - - //read next sequence - int seqlength = positions[i+1] - positions[i]; - char* buf4 = new char[seqlength]; + }else { + + m->mothurOut("Reading sequences from " + file + "..."); cout.flush(); + + #ifdef USE_MPI + int pid, processors; + vector positions; + int numSeqs; + int tag = 2001; + + MPI_Status status; + MPI_File inMPI; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + MPI_Comm_size(MPI_COMM_WORLD, &processors); - MPI_File_read_at(inMPI, positions[i], buf4, seqlength, MPI_CHAR, &status); + //char* inFileName = new char[file.length()]; + //memcpy(inFileName, file.c_str(), file.length()); - string tempBuf = buf4; - if (tempBuf.length() > seqlength) { tempBuf = tempBuf.substr(0, seqlength); } - delete buf4; + char inFileName[1024]; + strcpy(inFileName, file.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + //delete inFileName; + + if (pid == 0) { + positions = m->setFilePosFasta(file, numSeqs); //fills MPIPos, returns numSeqs - istringstream iss (tempBuf,istringstream::in); + //send file positions to all processes + for(int i = 1; i < processors; i++) { + MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD); + MPI_Send(&positions[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD); + } + }else{ + MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); + positions.resize(numSeqs+1); + MPI_Recv(&positions[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status); + } + + //read file + for(int i=0;icontrol_pressed) { MPI_File_close(&inMPI); return container; } - Sequence* current = new Sequence(iss); - if (current->getName() != "") { - if (count == 0) { length = current->getAligned().length(); count++; } //gets first seqs length - else if (length != current->getAligned().length()) { unaligned = true; } + //read next sequence + int seqlength = positions[i+1] - positions[i]; + char* buf4 = new char[seqlength]; + + MPI_File_read_at(inMPI, positions[i], buf4, seqlength, MPI_CHAR, &status); + + string tempBuf = buf4; + if (tempBuf.length() > seqlength) { tempBuf = tempBuf.substr(0, seqlength); } + delete buf4; + + istringstream iss (tempBuf,istringstream::in); + Sequence* current = new Sequence(iss); + if (current->getName() != "") { + if (count == 0) { length = current->getAligned().length(); count++; } //gets first seqs length + else if (length != current->getAligned().length()) { unaligned = true; } + + container.push_back(current); + if (rdb->save) { rdb->referenceSeqs.push_back(*current); } + } + } + + MPI_File_close(&inMPI); + MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case + #else + + ifstream in; + m->openInputFile(file, in); + + //read in seqs and store in vector + while(!in.eof()){ + + if (m->control_pressed) { return container; } + + Sequence* current = new Sequence(in); m->gobble(in); + + if (count == 0) { length = current->getAligned().length(); count++; } //gets first seqs length + else if (length != current->getAligned().length()) { unaligned = true; } + + if (current->getName() != "") { container.push_back(current); + if (rdb->save) { rdb->referenceSeqs.push_back(*current); } } } - - MPI_File_close(&inMPI); - MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case - #else - - ifstream in; - m->openInputFile(file, in); + in.close(); + #endif - //read in seqs and store in vector - while(!in.eof()){ + m->mothurOut("Done."); m->mothurOutEndLine(); - if (m->control_pressed) { return container; } - - Sequence* current = new Sequence(in); m->gobble(in); - - if (count == 0) { length = current->getAligned().length(); count++; } //gets first seqs length - else if (length != current->getAligned().length()) { unaligned = true; } - - if (current->getName() != "") { container.push_back(current); } + filterString = (string(container[0]->getAligned().length(), '1')); } - in.close(); - #endif - - m->mothurOut("Done."); m->mothurOutEndLine(); - - filterString = (string(container[0]->getAligned().length(), '1')); return container; } diff --git a/chimeraccodecommand.cpp b/chimeraccodecommand.cpp index bc96e05..fd80ec9 100644 --- a/chimeraccodecommand.cpp +++ b/chimeraccodecommand.cpp @@ -9,6 +9,7 @@ #include "chimeraccodecommand.h" #include "ccode.h" +#include "referencedb.h" //********************************************************************************************************************** vector ChimeraCcodeCommand::setParameters(){ try { @@ -21,6 +22,7 @@ vector ChimeraCcodeCommand::setParameters(){ CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); + CommandParameter psave("save", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(psave); vector myArray; for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } @@ -49,6 +51,7 @@ string ChimeraCcodeCommand::getHelpString(){ helpString += "The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the your sequences. \n"; helpString += "The window parameter allows you to specify the window size for searching for chimeras. \n"; helpString += "The numwanted parameter allows you to specify how many sequences you would each query sequence compared with.\n"; + helpString += "If the save parameter is set to true the reference sequences will be saved in memory, to clear them later you can use the clear.memory command. Default=f."; helpString += "The chimera.ccode command should be in the following format: \n"; helpString += "chimera.ccode(fasta=yourFastaFile, reference=yourTemplate) \n"; helpString += "Example: chimera.ccode(fasta=AD.align, reference=core_set_aligned.imputed.fasta) \n"; @@ -79,6 +82,7 @@ ChimeraCcodeCommand::ChimeraCcodeCommand(){ ChimeraCcodeCommand::ChimeraCcodeCommand(string option) { try { abort = false; calledHelp = false; + ReferenceDB* rdb = ReferenceDB::getInstance(); //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } @@ -197,10 +201,6 @@ ChimeraCcodeCommand::ChimeraCcodeCommand(string option) { //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } - - templatefile = validParameter.validFile(parameters, "reference", true); - if (templatefile == "not open") { abort = true; } - else if (templatefile == "not found") { templatefile = ""; m->mothurOut("reference is a required parameter for the chimera.ccode command."); m->mothurOutEndLine(); abort = true; } maskfile = validParameter.validFile(parameters, "mask", false); if (maskfile == "not found") { maskfile = ""; } @@ -230,6 +230,28 @@ ChimeraCcodeCommand::ChimeraCcodeCommand(string option) { temp = validParameter.validFile(parameters, "numwanted", false); if (temp == "not found") { temp = "20"; } convert(temp, numwanted); + + temp = validParameter.validFile(parameters, "save", false); if (temp == "not found"){ temp = "f"; } + save = m->isTrue(temp); + rdb->save = save; + if (save) { //clear out old references + rdb->clearMemory(); + } + + //this has to go after save so that if the user sets save=t and provides no reference we abort + templatefile = validParameter.validFile(parameters, "reference", true); + if (templatefile == "not found") { + //check for saved reference sequences + if (rdb->referenceSeqs.size() != 0) { + templatefile = "saved"; + }else { + m->mothurOut("[ERROR]: You don't have any saved reference sequences and the reference parameter is a required."); + m->mothurOutEndLine(); + abort = true; + } + }else if (templatefile == "not open") { abort = true; } + else { if (save) { rdb->setSavedReference(templatefile); } } + } } diff --git a/chimeraccodecommand.h b/chimeraccodecommand.h index 2bf7a70..87f5ee8 100644 --- a/chimeraccodecommand.h +++ b/chimeraccodecommand.h @@ -50,7 +50,7 @@ private: int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector&); #endif - bool abort, filter; + bool abort, filter, save; string fastafile, templatefile, outputDir, maskfile; int processors, window, numwanted, numSeqs, templateSeqsLength; Chimera* chimera; diff --git a/chimeracheckcommand.cpp b/chimeracheckcommand.cpp index 5b249d7..584e987 100644 --- a/chimeracheckcommand.cpp +++ b/chimeracheckcommand.cpp @@ -8,6 +8,7 @@ */ #include "chimeracheckcommand.h" +#include "referencedb.h" //********************************************************************************************************************** vector ChimeraCheckCommand::setParameters(){ @@ -21,7 +22,8 @@ vector ChimeraCheckCommand::setParameters(){ CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); - + CommandParameter psave("save", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(psave); + vector myArray; for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } return myArray; @@ -50,6 +52,7 @@ string ChimeraCheckCommand::getHelpString(){ helpString += "The svg parameter allows you to specify whether or not you would like a svg file outputted for each query sequence, default is False.\n"; helpString += "The name parameter allows you to enter a file containing names of sequences you would like .svg files for.\n"; helpString += "You may enter multiple name files by separating their names with dashes. ie. fasta=abrecovery.svg.names-amzon.svg.names \n"; + helpString += "If the save parameter is set to true the reference sequences will be saved in memory, to clear them later you can use the clear.memory command. Default=f."; helpString += "The chimera.check command should be in the following format: \n"; helpString += "chimera.check(fasta=yourFastaFile, reference=yourTemplateFile, processors=yourProcessors, ksize=yourKmerSize) \n"; helpString += "Example: chimera.check(fasta=AD.fasta, reference=core_set_aligned,imputed.fasta, processors=4, ksize=8) \n"; @@ -77,7 +80,8 @@ ChimeraCheckCommand::ChimeraCheckCommand(){ //*************************************************************************************************************** ChimeraCheckCommand::ChimeraCheckCommand(string option) { try { - abort = false; calledHelp = false; + abort = false; calledHelp = false; + ReferenceDB* rdb = ReferenceDB::getInstance(); //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } @@ -195,10 +199,6 @@ ChimeraCheckCommand::ChimeraCheckCommand(string option) { //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } - - templatefile = validParameter.validFile(parameters, "reference", true); - if (templatefile == "not open") { abort = true; } - else if (templatefile == "not found") { templatefile = ""; m->mothurOut("reference is a required parameter for the chimera.check command."); m->mothurOutEndLine(); abort = true; } namefile = validParameter.validFile(parameters, "name", false); if (namefile == "not found") { namefile = ""; } @@ -283,6 +283,28 @@ ChimeraCheckCommand::ChimeraCheckCommand(string option) { m->setProcessors(temp); convert(temp, processors); + temp = validParameter.validFile(parameters, "save", false); if (temp == "not found"){ temp = "f"; } + save = m->isTrue(temp); + rdb->save = save; + if (save) { //clear out old references + rdb->clearMemory(); + } + + //this has to go after save so that if the user sets save=t and provides no reference we abort + templatefile = validParameter.validFile(parameters, "reference", true); + if (templatefile == "not found") { + //check for saved reference sequences + if (rdb->referenceSeqs.size() != 0) { + templatefile = "saved"; + }else { + m->mothurOut("[ERROR]: You don't have any saved reference sequences and the reference parameter is a required."); + m->mothurOutEndLine(); + abort = true; + } + }else if (templatefile == "not open") { abort = true; } + else { if (save) { rdb->setSavedReference(templatefile); } } + + temp = validParameter.validFile(parameters, "ksize", false); if (temp == "not found") { temp = "7"; } convert(temp, ksize); diff --git a/chimeracheckcommand.h b/chimeracheckcommand.h index d29744b..350e10e 100644 --- a/chimeracheckcommand.h +++ b/chimeracheckcommand.h @@ -53,7 +53,7 @@ private: int driverMPI(int, int, MPI_File&, MPI_File&, vector&); #endif - bool abort, svg; + bool abort, svg, save; string fastafile, templatefile, namefile, outputDir; int processors, increment, ksize, numSeqs, templateSeqsLength; Chimera* chimera; diff --git a/chimerapintailcommand.cpp b/chimerapintailcommand.cpp index 482bf7b..66fe65d 100644 --- a/chimerapintailcommand.cpp +++ b/chimerapintailcommand.cpp @@ -10,6 +10,7 @@ #include "chimerapintailcommand.h" #include "pintail.h" + //********************************************************************************************************************** vector ChimeraPintailCommand::setParameters(){ try { @@ -24,7 +25,8 @@ vector ChimeraPintailCommand::setParameters(){ CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); - + CommandParameter psave("save", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(psave); + vector myArray; for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } return myArray; @@ -50,6 +52,7 @@ string ChimeraPintailCommand::getHelpString(){ #ifdef USE_MPI helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n"; #endif + helpString += "If the save parameter is set to true the reference sequences will be saved in memory, to clear them later you can use the clear.memory command. Default=f."; helpString += "The window parameter allows you to specify the window size for searching for chimeras, default=300. \n"; helpString += "The increment parameter allows you to specify how far you move each window while finding chimeric sequences, default=25.\n"; helpString += "The conservation parameter allows you to enter a frequency file containing the highest bases frequency at each place in the alignment.\n"; @@ -83,6 +86,7 @@ ChimeraPintailCommand::ChimeraPintailCommand(){ ChimeraPintailCommand::ChimeraPintailCommand(string option) { try { abort = false; calledHelp = false; + rdb = ReferenceDB::getInstance(); //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } @@ -229,6 +233,28 @@ ChimeraPintailCommand::ChimeraPintailCommand(string option) { temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "25"; } convert(temp, increment); + temp = validParameter.validFile(parameters, "save", false); if (temp == "not found"){ temp = "f"; } + save = m->isTrue(temp); + rdb->save = save; + if (save) { //clear out old references + rdb->clearMemory(); + } + + //this has to go after save so that if the user sets save=t and provides no reference we abort + templatefile = validParameter.validFile(parameters, "reference", true); + if (templatefile == "not found") { + //check for saved reference sequences + if (rdb->referenceSeqs.size() != 0) { + templatefile = "saved"; + }else { + m->mothurOut("[ERROR]: You don't have any saved reference sequences and the reference parameter is a required."); + m->mothurOutEndLine(); + abort = true; + } + }else if (templatefile == "not open") { abort = true; } + else { if (save) { rdb->setSavedReference(templatefile); } } + + maskfile = validParameter.validFile(parameters, "mask", false); if (maskfile == "not found") { maskfile = ""; } else if (maskfile != "default") { @@ -274,10 +300,6 @@ ChimeraPintailCommand::ChimeraPintailCommand(string option) { //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } - templatefile = validParameter.validFile(parameters, "reference", true); - if (templatefile == "not open") { abort = true; } - else if (templatefile == "not found") { templatefile = ""; m->mothurOut("reference is a required parameter for the chimera.pintail command."); m->mothurOutEndLine(); abort = true; } - consfile = validParameter.validFile(parameters, "conservation", true); if (consfile == "not open") { abort = true; } else if (consfile == "not found") { @@ -329,15 +351,18 @@ int ChimeraPintailCommand::execute(){ if (maskfile == "default") { m->mothurOut("I am using the default 236627 EU009184.1 Shigella dysenteriae str. FBD013."); m->mothurOutEndLine(); } //check for quantile to save the time + string baseName = templatefile; + if (templatefile == "saved") { baseName = rdb->getSavedReference(); } + string tempQuan = ""; if ((!filter) && (maskfile == "")) { - tempQuan = inputDir + m->getRootName(m->getSimpleName(templatefile)) + "pintail.quan"; + tempQuan = inputDir + m->getRootName(m->getSimpleName(baseName)) + "pintail.quan"; }else if ((!filter) && (maskfile != "")) { - tempQuan = inputDir + m->getRootName(m->getSimpleName(templatefile)) + "pintail.masked.quan"; + tempQuan = inputDir + m->getRootName(m->getSimpleName(baseName)) + "pintail.masked.quan"; }else if ((filter) && (maskfile != "")) { - tempQuan = inputDir + m->getRootName(m->getSimpleName(templatefile)) + "pintail.filtered." + m->getSimpleName(m->getRootName(fastaFileNames[s])) + "masked.quan"; + tempQuan = inputDir + m->getRootName(m->getSimpleName(baseName)) + "pintail.filtered." + m->getSimpleName(m->getRootName(fastaFileNames[s])) + "masked.quan"; }else if ((filter) && (maskfile == "")) { - tempQuan = inputDir + m->getRootName(m->getSimpleName(templatefile)) + "pintail.filtered." + m->getSimpleName(m->getRootName(fastaFileNames[s])) + "quan"; + tempQuan = inputDir + m->getRootName(m->getSimpleName(baseName)) + "pintail.filtered." + m->getSimpleName(m->getRootName(fastaFileNames[s])) + "quan"; } ifstream FileTest(tempQuan.c_str()); @@ -350,13 +375,13 @@ int ChimeraPintailCommand::execute(){ string tryPath = m->getDefaultPath(); string tempQuan = ""; if ((!filter) && (maskfile == "")) { - tempQuan = tryPath + m->getRootName(m->getSimpleName(templatefile)) + "pintail.quan"; + tempQuan = tryPath + m->getRootName(m->getSimpleName(baseName)) + "pintail.quan"; }else if ((!filter) && (maskfile != "")) { - tempQuan = tryPath + m->getRootName(m->getSimpleName(templatefile)) + "pintail.masked.quan"; + tempQuan = tryPath + m->getRootName(m->getSimpleName(baseName)) + "pintail.masked.quan"; }else if ((filter) && (maskfile != "")) { - tempQuan = tryPath + m->getRootName(m->getSimpleName(templatefile)) + "pintail.filtered." + m->getSimpleName(m->getRootName(fastaFileNames[s])) + "masked.quan"; + tempQuan = tryPath + m->getRootName(m->getSimpleName(baseName)) + "pintail.filtered." + m->getSimpleName(m->getRootName(fastaFileNames[s])) + "masked.quan"; }else if ((filter) && (maskfile == "")) { - tempQuan = tryPath + m->getRootName(m->getSimpleName(templatefile)) + "pintail.filtered." + m->getSimpleName(m->getRootName(fastaFileNames[s])) + "quan"; + tempQuan = tryPath + m->getRootName(m->getSimpleName(baseName)) + "pintail.filtered." + m->getSimpleName(m->getRootName(fastaFileNames[s])) + "quan"; } ifstream FileTest2(tempQuan.c_str()); diff --git a/chimerapintailcommand.h b/chimerapintailcommand.h index acb0781..9e1d6f8 100644 --- a/chimerapintailcommand.h +++ b/chimerapintailcommand.h @@ -13,7 +13,7 @@ #include "mothur.h" #include "command.hpp" #include "chimera.h" - +#include "referencedb.h" /***********************************************************/ @@ -35,7 +35,8 @@ public: int execute(); void help() { m->mothurOut(getHelpString()); } private: - + ReferenceDB* rdb; + struct linePair { unsigned long int start; unsigned long int end; @@ -52,7 +53,7 @@ private: int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector&); #endif - bool abort, filter; + bool abort, filter, save; string fastafile, templatefile, consfile, quanfile, maskfile, outputDir, inputDir; int processors, window, increment, numSeqs, templateSeqsLength; Chimera* chimera; diff --git a/chimeraslayer.cpp b/chimeraslayer.cpp index 5d714fd..57528b4 100644 --- a/chimeraslayer.cpp +++ b/chimeraslayer.cpp @@ -44,6 +44,7 @@ int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int num } } //*************************************************************************************************************** +//template=self ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, map& prior, string mode, int k, int ms, int mms, int win, float div, int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera() { try { @@ -256,6 +257,11 @@ vector ChimeraSlayer::getTemplate(Sequence q, vector& user } } + //avoids nuisance error from formatdb for making blank blast database + if (userTemplate.size() == 0) { + return userTemplate; + } + string kmerDBNameLeft; string kmerDBNameRight; @@ -749,7 +755,6 @@ int ChimeraSlayer::getChimeras(Sequence* query) { else { thisTemplate = getTemplate(*query, thisFilteredTemplate); } //fills this template and creates the databases if (m->control_pressed) { return 0; } - if (thisTemplate.size() == 0) { return 0; } //not chimeric //moved this out of maligner - 4/29/11 diff --git a/chimeraslayercommand.cpp b/chimeraslayercommand.cpp index 636e175..c846a83 100644 --- a/chimeraslayercommand.cpp +++ b/chimeraslayercommand.cpp @@ -10,6 +10,7 @@ #include "chimeraslayercommand.h" #include "chimeraslayer.h" #include "deconvolutecommand.h" +#include "referencedb.h" //********************************************************************************************************************** vector ChimeraSlayerCommand::setParameters(){ @@ -37,7 +38,8 @@ vector ChimeraSlayerCommand::setParameters(){ CommandParameter pincrement("increment", "Number", "", "5", "", "", "",false,false); parameters.push_back(pincrement); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); - + CommandParameter psave("save", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(psave); + vector myArray; for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } return myArray; @@ -55,7 +57,7 @@ string ChimeraSlayerCommand::getHelpString(){ helpString += "This command was modeled after the chimeraSlayer written by the Broad Institute.\n"; helpString += "The chimera.slayer command parameters are fasta, name, template, processors, trim, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment, numwanted, and realign.\n"; helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n"; - helpString += "The name parameter allows you to provide a name file, if you are using template=self. \n"; + helpString += "The name parameter allows you to provide a name file, if you are using reference=self. \n"; helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n"; helpString += "The reference parameter allows you to enter a reference file containing known non-chimeric sequences, and is required. You may also set template=self, in this case the abundant sequences will be used as potential parents. \n"; helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"; @@ -79,6 +81,7 @@ string ChimeraSlayerCommand::getHelpString(){ helpString += "The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 10) \n"; helpString += "The search parameter allows you to specify search method for finding the closest parent. Choices are blast, and kmer, default blast. \n"; helpString += "The realign parameter allows you to realign the query to the potential parents. Choices are true or false, default true. \n"; + helpString += "If the save parameter is set to true the reference sequences will be saved in memory, to clear them later you can use the clear.memory command. Default=f."; helpString += "The chimera.slayer command should be in the following format: \n"; helpString += "chimera.slayer(fasta=yourFastaFile, reference=yourTemplate, search=yourSearch) \n"; helpString += "Example: chimera.slayer(fasta=AD.align, reference=core_set_aligned.imputed.fasta, search=kmer) \n"; @@ -109,6 +112,7 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(){ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { try { abort = false; calledHelp = false; + ReferenceDB* rdb = ReferenceDB::getInstance(); //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } @@ -296,12 +300,29 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } + string temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } + m->setProcessors(temp); + convert(temp, processors); + + temp = validParameter.validFile(parameters, "save", false); if (temp == "not found"){ temp = "f"; } + save = m->isTrue(temp); + rdb->save = save; + if (save) { //clear out old references + rdb->clearMemory(); + } string path; it = parameters.find("reference"); //user has given a template file if(it != parameters.end()){ - if (it->second == "self") { templatefile = "self"; } + if (it->second == "self") { + templatefile = "self"; + if (save) { + m->mothurOut("[WARNING]: You can't save reference=self, ignoring save."); + m->mothurOutEndLine(); + save = false; + } + } else { path = m->hasPath(it->second); //if the user has not given a path then, add inputdir. else leave path alone. @@ -309,14 +330,34 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { templatefile = validParameter.validFile(parameters, "reference", true); if (templatefile == "not open") { abort = true; } - else if (templatefile == "not found") { templatefile = ""; m->mothurOut("reference is a required parameter for the chimera.slayer command, unless and namefile is given."); m->mothurOutEndLine(); abort = true; } + else if (templatefile == "not found") { //check for saved reference sequences + if (rdb->referenceSeqs.size() != 0) { + templatefile = "saved"; + }else { + m->mothurOut("[ERROR]: You don't have any saved reference sequences and the reference parameter is a required."); + m->mothurOutEndLine(); + abort = true; + } + }else { if (save) { rdb->setSavedReference(templatefile); } } + } + }else if (hasName) { templatefile = "self"; + if (save) { + m->mothurOut("[WARNING]: You can't save reference=self, ignoring save."); + m->mothurOutEndLine(); + save = false; } - }else if (hasName) { templatefile = "self"; } - else { templatefile = ""; m->mothurOut("reference is a required parameter for the chimera.slayer command."); m->mothurOutEndLine(); abort = true; } + } + else { + if (rdb->referenceSeqs.size() != 0) { + templatefile = "saved"; + }else { + m->mothurOut("[ERROR]: You don't have any saved reference sequences and the reference parameter is a required."); + m->mothurOutEndLine(); + templatefile = ""; abort = true; + } + } + - string temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } - m->setProcessors(temp); - convert(temp, processors); temp = validParameter.validFile(parameters, "ksize", false); if (temp == "not found") { temp = "7"; } convert(temp, ksize); @@ -369,6 +410,8 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { convert(temp, numwanted); if ((search != "blast") && (search != "kmer")) { m->mothurOut(search + " is not a valid search."); m->mothurOutEndLine(); abort = true; } + + if (hasName && (templatefile != "self")) { m->mothurOut("You have provided a namefile and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; } } } catch(exception& e) { @@ -381,7 +424,7 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { int ChimeraSlayerCommand::execute(){ try{ if (abort == true) { if (calledHelp) { return 0; } return 2; } - + for (int s = 0; s < fastaFileNames.size(); s++) { m->mothurOut("Checking sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine(); diff --git a/chimeraslayercommand.h b/chimeraslayercommand.h index 35ff762..cfa934a 100644 --- a/chimeraslayercommand.h +++ b/chimeraslayercommand.h @@ -52,7 +52,7 @@ private: int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector&); #endif - bool abort, realign, trim, trimera; + bool abort, realign, trim, trimera, save; string fastafile, templatefile, outputDir, search, namefile; int processors, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity, minCoverage, minBS, minSNP, numSeqs, templateSeqsLength; float divR; diff --git a/chimerauchimecommand.cpp b/chimerauchimecommand.cpp index 05ddaf6..b42a491 100644 --- a/chimerauchimecommand.cpp +++ b/chimerauchimecommand.cpp @@ -11,6 +11,7 @@ #include "deconvolutecommand.h" #include "uc.h" #include "sequence.hpp" +#include "referencedb.h" //********************************************************************************************************************** @@ -40,7 +41,7 @@ vector ChimeraUchimeCommand::setParameters(){ CommandParameter pmaxlen("maxlen", "Number", "", "10000", "", "", "",false,false); parameters.push_back(pmaxlen); CommandParameter pucl("ucl", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pucl); CommandParameter pqueryfract("queryfract", "Number", "", "0.5", "", "", "",false,false); parameters.push_back(pqueryfract); - + vector myArray; for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } return myArray; @@ -112,7 +113,8 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(){ //*************************************************************************************************************** ChimeraUchimeCommand::ChimeraUchimeCommand(string option) { try { - abort = false; calledHelp = false; + abort = false; calledHelp = false; + ReferenceDB* rdb = ReferenceDB::getInstance(); //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } @@ -300,7 +302,6 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(string option) { //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } - string path; it = parameters.find("reference"); //user has given a template file @@ -313,12 +314,29 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(string option) { templatefile = validParameter.validFile(parameters, "reference", true); if (templatefile == "not open") { abort = true; } - else if (templatefile == "not found") { templatefile = ""; m->mothurOut("reference is a required parameter for the chimera.uchime command."); m->mothurOutEndLine(); abort = true; } + else if (templatefile == "not found") { //check for saved reference sequences + if (rdb->getSavedReference() != "") { + templatefile = rdb->getSavedReference(); + m->mothurOutEndLine(); m->mothurOut("Using sequences from " + rdb->getSavedReference() + "."); m->mothurOutEndLine(); + }else { + m->mothurOut("[ERROR]: You don't have any saved reference sequences and the reference parameter is a required."); + m->mothurOutEndLine(); + abort = true; + } + } } }else if (hasName) { templatefile = "self"; } - else { templatefile = ""; m->mothurOut("reference is a required parameter for the chimera.uchime command, unless you have a namefile."); m->mothurOutEndLine(); abort = true; } - - + else { + if (rdb->getSavedReference() != "") { + templatefile = rdb->getSavedReference(); + m->mothurOutEndLine(); m->mothurOut("Using sequences from " + rdb->getSavedReference() + "."); m->mothurOutEndLine(); + }else { + m->mothurOut("[ERROR]: You don't have any saved reference sequences and the reference parameter is a required."); + m->mothurOutEndLine(); + templatefile = ""; abort = true; + } + } + string temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); convert(temp, processors); @@ -353,7 +371,8 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(string option) { temp = validParameter.validFile(parameters, "skipgaps2", false); if (temp == "not found") { temp = "t"; } skipgaps2 = m->isTrue(temp); - + + if (hasName && (templatefile != "self")) { m->mothurOut("You have provided a namefile and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; } } } catch(exception& e) { diff --git a/classify.cpp b/classify.cpp index 925deda..7770999 100644 --- a/classify.cpp +++ b/classify.cpp @@ -13,157 +13,223 @@ #include "suffixdb.hpp" #include "blastdb.hpp" #include "distancedb.hpp" +#include "referencedb.h" /**************************************************************************************************/ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) { try { - taxFile = tfile; - readTaxonomy(taxFile); + ReferenceDB* rdb = ReferenceDB::getInstance(); - templateFile = tempFile; + if (tfile == "saved") { tfile = rdb->getSavedTaxonomy(); } - int start = time(NULL); + taxFile = tfile; + readTaxonomy(taxFile); int numSeqs = 0; - m->mothurOut("Generating search database... "); cout.flush(); -#ifdef USE_MPI - int pid, processors; - vector positions; - int tag = 2001; - - MPI_Status status; - MPI_File inMPI; - MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are - MPI_Comm_size(MPI_COMM_WORLD, &processors); - - //char* inFileName = new char[tempFile.length()]; - //memcpy(inFileName, tempFile.c_str(), tempFile.length()); + if (tempFile == "saved") { + int start = time(NULL); + m->mothurOutEndLine(); m->mothurOut("Using sequences from " + rdb->getSavedReference() + " that are saved in memory."); m->mothurOutEndLine(); + + numSeqs = rdb->referenceSeqs.size(); + templateFile = rdb->getSavedReference(); + tempFile = rdb->getSavedReference(); + + bool needToGenerate = true; + string kmerDBName; + if(method == "kmer") { + database = new KmerDB(tempFile, kmerSize); + + kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; + ifstream kmerFileTest(kmerDBName.c_str()); + if(kmerFileTest){ + bool GoodFile = m->checkReleaseVersion(kmerFileTest, m->getVersion()); + if (GoodFile) { needToGenerate = false; } + } + } + else if(method == "suffix") { database = new SuffixDB(numSeqs); } + else if(method == "blast") { database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch); } + else if(method == "distance") { database = new DistanceDB(); } + else { + m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); + m->mothurOutEndLine(); + database = new KmerDB(tempFile, 8); + } + + if (needToGenerate) { + for (int k = 0; k < rdb->referenceSeqs.size(); k++) { + Sequence temp(rdb->referenceSeqs[k].getName(), rdb->referenceSeqs[k].getAligned()); + names.push_back(temp.getName()); + database->addSequence(temp); + } + database->generateDB(); + }else if ((method == "kmer") && (!needToGenerate)) { + ifstream kmerFileTest(kmerDBName.c_str()); + database->readKmerDB(kmerFileTest); + + for (int k = 0; k < rdb->referenceSeqs.size(); k++) { + names.push_back(rdb->referenceSeqs[k].getName()); + } + } + + database->setNumSeqs(numSeqs); + + //sanity check + bool okay = phyloTree->ErrorCheck(names); + + if (!okay) { m->control_pressed = true; } - char inFileName[1024]; - strcpy(inFileName, tempFile.c_str()); + m->mothurOut("It took " + toString(time(NULL) - start) + " to load " + toString(rdb->referenceSeqs.size()) + " sequences and generate the search databases.");m->mothurOutEndLine(); + + }else { + + templateFile = tempFile; + + int start = time(NULL); + + m->mothurOut("Generating search database... "); cout.flush(); + #ifdef USE_MPI + int pid, processors; + vector positions; + int tag = 2001; + + MPI_Status status; + MPI_File inMPI; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + MPI_Comm_size(MPI_COMM_WORLD, &processors); + + //char* inFileName = new char[tempFile.length()]; + //memcpy(inFileName, tempFile.c_str(), tempFile.length()); + + char inFileName[1024]; + strcpy(inFileName, tempFile.c_str()); - MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer - //delete inFileName; + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + //delete inFileName; - if (pid == 0) { //only one process needs to scan file - positions = m->setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs + if (pid == 0) { //only one process needs to scan file + positions = m->setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs - //send file positions to all processes - for(int i = 1; i < processors; i++) { - MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD); - MPI_Send(&positions[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD); + //send file positions to all processes + for(int i = 1; i < processors; i++) { + MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD); + MPI_Send(&positions[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD); + } + }else{ + MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); + positions.resize(numSeqs+1); + MPI_Recv(&positions[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status); } - }else{ - MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); - positions.resize(numSeqs+1); - MPI_Recv(&positions[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status); - } + + //create database + if(method == "kmer") { database = new KmerDB(tempFile, kmerSize); } + else if(method == "suffix") { database = new SuffixDB(numSeqs); } + else if(method == "blast") { database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch); } + else if(method == "distance") { database = new DistanceDB(); } + else { + m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine(); + database = new KmerDB(tempFile, 8); + } + + //read file + for(int i=0;i length) { tempBuf = tempBuf.substr(0, length); } + delete buf4; + istringstream iss (tempBuf,istringstream::in); + + Sequence temp(iss); + if (temp.getName() != "") { + if (rdb->save) { rdb->referenceSeqs.push_back(temp); } + names.push_back(temp.getName()); + database->addSequence(temp); + } + } + + database->generateDB(); + MPI_File_close(&inMPI); + MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case + #else - //create database - if(method == "kmer") { database = new KmerDB(tempFile, kmerSize); } + //need to know number of template seqs for suffixdb + if (method == "suffix") { + ifstream inFASTA; + m->openInputFile(tempFile, inFASTA); + m->getNumSeqs(inFASTA, numSeqs); + inFASTA.close(); + } + + bool needToGenerate = true; + string kmerDBName; + if(method == "kmer") { + database = new KmerDB(tempFile, kmerSize); + + kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; + ifstream kmerFileTest(kmerDBName.c_str()); + if(kmerFileTest){ + bool GoodFile = m->checkReleaseVersion(kmerFileTest, m->getVersion()); + if (GoodFile) { needToGenerate = false; } + } + } else if(method == "suffix") { database = new SuffixDB(numSeqs); } else if(method == "blast") { database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch); } else if(method == "distance") { database = new DistanceDB(); } else { - m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine(); + m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); + m->mothurOutEndLine(); database = new KmerDB(tempFile, 8); } - - //read file - for(int i=0;i length) { tempBuf = tempBuf.substr(0, length); } - delete buf4; - istringstream iss (tempBuf,istringstream::in); + + if (needToGenerate) { + ifstream fastaFile; + m->openInputFile(tempFile, fastaFile); - Sequence temp(iss); - if (temp.getName() != "") { + while (!fastaFile.eof()) { + Sequence temp(fastaFile); + m->gobble(fastaFile); + + if (rdb->save) { rdb->referenceSeqs.push_back(temp); } + names.push_back(temp.getName()); + database->addSequence(temp); } - } - - database->generateDB(); - MPI_File_close(&inMPI); - MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case - #else - - //need to know number of template seqs for suffixdb - if (method == "suffix") { - ifstream inFASTA; - m->openInputFile(tempFile, inFASTA); - m->getNumSeqs(inFASTA, numSeqs); - inFASTA.close(); - } + fastaFile.close(); - bool needToGenerate = true; - string kmerDBName; - if(method == "kmer") { - database = new KmerDB(tempFile, kmerSize); + database->generateDB(); + + }else if ((method == "kmer") && (!needToGenerate)) { + ifstream kmerFileTest(kmerDBName.c_str()); + database->readKmerDB(kmerFileTest); - kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; - ifstream kmerFileTest(kmerDBName.c_str()); - if(kmerFileTest){ - bool GoodFile = m->checkReleaseVersion(kmerFileTest, m->getVersion()); - if (GoodFile) { needToGenerate = false; } + ifstream fastaFile; + m->openInputFile(tempFile, fastaFile); + + while (!fastaFile.eof()) { + Sequence temp(fastaFile); + m->gobble(fastaFile); + + if (rdb->save) { rdb->referenceSeqs.push_back(temp); } + names.push_back(temp.getName()); + } + fastaFile.close(); } - } - else if(method == "suffix") { database = new SuffixDB(numSeqs); } - else if(method == "blast") { database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch); } - else if(method == "distance") { database = new DistanceDB(); } - else { - m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); - m->mothurOutEndLine(); - database = new KmerDB(tempFile, 8); - } + #endif - if (needToGenerate) { - ifstream fastaFile; - m->openInputFile(tempFile, fastaFile); + database->setNumSeqs(names.size()); - while (!fastaFile.eof()) { - Sequence temp(fastaFile); - m->gobble(fastaFile); + //sanity check + bool okay = phyloTree->ErrorCheck(names); - names.push_back(temp.getName()); - - database->addSequence(temp); - } - fastaFile.close(); - - database->generateDB(); + if (!okay) { m->control_pressed = true; } - }else if ((method == "kmer") && (!needToGenerate)) { - ifstream kmerFileTest(kmerDBName.c_str()); - database->readKmerDB(kmerFileTest); - - ifstream fastaFile; - m->openInputFile(tempFile, fastaFile); - - while (!fastaFile.eof()) { - Sequence temp(fastaFile); - m->gobble(fastaFile); - - names.push_back(temp.getName()); - } - fastaFile.close(); + m->mothurOut("DONE."); m->mothurOutEndLine(); + m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine(); } -#endif - - database->setNumSeqs(names.size()); - - //sanity check - bool okay = phyloTree->ErrorCheck(names); - - if (!okay) { m->control_pressed = true; } - - m->mothurOut("DONE."); m->mothurOutEndLine(); - m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine(); } catch(exception& e) { diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index 72ca1d0..f1b3a0e 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -15,6 +15,7 @@ #include "knn.h" + //********************************************************************************************************************** vector ClassifySeqsCommand::setParameters(){ try { @@ -34,6 +35,7 @@ vector ClassifySeqsCommand::setParameters(){ CommandParameter pcutoff("cutoff", "Number", "", "0", "", "", "",false,true); parameters.push_back(pcutoff); CommandParameter pprobs("probs", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pprobs); CommandParameter piters("iters", "Number", "", "100", "", "", "",false,true); parameters.push_back(piters); + CommandParameter psave("save", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(psave); CommandParameter pnumwanted("numwanted", "Number", "", "10", "", "", "",false,true); parameters.push_back(pnumwanted); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -63,6 +65,7 @@ string ClassifySeqsCommand::getHelpString(){ #ifdef USE_MPI helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n"; #endif + helpString += "If the save parameter is set to true the reference sequences will be saved in memory, to clear them later you can use the clear.memory command. Default=f."; helpString += "The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n"; helpString += "The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.\n"; helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n"; @@ -103,6 +106,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(){ ClassifySeqsCommand::ClassifySeqsCommand(string option) { try { abort = false; calledHelp = false; + rdb = ReferenceDB::getInstance(); //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } @@ -161,16 +165,6 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { } } - //check for required parameters - templateFileName = validParameter.validFile(parameters, "reference", true); - if (templateFileName == "not found") { - m->mothurOut("reference is a required parameter for the classify.seqs command."); - m->mothurOutEndLine(); - abort = true; - } - else if (templateFileName == "not open") { abort = true; } - - fastaFileName = validParameter.validFile(parameters, "fasta", false); if (fastaFileName == "not found") { //if there is a current fasta file, use it @@ -250,16 +244,6 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { if (fastaFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; } } - - taxonomyFileName = validParameter.validFile(parameters, "taxonomy", true); - if (taxonomyFileName == "not found") { - m->mothurOut("taxonomy is a required parameter for the classify.seqs command."); - m->mothurOutEndLine(); - abort = true; - } - else if (taxonomyFileName == "not open") { abort = true; } - - namefile = validParameter.validFile(parameters, "name", false); if (namefile == "not found") { namefile = ""; } @@ -397,6 +381,41 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { temp = validParameter.validFile(parameters, "ksize", false); if (temp == "not found"){ temp = "8"; } convert(temp, kmerSize); + temp = validParameter.validFile(parameters, "save", false); if (temp == "not found"){ temp = "f"; } + save = m->isTrue(temp); + rdb->save = save; + if (save) { //clear out old references + rdb->clearMemory(); + } + + //this has to go after save so that if the user sets save=t and provides no reference we abort + templateFileName = validParameter.validFile(parameters, "reference", true); + if (templateFileName == "not found") { + //check for saved reference sequences + if (rdb->referenceSeqs.size() != 0) { + templateFileName = "saved"; + }else { + m->mothurOut("[ERROR]: You don't have any saved reference sequences and the reference parameter is a required for the classify.seqs command."); + m->mothurOutEndLine(); + abort = true; + } + }else if (templateFileName == "not open") { abort = true; } + else { if (save) { rdb->setSavedReference(templateFileName); } } + + //this has to go after save so that if the user sets save=t and provides no reference we abort + taxonomyFileName = validParameter.validFile(parameters, "taxonomy", true); + if (taxonomyFileName == "not found") { + //check for saved reference sequences + if (rdb->wordGenusProb.size() != 0) { + taxonomyFileName = "saved"; + }else { + m->mothurOut("[ERROR]: You don't have any saved taxonomy information and the taxonomy parameter is a required for the classify.seqs command."); + m->mothurOutEndLine(); + abort = true; + } + }else if (taxonomyFileName == "not open") { abort = true; } + else { if (save) { rdb->setSavedTaxonomy(taxonomyFileName); } } + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); convert(temp, processors); @@ -471,7 +490,10 @@ int ClassifySeqsCommand::execute(){ m->mothurOut("Classifying sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine(); - string RippedTaxName = m->getRootName(m->getSimpleName(taxonomyFileName)); + string baseTName = taxonomyFileName; + if (taxonomyFileName == "saved") {baseTName = rdb->getSavedTaxonomy(); } + + string RippedTaxName = m->getRootName(m->getSimpleName(baseTName)); RippedTaxName = m->getExtension(RippedTaxName.substr(0, RippedTaxName.length()-1)); if (RippedTaxName[0] == '.') { RippedTaxName = RippedTaxName.substr(1, RippedTaxName.length()); } RippedTaxName += "."; @@ -636,7 +658,7 @@ int ClassifySeqsCommand::execute(){ string group = ""; if (groupfile != "") { group = groupfileNames[s]; } - PhyloSummary taxaSum(taxonomyFileName, group); + PhyloSummary taxaSum(baseTName, group); if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete classify; return 0; } diff --git a/classifyseqscommand.h b/classifyseqscommand.h index 9c8dbca..3dbf06e 100644 --- a/classifyseqscommand.h +++ b/classifyseqscommand.h @@ -13,6 +13,7 @@ #include "mothur.h" #include "command.hpp" #include "classify.h" +#include "referencedb.h" //KNN and Bayesian methods modeled from algorithms in //Naı¨ve Bayesian Classifier for Rapid Assignment of rRNA Sequences @@ -60,11 +61,12 @@ private: map >::iterator itNames; Classify* classify; + ReferenceDB* rdb; string fastaFileName, templateFileName, distanceFileName, namefile, search, method, taxonomyFileName, outputDir, groupfile; int processors, kmerSize, numWanted, cutoff, iters; float match, misMatch, gapOpen, gapExtend; - bool abort, probs; + bool abort, probs, save; int driver(linePair*, string, string, string); void appendTaxFiles(string, string); diff --git a/clearmemorycommand.cpp b/clearmemorycommand.cpp new file mode 100644 index 0000000..abdc555 --- /dev/null +++ b/clearmemorycommand.cpp @@ -0,0 +1,70 @@ +/* + * clearmemorycommand.cpp + * Mothur + * + * Created by westcott on 7/6/11. + * Copyright 2011 Schloss Lab. All rights reserved. + * + */ + +#include "clearmemorycommand.h" +#include "referencedb.h" + +//********************************************************************************************************************** +vector ClearMemoryCommand::setParameters(){ + try { + vector myArray; + return myArray; + } + catch(exception& e) { + m->errorOut(e, "ClearMemoryCommand", "setParameters"); + exit(1); + } +} +//********************************************************************************************************************** +string ClearMemoryCommand::getHelpString(){ + try { + string helpString = ""; + helpString += "The clear.memory command removes saved reference data from memory.\n"; + helpString += "The clear.memory command should be in the following format: clear.memory().\n"; + return helpString; + } + catch(exception& e) { + m->errorOut(e, "ClearMemoryCommand", "getHelpString"); + exit(1); + } +} +//********************************************************************************************************************** + +ClearMemoryCommand::ClearMemoryCommand(string option) { + try { + abort = false; calledHelp = false; + + //allow user to run help + if(option == "help") { help(); abort = true; calledHelp = true; } + else if(option == "citation") { citation(); abort = true; calledHelp = true;} + } + catch(exception& e) { + m->errorOut(e, "ClearMemoryCommand", "ClearMemoryCommand"); + exit(1); + } +} + +//********************************************************************************************************************** + +int ClearMemoryCommand::execute(){ + try { + + if (abort == true) { if (calledHelp) { return 0; } return 2; } + + ReferenceDB* rdb = ReferenceDB::getInstance(); + rdb->clearMemory(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ClearMemoryCommand", "execute"); + exit(1); + } +} +//**********************************************************************************************************************/ diff --git a/clearmemorycommand.h b/clearmemorycommand.h new file mode 100644 index 0000000..9017561 --- /dev/null +++ b/clearmemorycommand.h @@ -0,0 +1,39 @@ +#ifndef CLEARMEMORYCOMMAND_H +#define CLEARMEMORYCOMMAND_H + +/* + * clearmemorycommand.h + * Mothur + * + * Created by westcott on 7/6/11. + * Copyright 2011 Schloss Lab. All rights reserved. + * + */ + +#include "command.hpp" + +class ClearMemoryCommand : public Command { +public: + ClearMemoryCommand(string); + ClearMemoryCommand(){ abort = true; calledHelp = true; } + ~ClearMemoryCommand(){} + + vector setParameters(); + string getCommandName() { return "clear.memory"; } + string getCommandCategory() { return "General"; } + string getHelpString(); + string getCitation() { return "http://www.mothur.org/wiki/Clear.memory"; } + string getDescription() { return "remove saved references from memory"; } + + + int execute(); + void help() { m->mothurOut(getHelpString()); } + + +private: + bool abort; + vector outputNames; +}; + +#endif + diff --git a/commandfactory.cpp b/commandfactory.cpp index 620536d..5aaa7b7 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -120,6 +120,7 @@ #include "getcommandinfocommand.h" #include "deuniquetreecommand.h" #include "countseqscommand.h" +#include "clearmemorycommand.h" /*******************************************************/ @@ -243,6 +244,7 @@ CommandFactory::CommandFactory(){ commands["get.commandinfo"] = "get.commandinfo"; commands["deunique.tree"] = "deunique.tree"; commands["count.seqs"] = "count.seqs"; + commands["clear.memory"] = "clear.memory"; commands["pairwise.seqs"] = "MPIEnabled"; commands["pipeline.pds"] = "MPIEnabled"; commands["classify.seqs"] = "MPIEnabled"; @@ -415,6 +417,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "get.commandinfo") { command = new GetCommandInfoCommand(optionString); } else if(commandName == "deunique.tree") { command = new DeuniqueTreeCommand(optionString); } else if(commandName == "count.seqs") { command = new CountSeqsCommand(optionString); } + else if(commandName == "clear.memory") { command = new ClearMemoryCommand(optionString); } else { command = new NoCommand(optionString); } return command; @@ -553,6 +556,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString, str else if(commandName == "get.commandinfo") { pipecommand = new GetCommandInfoCommand(optionString); } else if(commandName == "deunique.tree") { pipecommand = new DeuniqueTreeCommand(optionString); } else if(commandName == "count.seqs") { pipecommand = new CountSeqsCommand(optionString); } + else if(commandName == "clear.memory") { pipecommand = new ClearMemoryCommand(optionString); } else { pipecommand = new NoCommand(optionString); } return pipecommand; @@ -679,6 +683,7 @@ Command* CommandFactory::getCommand(string commandName){ else if(commandName == "get.commandinfo") { shellcommand = new GetCommandInfoCommand(); } else if(commandName == "deunique.tree") { shellcommand = new DeuniqueTreeCommand(); } else if(commandName == "count.seqs") { shellcommand = new CountSeqsCommand(); } + else if(commandName == "clear.memory") { shellcommand = new ClearMemoryCommand(); } else { shellcommand = new NoCommand(); } return shellcommand; diff --git a/getsharedotucommand.cpp b/getsharedotucommand.cpp index 06e199f..046460d 100644 --- a/getsharedotucommand.cpp +++ b/getsharedotucommand.cpp @@ -8,6 +8,7 @@ */ #include "getsharedotucommand.h" +#include "sharedutilities.h" //********************************************************************************************************************** vector GetSharedOTUCommand::setParameters(){ @@ -226,6 +227,10 @@ int GetSharedOTUCommand::execute(){ userGroups = "unique."; for(int i = 0; i < Groups.size(); i++) { userGroups += Groups[i] + "-"; } userGroups = userGroups.substr(0, userGroups.length()-1); + }else{ + //sanity check for group names + SharedUtil util; + util.setGroups(Groups, groupMap->namesOfGroups); } //put groups in map to find easier diff --git a/makefile b/makefile index df165b3..fa214c4 100644 --- a/makefile +++ b/makefile @@ -14,9 +14,9 @@ USEMPI ?= no USEREADLINE ?= yes CYGWIN_BUILD ?= no USECOMPRESSION ?= no -MOTHUR_FILES="\"../release\"" -RELEASE_DATE = "\"2/7/2011\"" -VERSION = "\"1.16.0\"" +MOTHUR_FILES="\"Enter_your_default_path_here\"" +RELEASE_DATE = "\"7/5/2011\"" +VERSION = "\"1.20.0\"" # Optimize to level 3: CXXFLAGS += -O3 diff --git a/mothur.cpp b/mothur.cpp index b551302..5838811 100644 --- a/mothur.cpp +++ b/mothur.cpp @@ -10,12 +10,13 @@ #include "mothur.h" #include "engine.hpp" #include "mothurout.h" +#include "referencedb.h" /**************************************************************************************************/ CommandFactory* CommandFactory::_uniqueInstance = 0; MothurOut* MothurOut::_uniqueInstance = 0; - +ReferenceDB* ReferenceDB::myInstance = 0; /***********************************************************************/ volatile int ctrlc_pressed = 0; void ctrlc_handler ( int sig ) { diff --git a/mothurout.h b/mothurout.h index 7bbd315..f68e453 100644 --- a/mothurout.h +++ b/mothurout.h @@ -12,7 +12,6 @@ #include "mothur.h" - /***********************************************/ class MothurOut { @@ -183,7 +182,7 @@ class MothurOut { gui = false; printedHeaders = false; sharedHeaderMode = ""; - }; + } ~MothurOut(); string logFileName; diff --git a/myutils.cpp b/myutils.cpp index d0d3817..786316e 100755 --- a/myutils.cpp +++ b/myutils.cpp @@ -954,7 +954,7 @@ void ProgressExit() // Skip exit(), which can be very slow in DEBUG build // VERY DANGEROUS practice, because it skips global destructors. // But if you know the rules, you can break 'em, right? - ExitProcess(0); + //ExitProcess(0); #endif } @@ -1531,7 +1531,7 @@ static void GetArgsFromFile(const string &FileName, vector &Args) void MyCmdLine(int argc, char **argv) { - g_Opts.clear(); + g_Opts.clear(); g_Argv.clear(); static unsigned RecurseDepth = 0; ++RecurseDepth; diff --git a/referencedb.cpp b/referencedb.cpp new file mode 100644 index 0000000..fb930ef --- /dev/null +++ b/referencedb.cpp @@ -0,0 +1,30 @@ +/* + * referencedb.cpp + * Mothur + * + * Created by westcott on 6/29/11. + * Copyright 2011 Schloss Lab. All rights reserved. + * + */ + +#include "referencedb.h" + +/******************************************************/ +ReferenceDB* ReferenceDB::getInstance() { + if(myInstance == NULL) { + myInstance = new ReferenceDB(); + } + return myInstance; + } +/******************************************************/ +void ReferenceDB::clearMemory() { + referenceSeqs.clear(); + setSavedReference(""); + for(int i = 0; i < wordGenusProb.size(); i++) { wordGenusProb[i].clear(); } + wordGenusProb.clear(); + setSavedTaxonomy(""); +} +/******************************************************* +ReferenceDB::~ReferenceDB() { myInstance = NULL; } +/*******************************************************/ + diff --git a/referencedb.h b/referencedb.h new file mode 100644 index 0000000..2f29247 --- /dev/null +++ b/referencedb.h @@ -0,0 +1,48 @@ +#ifndef MYREFERENCEDB_H +#define MYREFERENCEDB_H + +/* + * referencedb.h + * Mothur + * + * Created by westcott on 6/29/11. + * Copyright 2011 Schloss Lab. All rights reserved. + * + */ + + +#include "mothur.h" +#include "sequence.hpp" + +/***********************************************/ + +class ReferenceDB { + + public: + + static ReferenceDB* getInstance(); + void clearMemory(); + + bool save; + vector referenceSeqs; + vector< vector > wordGenusProb; + + string getSavedReference() { return referencefile; } + void setSavedReference(string p) { referencefile = p; } + string getSavedTaxonomy() { return taxonomyfile; } + void setSavedTaxonomy(string p) { taxonomyfile = p; } + + private: + + static ReferenceDB* myInstance; + ReferenceDB() { referencefile = ""; taxonomyfile = ""; save = false; } + ReferenceDB(const ReferenceDB&){}// Disable copy constructor + void operator=(const ReferenceDB&){} // Disable assignment operator + ~ReferenceDB(){ myInstance = 0; } + + string referencefile, taxonomyfile; +}; +/***********************************************/ + +#endif + diff --git a/seqerrorcommand.cpp b/seqerrorcommand.cpp index 007ac93..fc027b9 100644 --- a/seqerrorcommand.cpp +++ b/seqerrorcommand.cpp @@ -13,6 +13,7 @@ #include "refchimeratest.h" #include "filterseqscommand.h" + //********************************************************************************************************************** vector SeqErrorCommand::setParameters(){ try { @@ -24,6 +25,7 @@ vector SeqErrorCommand::setParameters(){ CommandParameter pignorechimeras("ignorechimeras", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pignorechimeras); CommandParameter pthreshold("threshold", "Number", "", "1.0", "", "", "",false,false); parameters.push_back(pthreshold); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); + CommandParameter psave("save", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(psave); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -41,6 +43,15 @@ string SeqErrorCommand::getHelpString(){ try { string helpString = ""; helpString += "The seq.error command reads a query alignment file and a reference alignment file and creates .....\n"; + helpString += "The fasta parameter...\n"; + helpString += "The reference parameter...\n"; + helpString += "The qfile parameter...\n"; + helpString += "The report parameter...\n"; + helpString += "The name parameter...\n"; + helpString += "The ignorechimeras parameter...\n"; + helpString += "The threshold parameter...\n"; + helpString += "The processors parameter...\n"; + helpString += "If the save parameter is set to true the reference sequences will be saved in memory, to clear them later you can use the clear.memory command. Default=f."; helpString += "Example seq.error(...).\n"; helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n"; helpString += "For more details please check out the wiki http://www.mothur.org/wiki/seq.error .\n"; @@ -55,6 +66,7 @@ string SeqErrorCommand::getHelpString(){ SeqErrorCommand::SeqErrorCommand(){ try { abort = true; calledHelp = true; + setParameters(); vector tempOutNames; outputTypes["error.summary"] = tempOutNames; outputTypes["error.seq"] = tempOutNames; @@ -77,6 +89,7 @@ SeqErrorCommand::SeqErrorCommand(string option) { try { abort = false; calledHelp = false; + rdb = ReferenceDB::getInstance(); //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } @@ -165,11 +178,6 @@ SeqErrorCommand::SeqErrorCommand(string option) { } else if (queryFileName == "not open") { abort = true; } else { m->setFastaFile(queryFileName); } - - referenceFileName = validParameter.validFile(parameters, "reference", true); - if (referenceFileName == "not found") { m->mothurOut("reference is a required parameter for the seq.error command."); m->mothurOutEndLine(); abort = true; } - else if (referenceFileName == "not open") { abort = true; } - //check for optional parameters namesFileName = validParameter.validFile(parameters, "name", true); @@ -203,6 +211,28 @@ SeqErrorCommand::SeqErrorCommand(string option) { temp = validParameter.validFile(parameters, "threshold", false); if (temp == "not found") { temp = "1.00"; } convert(temp, threshold); + temp = validParameter.validFile(parameters, "save", false); if (temp == "not found"){ temp = "f"; } + save = m->isTrue(temp); + rdb->save = save; + if (save) { //clear out old references + rdb->clearMemory(); + } + + //this has to go after save so that if the user sets save=t and provides no reference we abort + referenceFileName = validParameter.validFile(parameters, "reference", true); + if (referenceFileName == "not found") { + //check for saved reference sequences + if (rdb->referenceSeqs.size() != 0) { + referenceFileName = "saved"; + }else { + m->mothurOut("[ERROR]: You don't have any saved reference sequences and the reference parameter is a required."); + m->mothurOutEndLine(); + abort = true; + } + }else if (referenceFileName == "not open") { abort = true; } + else { if (save) { rdb->setSavedReference(referenceFileName); } } + + temp = validParameter.validFile(parameters, "ignorechimeras", false); if (temp == "not found") { temp = "T"; } ignoreChimeras = m->isTrue(temp); @@ -728,32 +758,59 @@ int SeqErrorCommand::driver(string filename, string qFileName, string rFileName, void SeqErrorCommand::getReferences(){ try { - - ifstream referenceFile; - m->openInputFile(referenceFileName, referenceFile); - int numAmbigSeqs = 0; int maxStartPos = 0; int minEndPos = 100000; - while(referenceFile){ - Sequence currentSeq(referenceFile); - int numAmbigs = currentSeq.getAmbigBases(); - if(numAmbigs > 0){ numAmbigSeqs++; } + if (referenceFileName == "saved") { + int start = time(NULL); + m->mothurOutEndLine(); m->mothurOut("Using sequences from " + rdb->getSavedReference() + " that are saved in memory."); m->mothurOutEndLine(); + + for (int i = 0; i < rdb->referenceSeqs.size(); i++) { + int numAmbigs = rdb->referenceSeqs[i].getAmbigBases(); + if(numAmbigs > 0){ numAmbigSeqs++; } + + // int startPos = rdb->referenceSeqs[i].getStartPos(); + // if(startPos > maxStartPos) { maxStartPos = startPos; } + // + // int endPos = rdb->referenceSeqs[i].getEndPos(); + // if(endPos < minEndPos) { minEndPos = endPos; } + + referenceSeqs.push_back(rdb->referenceSeqs[i]); + } + referenceFileName = rdb->getSavedReference(); + + m->mothurOut("It took " + toString(time(NULL) - start) + " to load " + toString(rdb->referenceSeqs.size()) + " sequences.");m->mothurOutEndLine(); + + }else { + int start = time(NULL); + + ifstream referenceFile; + m->openInputFile(referenceFileName, referenceFile); -// int startPos = currentSeq.getStartPos(); -// if(startPos > maxStartPos) { maxStartPos = startPos; } -// -// int endPos = currentSeq.getEndPos(); -// if(endPos < minEndPos) { minEndPos = endPos; } - referenceSeqs.push_back(currentSeq); + while(referenceFile){ + Sequence currentSeq(referenceFile); + int numAmbigs = currentSeq.getAmbigBases(); + if(numAmbigs > 0){ numAmbigSeqs++; } + + // int startPos = currentSeq.getStartPos(); + // if(startPos > maxStartPos) { maxStartPos = startPos; } + // + // int endPos = currentSeq.getEndPos(); + // if(endPos < minEndPos) { minEndPos = endPos; } + referenceSeqs.push_back(currentSeq); - m->gobble(referenceFile); + if (rdb->save) { rdb->referenceSeqs.push_back(currentSeq); } + + m->gobble(referenceFile); + } + referenceFile.close(); + + m->mothurOut("It took " + toString(time(NULL) - start) + " to read " + toString(referenceSeqs.size()) + " sequences.");m->mothurOutEndLine(); } - referenceFile.close(); + numRefs = referenceSeqs.size(); - for(int i=0;imothurOut("Warning: " + toString(numAmbigSeqs) + " reference sequences have ambiguous bases, these bases will be ignored\n"); - } + } } catch(exception& e) { diff --git a/seqerrorcommand.h b/seqerrorcommand.h index e467df1..590c554 100644 --- a/seqerrorcommand.h +++ b/seqerrorcommand.h @@ -13,6 +13,7 @@ #include "mothur.h" #include "command.hpp" #include "sequence.hpp" +#include "referencedb.h" struct Compare { int AA, AT, AG, AC, TA, TT, TG, TC, GA, GT, GG, GC, CA, CT, CG, CC, NA, NT, NG, NC, Ai, Ti, Gi, Ci, Ni, dA, dT, dG, dC; @@ -58,6 +59,7 @@ public: private: bool abort; + ReferenceDB* rdb; struct linePair { unsigned long int start; @@ -86,7 +88,7 @@ private: string queryFileName, referenceFileName, qualFileName, reportFileName, namesFileName, outputDir; double threshold; - bool ignoreChimeras; + bool ignoreChimeras, save; int numRefs, processors; int maxLength, totalBases, totalMatches; //ofstream errorSummaryFile, errorSeqFile; diff --git a/uchime_main.cpp b/uchime_main.cpp index 0465c72..18bd8bb 100644 --- a/uchime_main.cpp +++ b/uchime_main.cpp @@ -210,7 +210,6 @@ int uchime_main(int argc, char *argv[]) if((QuerySeqCount) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(QuerySeqCount) + ", " + toString(HitCount) + " chimeras found."); m->mothurOutEndLine(); } } - Log("\n"); Log("%s: %u/%u chimeras found (%.1f%%)\n", opt_input.c_str(), HitCount, QuerySeqCount, Pct(HitCount, QuerySeqCount));