From: westcott Date: Tue, 9 Aug 2011 10:32:41 +0000 (+0000) Subject: added count.groups command and paralellized align.seqs for windows X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=d3aed959ca3b68890eeb7b815e24210bcfeb256c added count.groups command and paralellized align.seqs for windows --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 49de826..73a36ad 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -46,6 +46,7 @@ A778FE6B134CA6CA00C0BA33 /* getcommandinfocommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A778FE6A134CA6CA00C0BA33 /* getcommandinfocommand.cpp */; }; A77A221F139001B600B0BE70 /* deuniquetreecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77A221E139001B600B0BE70 /* deuniquetreecommand.cpp */; }; A79234D713C74BF6002B08E2 /* mothurfisher.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A79234D613C74BF6002B08E2 /* mothurfisher.cpp */; }; + A795840D13F13CD900F201D5 /* countgroupscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A795840C13F13CD900F201D5 /* countgroupscommand.cpp */; }; A799F5B91309A3E000AEEFA0 /* makefastqcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A799F5B81309A3E000AEEFA0 /* makefastqcommand.cpp */; }; A7A61F2D130062E000E05B6B /* amovacommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7A61F2C130062E000E05B6B /* amovacommand.cpp */; }; A7E9B88112D37EC400DA6239 /* ace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B64F12D37EC300DA6239 /* ace.cpp */; }; @@ -420,6 +421,8 @@ A77A221E139001B600B0BE70 /* deuniquetreecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = deuniquetreecommand.cpp; sourceTree = ""; }; A79234D513C74BF6002B08E2 /* mothurfisher.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mothurfisher.h; sourceTree = ""; }; A79234D613C74BF6002B08E2 /* mothurfisher.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mothurfisher.cpp; sourceTree = ""; }; + A795840B13F13CD900F201D5 /* countgroupscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = countgroupscommand.h; sourceTree = ""; }; + A795840C13F13CD900F201D5 /* countgroupscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = countgroupscommand.cpp; sourceTree = ""; }; A799F5B71309A3E000AEEFA0 /* makefastqcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = makefastqcommand.h; sourceTree = ""; }; A799F5B81309A3E000AEEFA0 /* makefastqcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = makefastqcommand.cpp; sourceTree = ""; }; A7A61F1A130035C800E05B6B /* LICENSE */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = LICENSE; sourceTree = ""; }; @@ -934,12 +937,12 @@ A7E9B84E12D37EC400DA6239 /* structpearson.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = structpearson.h; sourceTree = ""; }; A7E9B84F12D37EC400DA6239 /* subsamplecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = subsamplecommand.cpp; sourceTree = ""; }; A7E9B85012D37EC400DA6239 /* subsamplecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = subsamplecommand.h; sourceTree = ""; }; - A7E9B85112D37EC400DA6239 /* suffixdb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = suffixdb.cpp; sourceTree = SOURCE_ROOT; }; - A7E9B85212D37EC400DA6239 /* suffixdb.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = suffixdb.hpp; sourceTree = SOURCE_ROOT; }; - A7E9B85312D37EC400DA6239 /* suffixnodes.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = suffixnodes.cpp; sourceTree = SOURCE_ROOT; }; - A7E9B85412D37EC400DA6239 /* suffixnodes.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = suffixnodes.hpp; sourceTree = SOURCE_ROOT; }; - A7E9B85512D37EC400DA6239 /* suffixtree.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = suffixtree.cpp; sourceTree = SOURCE_ROOT; }; - A7E9B85612D37EC400DA6239 /* suffixtree.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = suffixtree.hpp; sourceTree = SOURCE_ROOT; }; + A7E9B85112D37EC400DA6239 /* suffixdb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = suffixdb.cpp; sourceTree = ""; }; + A7E9B85212D37EC400DA6239 /* suffixdb.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = suffixdb.hpp; sourceTree = ""; }; + A7E9B85312D37EC400DA6239 /* suffixnodes.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = suffixnodes.cpp; sourceTree = ""; }; + A7E9B85412D37EC400DA6239 /* suffixnodes.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = suffixnodes.hpp; sourceTree = ""; }; + A7E9B85512D37EC400DA6239 /* suffixtree.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = suffixtree.cpp; sourceTree = ""; }; + A7E9B85612D37EC400DA6239 /* suffixtree.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = suffixtree.hpp; sourceTree = ""; }; A7E9B85712D37EC400DA6239 /* summarycommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = summarycommand.cpp; sourceTree = ""; }; A7E9B85812D37EC400DA6239 /* summarycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = summarycommand.h; sourceTree = ""; }; A7E9B85912D37EC400DA6239 /* summarysharedcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = summarysharedcommand.cpp; sourceTree = ""; }; @@ -1099,13 +1102,7 @@ A7E9B82412D37EC400DA6239 /* sharedutilities.h */, A7E9B82D12D37EC400DA6239 /* singlelinkage.cpp */, A7E9B83012D37EC400DA6239 /* slibshuff.cpp */, - A7E9B85112D37EC400DA6239 /* suffixdb.cpp */, A7E9B83112D37EC400DA6239 /* slibshuff.h */, - A7E9B85212D37EC400DA6239 /* suffixdb.hpp */, - A7E9B85312D37EC400DA6239 /* suffixnodes.cpp */, - A7E9B85412D37EC400DA6239 /* suffixnodes.hpp */, - A7E9B85512D37EC400DA6239 /* suffixtree.cpp */, - A7E9B85612D37EC400DA6239 /* suffixtree.hpp */, A7E9B87412D37EC400DA6239 /* validcalculator.cpp */, A7E9B87512D37EC400DA6239 /* validcalculator.h */, A7E9B87612D37EC400DA6239 /* validparameter.cpp */, @@ -1238,6 +1235,8 @@ A7E9B6B712D37EC400DA6239 /* consensusseqscommand.cpp */, A7E9B6BA12D37EC400DA6239 /* corraxescommand.h */, A7E9B6B912D37EC400DA6239 /* corraxescommand.cpp */, + A795840B13F13CD900F201D5 /* countgroupscommand.h */, + A795840C13F13CD900F201D5 /* countgroupscommand.cpp */, A7730EFD13967241007433A3 /* countseqscommand.h */, A7730EFE13967241007433A3 /* countseqscommand.cpp */, A7E9B6C412D37EC400DA6239 /* deconvolutecommand.h */, @@ -1638,6 +1637,12 @@ A7E9B81412D37EC400DA6239 /* sharedsabundvector.h */, A7E9B83912D37EC400DA6239 /* sparsematrix.cpp */, A7E9B83A12D37EC400DA6239 /* sparsematrix.hpp */, + A7E9B85112D37EC400DA6239 /* suffixdb.cpp */, + A7E9B85212D37EC400DA6239 /* suffixdb.hpp */, + A7E9B85312D37EC400DA6239 /* suffixnodes.cpp */, + A7E9B85412D37EC400DA6239 /* suffixnodes.hpp */, + A7E9B85512D37EC400DA6239 /* suffixtree.cpp */, + A7E9B85612D37EC400DA6239 /* suffixtree.hpp */, A7E9B85F12D37EC400DA6239 /* tree.cpp */, A7E9B86012D37EC400DA6239 /* tree.h */, A7E9B86412D37EC400DA6239 /* treemap.cpp */, @@ -2131,6 +2136,7 @@ A73DDBBA13C4A0D1006AAE38 /* clearmemorycommand.cpp in Sources */, A73DDC3813C4BF64006AAE38 /* mothurmetastats.cpp in Sources */, A79234D713C74BF6002B08E2 /* mothurfisher.cpp in Sources */, + A795840D13F13CD900F201D5 /* countgroupscommand.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/aligncommand.cpp b/aligncommand.cpp index a32ec4d..b7c6843 100644 --- a/aligncommand.cpp +++ b/aligncommand.cpp @@ -15,15 +15,6 @@ */ #include "aligncommand.h" -#include "sequence.hpp" - -#include "gotohoverlap.hpp" -#include "needlemanoverlap.hpp" -#include "blastalign.hpp" -#include "noalign.hpp" - -#include "nast.hpp" -#include "nastreport.hpp" #include "referencedb.h" //********************************************************************************************************************** @@ -256,6 +247,11 @@ AlignCommand::AlignCommand(string option) { temp = validParameter.validFile(parameters, "save", false); if (temp == "not found"){ temp = "f"; } save = m->isTrue(temp); + //this is so the threads can quickly load the reference data + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + #else + if (processors != 1) { save = true; } + #endif rdb->save = save; if (save) { //clear out old references rdb->clearMemory(); @@ -298,7 +294,6 @@ AlignCommand::~AlignCommand(){ if (abort == false) { for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); delete templateDB; - delete alignment; } } //********************************************************************************************************************** @@ -308,17 +303,6 @@ int AlignCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } templateDB = new AlignmentDB(templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch); - int longestBase = templateDB->getLongestBase(); - - if(align == "gotoh") { alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, longestBase); } - else if(align == "needleman") { alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase); } - else if(align == "blast") { alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); } - else if(align == "noalign") { alignment = new NoAlign(); } - else { - m->mothurOut(align + " is not a valid alignment option. I will run the command using needleman."); - m->mothurOutEndLine(); - alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase); - } for (int s = 0; s < candidateFileNames.size(); s++) { if (m->control_pressed) { outputTypes.clear(); return 0; } @@ -442,20 +426,28 @@ int AlignCommand::execute(){ #else - vector positions = m->divideFile(candidateFileNames[s], processors); - for (int i = 0; i < (positions.size()-1); i++) { - lines.push_back(new linePair(positions[i], positions[(i+1)])); - } - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + vector positions; + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + positions = m->divideFile(candidateFileNames[s], processors); + for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(new linePair(positions[i], positions[(i+1)])); } + #else + positions = m->setFilePosFasta(candidateFileNames[s], numFastaSeqs); + + //figure out how many sequences you have to process + int numSeqsPerProcessor = numFastaSeqs / processors; + for (int i = 0; i < processors; i++) { + int startIndex = i * numSeqsPerProcessor; + if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; } + lines.push_back(new linePair(positions[startIndex], numSeqsPerProcessor)); + } + #endif + if(processors == 1){ numFastaSeqs = driver(lines[0], alignFileName, reportFileName, accnosFileName, candidateFileNames[s]); }else{ numFastaSeqs = createProcesses(alignFileName, reportFileName, accnosFileName, candidateFileNames[s]); } - #else - numFastaSeqs = driver(lines[0], alignFileName, reportFileName, accnosFileName, candidateFileNames[s]); - #endif - + if (m->control_pressed) { m->mothurRemove(accnosFileName); m->mothurRemove(alignFileName); m->mothurRemove(reportFileName); outputTypes.clear(); return 0; } //delete accnos file if its blank else report to user @@ -511,7 +503,6 @@ int AlignCommand::execute(){ } //********************************************************************************************************************** - int AlignCommand::driver(linePair* filePos, string alignFName, string reportFName, string accnosFName, string filename){ try { ofstream alignmentFile; @@ -529,10 +520,23 @@ int AlignCommand::driver(linePair* filePos, string alignFName, string reportFNam bool done = false; int count = 0; + + //moved this into driver to avoid deep copies in windows paralellized version + Alignment* alignment; + int longestBase = templateDB->getLongestBase(); + if(align == "gotoh") { alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, longestBase); } + else if(align == "needleman") { alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase); } + else if(align == "blast") { alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); } + else if(align == "noalign") { alignment = new NoAlign(); } + else { + m->mothurOut(align + " is not a valid alignment option. I will run the command using needleman."); + m->mothurOutEndLine(); + alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase); + } while (!done) { - if (m->control_pressed) { return 0; } + if (m->control_pressed) { break; } Sequence* candidateSeq = new Sequence(inFASTA); m->gobble(inFASTA); report.setCandidate(candidateSeq); @@ -548,7 +552,7 @@ int AlignCommand::driver(linePair* filePos, string alignFName, string reportFNam Sequence temp = templateDB->findClosestSequence(candidateSeq); Sequence* templateSeq = &temp; - + float searchScore = templateDB->getSearchScore(); Nast* nast = new Nast(alignment, candidateSeq, templateSeq); @@ -628,6 +632,7 @@ int AlignCommand::driver(linePair* filePos, string alignFName, string reportFNam //report progress if((count) % 100 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } + delete alignment; alignmentFile.close(); inFASTA.close(); accnosFile.close(); @@ -665,9 +670,22 @@ int AlignCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& align delete buf; } + Alignment* alignment; + int longestBase = templateDB->getLongestBase(); + if(align == "gotoh") { alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, longestBase); } + else if(align == "needleman") { alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase); } + else if(align == "blast") { alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); } + else if(align == "noalign") { alignment = new NoAlign(); } + else { + m->mothurOut(align + " is not a valid alignment option. I will run the command using needleman."); + m->mothurOutEndLine(); + alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase); + } + + for(int i=0;icontrol_pressed) { return 0; } + if (m->control_pressed) { delete alignment; return 0; } //read next sequence int length = MPIPos[start+i+1] - MPIPos[start+i]; @@ -807,11 +825,10 @@ int AlignCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& align int AlignCommand::createProcesses(string alignFileName, string reportFileName, string accnosFName, string filename) { try { -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + int num = 0; processIDS.resize(0); +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) int process = 1; - int num = 0; - // processIDS.resize(0); //loop through and create all the processes you want while (process != processors) { @@ -883,9 +900,90 @@ int AlignCommand::createProcesses(string alignFileName, string reportFileName, s m->openOutputFile(accnosFName, out); out.close(); } +#else + ////////////////////////////////////////////////////////////////////////////////////////////////////// + //Windows version shared memory, so be careful when passing variables through the alignData struct. + //Above fork() will clone, so memory is separate, but that's not the case with windows, + ////////////////////////////////////////////////////////////////////////////////////////////////////// + + vector pDataArray; + DWORD dwThreadIdArray[processors-1]; + HANDLE hThreadArray[processors-1]; + + //Create processor worker threads. + for( int i=0; istart, lines[i]->end, flip, match, misMatch, gapOpen, gapExtend, threshold); + pDataArray.push_back(tempalign); + processIDS.push_back(i); + + //MySeqSumThreadFunction is in header. It must be global or static to work with the threads. + //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier + hThreadArray[i] = CreateThread(NULL, 0, MyAlignThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]); + } + + //need to check for line ending error + ifstream inFASTA; + m->openInputFile(filename, inFASTA); + inFASTA.seekg(lines[processors-1]->start-1); + char c = inFASTA.peek(); + + if (c == '>') { //we need to move back + lines[processors-1]->start--; + } + + //using the main process as a worker saves time and memory + //do my part - do last piece because windows is looking for eof + num = driver(lines[processors-1], (alignFileName + toString(processors-1) + ".temp"), (reportFileName + toString(processors-1) + ".temp"), (accnosFName + toString(processors-1) + ".temp"), filename); + + //Wait until all threads have terminated. + WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE); + + //Close all thread handles and free memory allocations. + for(int i=0; i < pDataArray.size(); i++){ + num += pDataArray[i]->count; + CloseHandle(hThreadArray[i]); + delete pDataArray[i]; + } + + vector nonBlankAccnosFiles; + if (!(m->isBlank(accnosFName))) { nonBlankAccnosFiles.push_back(accnosFName); } + else { m->mothurRemove(accnosFName); } //remove so other files can be renamed to it + + for (int i = 1; i < processors; i++) { + appendAlignFiles((alignFileName + toString(i) + ".temp"), alignFileName); + m->mothurRemove((alignFileName + toString(i) + ".temp")); + + appendReportFiles((reportFileName + toString(i) + ".temp"), reportFileName); + m->mothurRemove((reportFileName + toString(i) + ".temp")); + + if (!(m->isBlank(accnosFName + toString(i) + ".temp"))) { + nonBlankAccnosFiles.push_back(accnosFName + toString(i) + ".temp"); + }else { m->mothurRemove((accnosFName + toString(i) + ".temp")); } + } + + //append accnos files + if (nonBlankAccnosFiles.size() != 0) { + rename(nonBlankAccnosFiles[0].c_str(), accnosFName.c_str()); + + for (int h=1; h < nonBlankAccnosFiles.size(); h++) { + appendAlignFiles(nonBlankAccnosFiles[h], accnosFName); + m->mothurRemove(nonBlankAccnosFiles[h]); + } + }else { //recreate the accnosfile if needed + ofstream out; + m->openOutputFile(accnosFName, out); + out.close(); + } +#endif return num; -#endif } catch(exception& e) { m->errorOut(e, "AlignCommand", "createProcesses"); diff --git a/aligncommand.h b/aligncommand.h index 75b9c2b..193eeac 100644 --- a/aligncommand.h +++ b/aligncommand.h @@ -15,6 +15,16 @@ #include "database.hpp" #include "alignment.hpp" #include "alignmentdb.h" +#include "sequence.hpp" + +#include "gotohoverlap.hpp" +#include "needlemanoverlap.hpp" +#include "blastalign.hpp" +#include "noalign.hpp" + +#include "nast.hpp" +#include "nastreport.hpp" + class AlignCommand : public Command { @@ -44,7 +54,6 @@ private: bool MPIWroteAccnos; AlignmentDB* templateDB; - Alignment* alignment; int driver(linePair*, string, string, string, string); int createProcesses(string, string, string, string); @@ -65,4 +74,196 @@ private: }; +/**************************************************************************************************/ +//custom data structure for threads to use. +// This is passed by void pointer so it can be any data type +// that can be passed using a single void pointer (LPVOID). +typedef struct alignData { + string alignFName; + string reportFName; + string accnosFName; + string filename; + string align; + string search; + bool flip; + unsigned long int start; + unsigned long int end; + MothurOut* m; + //AlignmentDB* templateDB; + float match, misMatch, gapOpen, gapExtend, threshold; + int count, kmerSize; + + alignData(){} + alignData(string a, string r, string ac, string f, string al, string se, int ks, MothurOut* mout, unsigned long int st, unsigned long int en, bool fl, float ma, float misMa, float gapO, float gapE, float thr) { + alignFName = a; + reportFName = r; + accnosFName = ac; + filename = f; + flip = fl; + m = mout; + start = st; + end = en; + //templateDB = tdb; + match = ma; + misMatch = misMa; + gapOpen = gapO; + gapExtend = gapE; + threshold = thr; + align = al; + search = se; + count = 0; + kmerSize = ks; + } +}; + +/**************************************************************************************************/ +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#else +static DWORD WINAPI MyAlignThreadFunction(LPVOID lpParam){ + alignData* pDataArray; + pDataArray = (alignData*)lpParam; + + try { + ofstream alignmentFile; + pDataArray->m->openOutputFile(pDataArray->alignFName, alignmentFile); + + ofstream accnosFile; + pDataArray->m->openOutputFile(pDataArray->accnosFName, accnosFile); + + NastReport report(pDataArray->reportFName); + + ifstream inFASTA; + pDataArray->m->openInputFile(pDataArray->filename, inFASTA); + + //print header if you are process 0 + if ((pDataArray->start == 0) || (pDataArray->start == 1)) { + inFASTA.seekg(0); + }else { //this accounts for the difference in line endings. + inFASTA.seekg(pDataArray->start-1); pDataArray->m->gobble(inFASTA); + } + + pDataArray->count = pDataArray->end; + + AlignmentDB* templateDB = new AlignmentDB("saved-silent", pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch); + + //moved this into driver to avoid deep copies in windows paralellized version + Alignment* alignment; + int longestBase = templateDB->getLongestBase(); + if(pDataArray->align == "gotoh") { alignment = new GotohOverlap(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, longestBase); } + else if(pDataArray->align == "needleman") { alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, longestBase); } + else if(pDataArray->align == "blast") { alignment = new BlastAlignment(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch); } + else if(pDataArray->align == "noalign") { alignment = new NoAlign(); } + else { + pDataArray->m->mothurOut(pDataArray->align + " is not a valid alignment option. I will run the command using needleman."); + pDataArray->m->mothurOutEndLine(); + alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, longestBase); + } + + int count = 0; + for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process + + if (pDataArray->m->control_pressed) { break; } + + Sequence* candidateSeq = new Sequence(inFASTA); pDataArray->m->gobble(inFASTA); + report.setCandidate(candidateSeq); + + int origNumBases = candidateSeq->getNumBases(); + string originalUnaligned = candidateSeq->getUnaligned(); + int numBasesNeeded = origNumBases * pDataArray->threshold; + + if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file + if (candidateSeq->getUnaligned().length() > alignment->getnRows()) { + alignment->resize(candidateSeq->getUnaligned().length()+1); + } + + Sequence temp = templateDB->findClosestSequence(candidateSeq); + Sequence* templateSeq = &temp; + + float searchScore = templateDB->getSearchScore(); + + Nast* nast = new Nast(alignment, candidateSeq, templateSeq); + + Sequence* copy; + + Nast* nast2; + bool needToDeleteCopy = false; //this is needed in case you have you enter the ifs below + //since nast does not make a copy of hte sequence passed, and it is used by the reporter below + //you can't delete the copy sequence til after you report, but you may choose not to create it in the first place + //so this bool tells you if you need to delete it + + //if there is a possibility that this sequence should be reversed + if (candidateSeq->getNumBases() < numBasesNeeded) { + + string wasBetter = ""; + //if the user wants you to try the reverse + if (pDataArray->flip) { + + //get reverse compliment + copy = new Sequence(candidateSeq->getName(), originalUnaligned); + copy->reverseComplement(); + + //rerun alignment + Sequence temp2 = templateDB->findClosestSequence(copy); + Sequence* templateSeq2 = &temp2; + + searchScore = templateDB->getSearchScore(); + + nast2 = new Nast(alignment, copy, templateSeq2); + + //check if any better + if (copy->getNumBases() > candidateSeq->getNumBases()) { + candidateSeq->setAligned(copy->getAligned()); //use reverse compliments alignment since its better + templateSeq = templateSeq2; + delete nast; + nast = nast2; + needToDeleteCopy = true; + wasBetter = "\treverse complement produced a better alignment, so mothur used the reverse complement."; + }else{ + wasBetter = "\treverse complement did NOT produce a better alignment so it was not used, please check sequence."; + delete nast2; + delete copy; + } + } + + //create accnos file with names + accnosFile << candidateSeq->getName() << wasBetter << endl; + } + + report.setTemplate(templateSeq); + report.setSearchParameters(pDataArray->search, searchScore); + report.setAlignmentParameters(pDataArray->align, alignment); + report.setNastParameters(*nast); + + alignmentFile << '>' << candidateSeq->getName() << '\n' << candidateSeq->getAligned() << endl; + + report.print(); + delete nast; + if (needToDeleteCopy) { delete copy; } + + count++; + } + delete candidateSeq; + + //report progress + if((count) % 100 == 0){ pDataArray->m->mothurOut(toString(count)); pDataArray->m->mothurOutEndLine(); } + + } + //report progress + if((count) % 100 != 0){ pDataArray->m->mothurOut(toString(count)); pDataArray->m->mothurOutEndLine(); } + + delete alignment; + delete templateDB; + alignmentFile.close(); + inFASTA.close(); + accnosFile.close(); + } + catch(exception& e) { + pDataArray->m->errorOut(e, "AlignCommand", "MyAlignThreadFunction"); + exit(1); + } +} +#endif + + + #endif diff --git a/alignmentdb.cpp b/alignmentdb.cpp index bac61c4..6ccfd39 100644 --- a/alignmentdb.cpp +++ b/alignmentdb.cpp @@ -13,7 +13,33 @@ #include "blastdb.hpp" #include "referencedb.h" - +/**************************************************************************************************/ +//deep copy +AlignmentDB::AlignmentDB(const AlignmentDB& adb) : numSeqs(adb.numSeqs), longest(adb.longest), method(adb.method), emptySequence(adb.emptySequence) { + try { + + m = MothurOut::getInstance(); + if (adb.method == "blast") { + search = new BlastDB(*((BlastDB*)adb.search)); + }else if(adb.method == "kmer") { + search = new KmerDB(*((KmerDB*)adb.search)); + }else if(adb.method == "suffix") { + search = new SuffixDB(*((SuffixDB*)adb.search)); + }else { + m->mothurOut("[ERROR]: cannot create copy of alignment database, unrecognized method - " + adb.method); m->mothurOutEndLine(); + } + + for (int i = 0; i < adb.templateSequences.size(); i++) { + Sequence temp(adb.templateSequences[i]); + templateSequences.push_back(temp); + } + } + catch(exception& e) { + m->errorOut(e, "AlignmentDB", "AlignmentDB"); + exit(1); + } + +} /**************************************************************************************************/ AlignmentDB::AlignmentDB(string fastaFileName, string s, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch){ // This assumes that the template database is in fasta format, may try { // need to alter this in the future? @@ -22,10 +48,16 @@ AlignmentDB::AlignmentDB(string fastaFileName, string s, int kmerSize, float gap method = s; bool needToGenerate = true; ReferenceDB* rdb = ReferenceDB::getInstance(); + bool silent = false; + + if (fastaFileName == "saved-silent") { + fastaFileName = "saved"; silent = true; + } if (fastaFileName == "saved") { int start = time(NULL); - m->mothurOutEndLine(); m->mothurOut("Using sequences from " + rdb->getSavedReference() + " that are saved in memory."); m->mothurOutEndLine(); + + if (!silent) { m->mothurOutEndLine(); m->mothurOut("Using sequences from " + rdb->getSavedReference() + " that are saved in memory."); m->mothurOutEndLine(); } for (int i = 0; i < rdb->referenceSeqs.size(); i++) { templateSequences.push_back(rdb->referenceSeqs[i]); @@ -35,7 +67,7 @@ AlignmentDB::AlignmentDB(string fastaFileName, string s, int kmerSize, float gap fastaFileName = rdb->getSavedReference(); numSeqs = templateSequences.size(); - m->mothurOut("It took " + toString(time(NULL) - start) + " to load " + toString(rdb->referenceSeqs.size()) + " sequences.");m->mothurOutEndLine(); + if (!silent) { m->mothurOut("It took " + toString(time(NULL) - start) + " to load " + toString(rdb->referenceSeqs.size()) + " sequences.");m->mothurOutEndLine(); } }else { int start = time(NULL); @@ -159,6 +191,7 @@ AlignmentDB::AlignmentDB(string fastaFileName, string s, int kmerSize, float gap else if(method == "suffix") { search = new SuffixDB(numSeqs); } else if(method == "blast") { search = new BlastDB(fastaFileName.substr(0,fastaFileName.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch, ""); } else { + method = "kmer"; m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine(); search = new KmerDB(fastaFileName, 8); diff --git a/alignmentdb.h b/alignmentdb.h index 22d2dd3..27d0434 100644 --- a/alignmentdb.h +++ b/alignmentdb.h @@ -22,6 +22,7 @@ public: AlignmentDB(string, string, int, float, float, float, float); //reads fastafile passed in and stores sequences AlignmentDB(string); + AlignmentDB(const AlignmentDB& adb); ~AlignmentDB(); Sequence findClosestSequence(Sequence*); diff --git a/blastdb.hpp b/blastdb.hpp index 99a9987..0c3ac4b 100644 --- a/blastdb.hpp +++ b/blastdb.hpp @@ -18,6 +18,8 @@ class BlastDB : public Database { public: BlastDB(string, float, float, float, float, string); BlastDB(string); + BlastDB(const BlastDB& bdb) : dbFileName(bdb.dbFileName), queryFileName(bdb.queryFileName), blastFileName(bdb.blastFileName), path(bdb.path), + count(bdb.count), gapOpen(bdb.gapOpen), gapExtend(bdb.gapExtend), match(bdb.match), misMatch(bdb.misMatch), Database(bdb) {} ~BlastDB(); void generateDB(); diff --git a/commandfactory.cpp b/commandfactory.cpp index 5aaa7b7..dcd811f 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -120,6 +120,7 @@ #include "getcommandinfocommand.h" #include "deuniquetreecommand.h" #include "countseqscommand.h" +#include "countgroupscommand.h" #include "clearmemorycommand.h" /*******************************************************/ @@ -244,6 +245,7 @@ CommandFactory::CommandFactory(){ commands["get.commandinfo"] = "get.commandinfo"; commands["deunique.tree"] = "deunique.tree"; commands["count.seqs"] = "count.seqs"; + commands["count.groups"] = "count.groups"; commands["clear.memory"] = "clear.memory"; commands["pairwise.seqs"] = "MPIEnabled"; commands["pipeline.pds"] = "MPIEnabled"; @@ -417,6 +419,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "get.commandinfo") { command = new GetCommandInfoCommand(optionString); } else if(commandName == "deunique.tree") { command = new DeuniqueTreeCommand(optionString); } else if(commandName == "count.seqs") { command = new CountSeqsCommand(optionString); } + else if(commandName == "count.groups") { command = new CountGroupsCommand(optionString); } else if(commandName == "clear.memory") { command = new ClearMemoryCommand(optionString); } else { command = new NoCommand(optionString); } @@ -556,6 +559,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString, str else if(commandName == "get.commandinfo") { pipecommand = new GetCommandInfoCommand(optionString); } else if(commandName == "deunique.tree") { pipecommand = new DeuniqueTreeCommand(optionString); } else if(commandName == "count.seqs") { pipecommand = new CountSeqsCommand(optionString); } + else if(commandName == "count.groups") { pipecommand = new CountGroupsCommand(optionString); } else if(commandName == "clear.memory") { pipecommand = new ClearMemoryCommand(optionString); } else { pipecommand = new NoCommand(optionString); } @@ -683,6 +687,7 @@ Command* CommandFactory::getCommand(string commandName){ else if(commandName == "get.commandinfo") { shellcommand = new GetCommandInfoCommand(); } else if(commandName == "deunique.tree") { shellcommand = new DeuniqueTreeCommand(); } else if(commandName == "count.seqs") { shellcommand = new CountSeqsCommand(); } + else if(commandName == "count.groups") { shellcommand = new CountGroupsCommand(); } else if(commandName == "clear.memory") { shellcommand = new ClearMemoryCommand(); } else { shellcommand = new NoCommand(); } diff --git a/countgroupscommand.cpp b/countgroupscommand.cpp new file mode 100644 index 0000000..af9993c --- /dev/null +++ b/countgroupscommand.cpp @@ -0,0 +1,237 @@ +/* + * countgroupscommand.cpp + * Mothur + * + * Created by westcott on 8/9/11. + * Copyright 2011 Schloss Lab. All rights reserved. + * + */ + +#include "countgroupscommand.h" +#include "sharedutilities.h" +#include "inputdata.h" + +//********************************************************************************************************************** +vector CountGroupsCommand::setParameters(){ + try { + CommandParameter pshared("shared", "InputTypes", "", "", "sharedGroup", "sharedGroup", "none",false,false); parameters.push_back(pshared); + CommandParameter pgroup("group", "InputTypes", "", "", "sharedGroup", "sharedGroup", "none",false,false); parameters.push_back(pgroup); + CommandParameter paccnos("accnos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(paccnos); + CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); + + vector myArray; + for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } + return myArray; + } + catch(exception& e) { + m->errorOut(e, "CountGroupsCommand", "setParameters"); + exit(1); + } +} +//********************************************************************************************************************** +string CountGroupsCommand::getHelpString(){ + try { + string helpString = ""; + helpString += "The count.groups command counts sequences from a specfic group or set of groups from the following file types: group or shared file.\n"; + helpString += "The count.groups command parameters are accnos, group, shared and groups. You must provide a group or shared file.\n"; + helpString += "The accnos parameter allows you to provide a file containing the list of groups.\n"; + helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like. You can separate group names with dashes.\n"; + helpString += "The count.groups command should be in the following format: count.groups(accnos=yourAccnos, group=yourGroupFile).\n"; + helpString += "Example count.groups(accnos=amazon.accnos, group=amazon.groups).\n"; + helpString += "or count.groups(groups=pasture, group=amazon.groups).\n"; + helpString += "Note: No spaces between parameter labels (i.e. group), '=' and parameters (i.e.yourGroupFile).\n"; + return helpString; + } + catch(exception& e) { + m->errorOut(e, "CountGroupsCommand", "getHelpString"); + exit(1); + } +} +//********************************************************************************************************************** +CountGroupsCommand::CountGroupsCommand(){ + try { + abort = true; calledHelp = true; + setParameters(); + } + catch(exception& e) { + m->errorOut(e, "CountGroupsCommand", "CountGroupsCommand"); + exit(1); + } +} +//********************************************************************************************************************** +CountGroupsCommand::CountGroupsCommand(string option) { + try { + abort = false; calledHelp = false; + + //allow user to run help + if(option == "help") { help(); abort = true; calledHelp = true; } + else if(option == "citation") { citation(); abort = true; calledHelp = true;} + + else { + vector myArray = setParameters(); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + map::iterator it; + + //check to make sure all parameters are valid for command + for (it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it = parameters.find("accnos"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["accnos"] = inputDir + it->second; } + } + + it = parameters.find("group"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["group"] = inputDir + it->second; } + } + + it = parameters.find("shared"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["shared"] = inputDir + it->second; } + } + } + + + //check for required parameters + accnosfile = validParameter.validFile(parameters, "accnos", true); + if (accnosfile == "not open") { abort = true; } + else if (accnosfile == "not found") { accnosfile = ""; } + else { m->setAccnosFile(accnosfile); } + + groups = validParameter.validFile(parameters, "groups", false); + if (groups == "not found") { groups = ""; } + else { + m->splitAtDash(groups, Groups); + m->Groups = Groups; + } + + sharedfile = validParameter.validFile(parameters, "shared", true); + if (sharedfile == "not open") { sharedfile = ""; abort = true; } + else if (sharedfile == "not found") { sharedfile = ""; } + else { m->setSharedFile(sharedfile); } + + groupfile = validParameter.validFile(parameters, "group", true); + if (groupfile == "not open") { groupfile = ""; abort = true; } + else if (groupfile == "not found") { groupfile = ""; } + else { m->setGroupFile(groupfile); } + + if ((sharedfile == "") && (groupfile == "")) { + //give priority to shared, then group + sharedfile = m->getSharedFile(); + if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); } + else { + groupfile = m->getGroupFile(); + if (groupfile != "") { m->mothurOut("Using " + groupfile + " as input file for the group parameter."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current groupfile or sharedfile and one is required."); m->mothurOutEndLine(); abort = true; + } + } + } + + if ((accnosfile == "") && (Groups.size() == 0)) { Groups.push_back("all"); m->Groups = Groups; } + } + + } + catch(exception& e) { + m->errorOut(e, "CountGroupsCommand", "CountGroupsCommand"); + exit(1); + } +} +//********************************************************************************************************************** + +int CountGroupsCommand::execute(){ + try { + + if (abort == true) { if (calledHelp) { return 0; } return 2; } + + //get groups you want to remove + if (accnosfile != "") { readAccnos(); } + + if (groupfile != "") { + GroupMap groupMap(groupfile); + groupMap.readMap(); + + //make sure groups are valid + //takes care of user setting groupNames that are invalid or setting groups=all + SharedUtil util; + util.setGroups(Groups, groupMap.namesOfGroups); + + for (int i = 0; i < Groups.size(); i++) { + m->mothurOut(Groups[i] + " contains " + toString(groupMap.getNumSeqs(Groups[i])) + "."); m->mothurOutEndLine(); + } + } + + if (m->control_pressed) { return 0; } + + if (sharedfile != "") { + InputData input(sharedfile, "sharedfile"); + vector lookup = input.getSharedRAbundVectors(); + + for (int i = 0; i < lookup.size(); i++) { + m->mothurOut(lookup[i]->getGroup() + " contains " + toString(lookup[i]->getNumSeqs()) + "."); m->mothurOutEndLine(); + delete lookup[i]; + } + } + + return 0; + } + + catch(exception& e) { + m->errorOut(e, "CountGroupsCommand", "execute"); + exit(1); + } +} +//********************************************************************************************************************** +void CountGroupsCommand::readAccnos(){ + try { + Groups.clear(); + + ifstream in; + m->openInputFile(accnosfile, in); + string name; + + while(!in.eof()){ + in >> name; + + Groups.push_back(name); + + m->gobble(in); + } + in.close(); + + m->Groups = Groups; + + } + catch(exception& e) { + m->errorOut(e, "CountGroupsCommand", "readAccnos"); + exit(1); + } +} +//********************************************************************************************************************** + + diff --git a/countgroupscommand.h b/countgroupscommand.h new file mode 100644 index 0000000..2532a80 --- /dev/null +++ b/countgroupscommand.h @@ -0,0 +1,42 @@ +#ifndef COUNTGROUPSCOMMAND_H +#define COUNTGROUPSCOMMAND_H + +/* + * countgroupscommand.h + * Mothur + * + * Created by westcott on 8/9/11. + * Copyright 2011 Schloss Lab. All rights reserved. + * + */ + +#include "command.hpp" + +class CountGroupsCommand : public Command { + +public: + + CountGroupsCommand(string); + CountGroupsCommand(); + ~CountGroupsCommand(){} + + vector setParameters(); + string getCommandName() { return "count.groups"; } + string getCommandCategory() { return "Sequence Processing"; } + string getHelpString(); + string getCitation() { return "http://www.mothur.org/wiki/Count.groups"; } + string getDescription() { return "counts the number of sequences in each group"; } + + int execute(); + void help() { m->mothurOut(getHelpString()); } + + +private: + string sharedfile, groupfile, outputDir, groups, accnosfile; + bool abort; + vector Groups; + + void readAccnos(); +}; + +#endif diff --git a/database.hpp b/database.hpp index f974ae9..580c3f2 100644 --- a/database.hpp +++ b/database.hpp @@ -45,6 +45,7 @@ class Database { public: Database(); + Database(const Database& db) : numSeqs(db.numSeqs), longest(db.longest), searchScore(db.searchScore), results(db.results), Scores(db.Scores) { m = MothurOut::getInstance(); } virtual ~Database(); virtual void generateDB() = 0; virtual void addSequence(Sequence) = 0; //add sequence to search engine @@ -57,7 +58,7 @@ public: virtual void readKmerDB(ifstream&){}; virtual void setNumSeqs(int i) { numSeqs = i; } virtual vector getSequencesWithKmer(int){ vector filler; return filler; }; - virtual int getMaxKmer(){ return 1; }; + virtual int getMaxKmer(){ return 1; } protected: MothurOut* m; diff --git a/dist.h b/dist.h index 71055ae..1412424 100644 --- a/dist.h +++ b/dist.h @@ -17,7 +17,8 @@ class Dist { public: - Dist(){dist = 0; m = MothurOut::getInstance(); } + Dist(){ dist = 0; m = MothurOut::getInstance(); } + Dist(const Dist& d) : dist(d.dist) { m = MothurOut::getInstance(); } virtual ~Dist() {} virtual void calcDist(Sequence, Sequence) = 0; double getDist() { return dist; } diff --git a/distancedb.cpp b/distancedb.cpp index c1bf7e7..2cbf5ca 100644 --- a/distancedb.cpp +++ b/distancedb.cpp @@ -13,8 +13,13 @@ #include "distancedb.hpp" #include "onegapignore.h" + +/**************************************************************************************************/ +DistanceDB::DistanceDB(const DistanceDB& ddb) : data(ddb.data), templateSeqsLength(ddb.templateSeqsLength), templateAligned(ddb.templateAligned), Database(ddb) { + distCalculator = new oneGapIgnoreTermGapDist(); +} /**************************************************************************************************/ -DistanceDB::DistanceDB() { +DistanceDB::DistanceDB() : Database() { try { templateAligned = true; templateSeqsLength = 0; diff --git a/distancedb.hpp b/distancedb.hpp index 2624d6d..d7e05db 100644 --- a/distancedb.hpp +++ b/distancedb.hpp @@ -19,6 +19,7 @@ class DistanceDB : public Database { public: DistanceDB(); + DistanceDB(const DistanceDB& ddb); ~DistanceDB() { delete distCalculator; } void generateDB() {} //doesn't generate a search db diff --git a/eachgapdist.h b/eachgapdist.h index 84fe56e..fe9bc30 100644 --- a/eachgapdist.h +++ b/eachgapdist.h @@ -18,6 +18,9 @@ class eachGapDist : public Dist { public: + eachGapDist() {} + eachGapDist(const eachGapDist& ddb) {} + void calcDist(Sequence A, Sequence B){ int diff = 0; int length = 0; diff --git a/eachgapignore.h b/eachgapignore.h index 824a8b8..2e7fbd6 100644 --- a/eachgapignore.h +++ b/eachgapignore.h @@ -17,6 +17,8 @@ class eachGapIgnoreTermGapDist : public Dist { public: + eachGapIgnoreTermGapDist() {} + eachGapIgnoreTermGapDist(const eachGapIgnoreTermGapDist& ddb) {} void calcDist(Sequence A, Sequence B){ int diff = 0; diff --git a/ignoregaps.h b/ignoregaps.h index d4e5ee5..ccc6b96 100644 --- a/ignoregaps.h +++ b/ignoregaps.h @@ -20,6 +20,9 @@ class ignoreGaps : public Dist { public: + ignoreGaps() {} + ignoreGaps(const ignoreGaps& ddb) {} + void calcDist(Sequence A, Sequence B){ int diff = 0; int length = 0; diff --git a/kmerdb.hpp b/kmerdb.hpp index becdeda..62d4836 100644 --- a/kmerdb.hpp +++ b/kmerdb.hpp @@ -26,6 +26,7 @@ class KmerDB : public Database { public: KmerDB(string, int); + KmerDB(const KmerDB& kdb) : kmerSize(kdb.kmerSize), maxKmer(kdb.maxKmer), count(kdb.count), kmerDBName(kdb.kmerDBName), kmerLocations(kdb.kmerLocations), Database(kdb) {} KmerDB(); ~KmerDB(); diff --git a/onegapdist.h b/onegapdist.h index cb63ae6..971e6b0 100644 --- a/onegapdist.h +++ b/onegapdist.h @@ -17,6 +17,10 @@ class oneGapDist : public Dist { public: + + oneGapDist() {} + oneGapDist(const oneGapDist& ddb) {} + void calcDist(Sequence A, Sequence B){ int difference = 0; diff --git a/onegapignore.h b/onegapignore.h index 32818e3..dba3f38 100644 --- a/onegapignore.h +++ b/onegapignore.h @@ -17,6 +17,10 @@ class oneGapIgnoreTermGapDist : public Dist { public: + + oneGapIgnoreTermGapDist() {} + oneGapIgnoreTermGapDist(const oneGapIgnoreTermGapDist& ddb) {} + void calcDist(Sequence A, Sequence B){ int difference = 0; diff --git a/parsimonycommand.cpp b/parsimonycommand.cpp index ff12092..66f8193 100644 --- a/parsimonycommand.cpp +++ b/parsimonycommand.cpp @@ -219,7 +219,7 @@ int ParsimonyCommand::execute() { read->AssembleTrees(); T = read->getTrees(); delete read; - + //make sure all files match //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size. int numNamesInTree; diff --git a/sequence.hpp b/sequence.hpp index 94d7d29..b433740 100644 --- a/sequence.hpp +++ b/sequence.hpp @@ -25,6 +25,8 @@ public: Sequence(string, string); Sequence(ifstream&); Sequence(istringstream&); + Sequence(const Sequence& se) : name(se.name), unaligned(se.unaligned), aligned(se.aligned), pairwise(se.pairwise), numBases(se.numBases), startPos(se.startPos), endPos(se.endPos), + alignmentLength(se.alignmentLength), isAligned(se.isAligned), longHomoPolymer(se.longHomoPolymer), ambigBases(se.ambigBases) { m = MothurOut::getInstance(); } //these constructors just set the unaligned string to save space Sequence(string, string, string); diff --git a/suffixdb.hpp b/suffixdb.hpp index 393b4a5..ca1ff42 100644 --- a/suffixdb.hpp +++ b/suffixdb.hpp @@ -20,14 +20,20 @@ #include "mothur.h" #include "database.hpp" - -class SuffixTree; +#include "suffixtree.hpp" +//class SuffixTree; class SuffixDB : public Database { public: SuffixDB(int); SuffixDB(); + SuffixDB(const SuffixDB& sdb) : count(sdb.count), Database(sdb) { + for (int i = 0; i < sdb.suffixForest.size(); i++) { + SuffixTree temp(sdb.suffixForest[i]); + suffixForest.push_back(temp); + } + } ~SuffixDB(); void generateDB() {}; //adding sequences generates the db diff --git a/suffixnodes.hpp b/suffixnodes.hpp index 6a22c4d..1e078a6 100644 --- a/suffixnodes.hpp +++ b/suffixnodes.hpp @@ -25,6 +25,7 @@ class SuffixNode { public: SuffixNode(int, int, int); + SuffixNode(const SuffixNode& sn) : parentNode(sn.parentNode), startCharPosition(sn.startCharPosition), endCharPosition(sn.endCharPosition) {m = MothurOut::getInstance();} virtual ~SuffixNode() {} virtual void print(string, int) = 0; virtual void setChildren(char, int); @@ -62,6 +63,7 @@ class SuffixBranch : public SuffixNode { public: SuffixBranch(int, int, int); + SuffixBranch(const SuffixBranch& sb) : suffixNode(sb.suffixNode), childNodes(sb.childNodes), SuffixNode(sb.parentNode, sb.startCharPosition, sb.endCharPosition) {} ~SuffixBranch() {} void print(string, int); // need a special method for printing the node because there are children void eraseChild(char); // need a special method for erasing the children diff --git a/suffixtree.cpp b/suffixtree.cpp index 9cd8351..fd18109 100644 --- a/suffixtree.cpp +++ b/suffixtree.cpp @@ -33,6 +33,25 @@ inline bool compareParents(SuffixNode* left, SuffixNode* right){// this is neces return (left->getParentNode() < right->getParentNode()); // nodes in order of their parent } +//******************************************************************************************************************** + +SuffixTree::SuffixTree(const SuffixTree& st) : root(st.root), activeEndPosition(st.activeEndPosition), activeStartPosition(st.activeStartPosition), activeNode(st.activeNode), + nodeCounter(st.nodeCounter), seqName(st.seqName), sequence(st.sequence) { + try { + m = MothurOut::getInstance(); + + for (int i = 0; i < st.nodeVector.size(); i++) { + SuffixNode* temp = new SuffixBranch(*((SuffixBranch*)st.nodeVector[i])); + nodeVector.push_back(temp); + } + + + }catch(exception& e) { + m->errorOut(e, "SuffixTree", "SuffixTree"); + exit(1); + } +} + //******************************************************************************************************************** SuffixTree::SuffixTree(){ m = MothurOut::getInstance(); } diff --git a/suffixtree.hpp b/suffixtree.hpp index 805b9b0..492db54 100644 --- a/suffixtree.hpp +++ b/suffixtree.hpp @@ -37,6 +37,7 @@ public: SuffixTree(); ~SuffixTree(); // SuffixTree(string, string); + SuffixTree(const SuffixTree&); void loadSequence(Sequence); string getSeqName(); diff --git a/treemap.cpp b/treemap.cpp index 7542385..1fc5c01 100644 --- a/treemap.cpp +++ b/treemap.cpp @@ -12,6 +12,7 @@ /************************************************************/ TreeMap::TreeMap(string filename) { + m = MothurOut::getInstance(); groupFileName = filename; m->openInputFile(filename, fileHandle); } @@ -20,28 +21,38 @@ TreeMap::~TreeMap(){} /************************************************************/ -void TreeMap::readMap() { +int TreeMap::readMap() { string seqName, seqGroup; - + int error = 0; + while(fileHandle){ - fileHandle >> seqName; //read from first column + fileHandle >> seqName; //read from first column fileHandle >> seqGroup; //read from second column - - namesOfSeqs.push_back(seqName); + + if (m->control_pressed) { fileHandle.close(); return 1; } + setNamesOfGroups(seqGroup); - treemap[seqName].groupname = seqGroup; //store data in map - - it2 = seqsPerGroup.find(seqGroup); - if (it2 == seqsPerGroup.end()) { //if it's a new group - seqsPerGroup[seqGroup] = 1; - }else {//it's a group we already have - seqsPerGroup[seqGroup]++; + map::iterator itCheck = treemap.find(seqName); + if (itCheck != treemap.end()) { error = 1; m->mothurOut("[WARNING]: Your groupfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); } + else { + namesOfSeqs.push_back(seqName); + treemap[seqName].groupname = seqGroup; //store data in map + + it2 = seqsPerGroup.find(seqGroup); + if (it2 == seqsPerGroup.end()) { //if it's a new group + seqsPerGroup[seqGroup] = 1; + }else {//it's a group we already have + seqsPerGroup[seqGroup]++; + } } - + m->gobble(fileHandle); } fileHandle.close(); + + + return error; } /************************************************************/ void TreeMap::addSeq(string seqName, string seqGroup) { diff --git a/treemap.h b/treemap.h index 9f10f37..244348e 100644 --- a/treemap.h +++ b/treemap.h @@ -28,7 +28,7 @@ public: TreeMap() { m = MothurOut::getInstance(); } TreeMap(string); ~TreeMap(); - void readMap(); + int readMap(); int getNumGroups(); int getNumSeqs(); void setIndex(string, int); //sequencename, index