From: westcott Date: Tue, 13 Sep 2011 10:08:39 +0000 (+0000) Subject: working on windows paralellization, added trimOligos class to be used by trim.flows... X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=ae57e166b2ed7b475ec3f466106bd76fabadd063 working on windows paralellization, added trimOligos class to be used by trim.flows and trim.seqs commands. Added sequenceParser class to divide fasta and names files by group. Added group and bygroup options to pre.cluster to allow you to pre.cluster within each group. --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index c9681af..3ccc0b9 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -317,11 +317,13 @@ A7E9B98D12D37EC400DA6239 /* weighted.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B87C12D37EC400DA6239 /* weighted.cpp */; }; A7E9B98E12D37EC400DA6239 /* weightedlinkage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B87E12D37EC400DA6239 /* weightedlinkage.cpp */; }; A7E9B98F12D37EC400DA6239 /* whittaker.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B87F12D37EC400DA6239 /* whittaker.cpp */; }; + A7F9F5CF141A5E500032F693 /* sequenceparser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7F9F5CE141A5E500032F693 /* sequenceparser.cpp */; }; A7FA10021302E097003860FE /* mantelcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7FA10011302E096003860FE /* mantelcommand.cpp */; }; A7FC480E12D788F20055BC5C /* linearalgebra.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7FC480D12D788F20055BC5C /* linearalgebra.cpp */; }; A7FC486712D795D60055BC5C /* pcacommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7FC486612D795D60055BC5C /* pcacommand.cpp */; }; A7FE7C401330EA1000F7B327 /* getcurrentcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7FE7C3F1330EA1000F7B327 /* getcurrentcommand.cpp */; }; A7FE7E6D13311EA400F7B327 /* setcurrentcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7FE7E6C13311EA400F7B327 /* setcurrentcommand.cpp */; }; + A7FF19F2140FFDA500AD216D /* trimoligos.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7FF19F1140FFDA500AD216D /* trimoligos.cpp */; }; /* End PBXBuildFile section */ /* Begin PBXCopyFilesBuildPhase section */ @@ -985,6 +987,8 @@ A7E9B87E12D37EC400DA6239 /* weightedlinkage.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = weightedlinkage.cpp; sourceTree = SOURCE_ROOT; }; A7E9B87F12D37EC400DA6239 /* whittaker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = whittaker.cpp; sourceTree = ""; }; A7E9B88012D37EC400DA6239 /* whittaker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = whittaker.h; sourceTree = ""; }; + A7F9F5CD141A5E500032F693 /* sequenceparser.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sequenceparser.h; sourceTree = ""; }; + A7F9F5CE141A5E500032F693 /* sequenceparser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sequenceparser.cpp; sourceTree = ""; }; A7FA10001302E096003860FE /* mantelcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mantelcommand.h; sourceTree = ""; }; A7FA10011302E096003860FE /* mantelcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mantelcommand.cpp; sourceTree = ""; }; A7FC480C12D788F20055BC5C /* linearalgebra.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = linearalgebra.h; sourceTree = ""; }; @@ -995,6 +999,8 @@ A7FE7C3F1330EA1000F7B327 /* getcurrentcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getcurrentcommand.cpp; sourceTree = ""; }; A7FE7E6B13311EA400F7B327 /* setcurrentcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = setcurrentcommand.h; sourceTree = ""; }; A7FE7E6C13311EA400F7B327 /* setcurrentcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = setcurrentcommand.cpp; sourceTree = ""; }; + A7FF19F0140FFDA500AD216D /* trimoligos.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = trimoligos.h; sourceTree = ""; }; + A7FF19F1140FFDA500AD216D /* trimoligos.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = trimoligos.cpp; sourceTree = ""; }; C6A0FF2C0290799A04C91782 /* mothur.1 */ = {isa = PBXFileReference; lastKnownFileType = text.man; path = mothur.1; sourceTree = ""; }; /* End PBXFileReference section */ @@ -1103,6 +1109,8 @@ A7E9B82D12D37EC400DA6239 /* singlelinkage.cpp */, A7E9B83012D37EC400DA6239 /* slibshuff.cpp */, A7E9B83112D37EC400DA6239 /* slibshuff.h */, + A7FF19F0140FFDA500AD216D /* trimoligos.h */, + A7FF19F1140FFDA500AD216D /* trimoligos.cpp */, A7E9B87412D37EC400DA6239 /* validcalculator.cpp */, A7E9B87512D37EC400DA6239 /* validcalculator.h */, A7E9B87612D37EC400DA6239 /* validparameter.cpp */, @@ -1382,9 +1390,9 @@ A7E9B7E412D37EC400DA6239 /* sffinfocommand.h */, A7E9B7E312D37EC400DA6239 /* sffinfocommand.cpp */, A7E9B7F312D37EC400DA6239 /* sharedcommand.h */, - A7E9B7F212D37EC400DA6239 /* sharedcommand.cpp */, A7E9B82812D37EC400DA6239 /* shhhercommand.h */, A7E9B82712D37EC400DA6239 /* shhhercommand.cpp */, + A7E9B7F212D37EC400DA6239 /* sharedcommand.cpp */, A7E9B84012D37EC400DA6239 /* splitabundcommand.h */, A7E9B83F12D37EC400DA6239 /* splitabundcommand.cpp */, A7E9B84212D37EC400DA6239 /* splitgroupscommand.h */, @@ -1625,6 +1633,8 @@ A7E9B7DC12D37EC400DA6239 /* sequence.hpp */, A7E9B7DD12D37EC400DA6239 /* sequencedb.cpp */, A7E9B7DE12D37EC400DA6239 /* sequencedb.h */, + A7F9F5CD141A5E500032F693 /* sequenceparser.h */, + A7F9F5CE141A5E500032F693 /* sequenceparser.cpp */, A7E9B80412D37EC400DA6239 /* sharedlistvector.cpp */, A7E9B80512D37EC400DA6239 /* sharedlistvector.h */, A7E9B80E12D37EC400DA6239 /* sharedordervector.h */, @@ -2137,6 +2147,8 @@ A73DDC3813C4BF64006AAE38 /* mothurmetastats.cpp in Sources */, A79234D713C74BF6002B08E2 /* mothurfisher.cpp in Sources */, A795840D13F13CD900F201D5 /* countgroupscommand.cpp in Sources */, + A7FF19F2140FFDA500AD216D /* trimoligos.cpp in Sources */, + A7F9F5CF141A5E500032F693 /* sequenceparser.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/aligncommand.cpp b/aligncommand.cpp index 05ecd46..960d702 100644 --- a/aligncommand.cpp +++ b/aligncommand.cpp @@ -322,7 +322,7 @@ int AlignCommand::execute(){ #ifdef USE_MPI int pid, numSeqsPerProcessor; int tag = 2001; - vector MPIPos; + vector MPIPos; MPIWroteAccnos = false; MPI_Status status; @@ -426,7 +426,7 @@ int AlignCommand::execute(){ #else - vector positions; + vector positions; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) positions = m->divideFile(candidateFileNames[s], processors); for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(new linePair(positions[i], positions[(i+1)])); } @@ -623,7 +623,7 @@ int AlignCommand::driver(linePair* filePos, string alignFName, string reportFNam delete candidateSeq; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - unsigned long int pos = inFASTA.tellg(); + unsigned long long pos = inFASTA.tellg(); if ((pos == -1) || (pos >= filePos->end)) { break; } #else if (inFASTA.eof()) { break; } @@ -650,7 +650,7 @@ int AlignCommand::driver(linePair* filePos, string alignFName, string reportFNam } //********************************************************************************************************************** #ifdef USE_MPI -int AlignCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& alignFile, MPI_File& reportFile, MPI_File& accnosFile, vector& MPIPos){ +int AlignCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& alignFile, MPI_File& reportFile, MPI_File& accnosFile, vector& MPIPos){ try { string outputString = ""; MPI_Status statusReport; diff --git a/aligncommand.h b/aligncommand.h index 4a5eb11..6236958 100644 --- a/aligncommand.h +++ b/aligncommand.h @@ -45,9 +45,9 @@ public: private: struct linePair { - unsigned long int start; - unsigned long int end; - linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} + unsigned long long start; + unsigned long long end; + linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {} }; vector processIDS; //processid vector lines; @@ -61,7 +61,7 @@ private: void appendReportFiles(string, string); #ifdef USE_MPI - int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector&); + int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector&); #endif string candidateFileName, templateFileName, distanceFileName, search, align, outputDir; @@ -86,15 +86,15 @@ typedef struct alignData { string align; string search; bool flip; - unsigned long int start; - unsigned long int end; + unsigned long long start; + unsigned long long end; MothurOut* m; //AlignmentDB* templateDB; float match, misMatch, gapOpen, gapExtend, threshold; int count, kmerSize, threadID; alignData(){} - alignData(string a, string r, string ac, string f, string al, string se, int ks, MothurOut* mout, unsigned long int st, unsigned long int en, bool fl, float ma, float misMa, float gapO, float gapE, float thr, int tid) { + alignData(string a, string r, string ac, string f, string al, string se, int ks, MothurOut* mout, unsigned long long st, unsigned long long en, bool fl, float ma, float misMa, float gapO, float gapE, float thr, int tid) { alignFName = a; reportFName = r; accnosFName = ac; diff --git a/alignmentdb.cpp b/alignmentdb.cpp index 57975d6..df4eaec 100644 --- a/alignmentdb.cpp +++ b/alignmentdb.cpp @@ -77,7 +77,7 @@ AlignmentDB::AlignmentDB(string fastaFileName, string s, int kmerSize, float gap #ifdef USE_MPI int pid, processors; - vector positions; + vector positions; MPI_Status status; MPI_File inMPI; diff --git a/amovacommand.cpp b/amovacommand.cpp index 1416733..de277ab 100644 --- a/amovacommand.cpp +++ b/amovacommand.cpp @@ -174,10 +174,17 @@ int AmovaCommand::execute(){ //link designMap to rows/columns in distance matrix map > origGroupSampleMap; for(int i=0;igetGroup(sampleNames[i])].push_back(i); + string group = designMap->getGroup(sampleNames[i]); + + if (group == "not found") { + m->mothurOut("[ERROR]: " + sampleNames[i] + " is not in your design file, please correct."); m->mothurOutEndLine(); m->control_pressed = true; + }else { origGroupSampleMap[group].push_back(i); } + } int numGroups = origGroupSampleMap.size(); + if (m->control_pressed) { delete designMap; return 0; } + //create a new filename ofstream AMOVAFile; string AMOVAFileName = outputDir + m->getRootName(m->getSimpleName(phylipFileName)) + "amova"; diff --git a/anosimcommand.cpp b/anosimcommand.cpp index c5cb4d6..50b6f28 100644 --- a/anosimcommand.cpp +++ b/anosimcommand.cpp @@ -176,10 +176,16 @@ int AnosimCommand::execute(){ //link designMap to rows/columns in distance matrix map > origGroupSampleMap; for(int i=0;igetGroup(sampleNames[i])].push_back(i); + string group = designMap->getGroup(sampleNames[i]); + + if (group == "not found") { + m->mothurOut("[ERROR]: " + sampleNames[i] + " is not in your design file, please correct."); m->mothurOutEndLine(); m->control_pressed = true; + }else { origGroupSampleMap[group].push_back(i); } } int numGroups = origGroupSampleMap.size(); + if (m->control_pressed) { delete designMap; return 0; } + //create a new filename ofstream ANOSIMFile; string ANOSIMFileName = outputDir + m->getRootName(m->getSimpleName(phylipFileName)) + "anosim"; diff --git a/bayesian.cpp b/bayesian.cpp index a12afed..d62bb23 100644 --- a/bayesian.cpp +++ b/bayesian.cpp @@ -19,6 +19,7 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { threadID = tid; string baseName = tempFile; + if (baseName == "saved") { baseName = rdb->getSavedReference(); } string baseTName = tfile; @@ -49,7 +50,7 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { } //if you want to save, but you dont need to calculate then just read - if (rdb->save && probFileTest && probFileTest2 && phyloTreeTest && probFileTest3 && FilesGood) { + if (rdb->save && probFileTest && probFileTest2 && phyloTreeTest && probFileTest3 && FilesGood && (tempFile != "saved")) { ifstream saveIn; m->openInputFile(tempFile, saveIn); @@ -445,8 +446,8 @@ void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string #ifdef USE_MPI int pid, num, num2, processors; - vector positions; - vector positions2; + vector positions; + vector positions2; MPI_Status status; MPI_File inMPI; @@ -575,7 +576,7 @@ void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string string line = m->getline(in); m->gobble(in); in >> numKmers; m->gobble(in); - + //cout << threadID << '\t' << line << '\t' << numKmers << &in << '\t' << &inNum << '\t' << genusNodes.size() << endl; //initialze probabilities wordGenusProb.resize(numKmers); @@ -588,22 +589,25 @@ void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string //read version string line2 = m->getline(inNum); m->gobble(inNum); - + //cout << threadID << '\t' << line2 << '\t' << this << endl; while (inNum) { inNum >> zeroCountProb[count] >> num[count]; count++; m->gobble(inNum); + //cout << threadID << '\t' << count << endl; } inNum.close(); - + //cout << threadID << '\t' << "here1 " << &wordGenusProb << '\t' << &num << endl; // + //cout << threadID << '\t' << &genusTotals << '\t' << endl; + //cout << threadID << '\t' << genusNodes.size() << endl; while(in) { in >> kmer; - + //cout << threadID << '\t' << kmer << endl; //set them all to zero value for (int i = 0; i < genusNodes.size(); i++) { wordGenusProb[kmer][i] = log(zeroCountProb[kmer] / (float) (genusTotals[i]+1)); } - + //cout << threadID << '\t' << num[kmer] << "here" << endl; //get probs for nonzero values for (int i = 0; i < num[kmer]; i++) { in >> name >> prob; @@ -613,7 +617,7 @@ void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string m->gobble(in); } in.close(); - + //cout << threadID << '\t' << "here" << endl; #endif } catch(exception& e) { diff --git a/bellerophon.cpp b/bellerophon.cpp index 207fa51..5a0a444 100644 --- a/bellerophon.cpp +++ b/bellerophon.cpp @@ -246,7 +246,7 @@ int Bellerophon::getChimeras() { numSeqsPerProcessor = iters / processors; //each process hits this only once - unsigned long int startPos = pid * numSeqsPerProcessor; + unsigned long long startPos = pid * numSeqsPerProcessor; if(pid == processors - 1){ numSeqsPerProcessor = iters - pid * numSeqsPerProcessor; } @@ -326,7 +326,7 @@ int Bellerophon::getChimeras() { int numSeqsPerProcessor = iters / processors; for (int i = 0; i < processors; i++) { - unsigned long int startPos = i * numSeqsPerProcessor; + unsigned long long startPos = i * numSeqsPerProcessor; if(i == processors - 1){ numSeqsPerProcessor = iters - i * numSeqsPerProcessor; } diff --git a/bellerophon.h b/bellerophon.h index 08dca2b..36555bf 100644 --- a/bellerophon.h +++ b/bellerophon.h @@ -36,9 +36,9 @@ class Bellerophon : public Chimera { private: struct linePair { - unsigned long int start; + unsigned long long start; int num; - linePair(unsigned long int i, int j) : start(i), num(j) {} + linePair(unsigned long long i, int j) : start(i), num(j) {} }; vector lines; diff --git a/chimera.cpp b/chimera.cpp index ed84397..7158416 100644 --- a/chimera.cpp +++ b/chimera.cpp @@ -124,7 +124,7 @@ vector Chimera::readSeqs(string file) { #ifdef USE_MPI int pid, processors; - vector positions; + vector positions; int numSeqs; int tag = 2001; diff --git a/chimera.h b/chimera.h index 7e327dd..1df4d6b 100644 --- a/chimera.h +++ b/chimera.h @@ -133,9 +133,9 @@ struct sim { }; struct linePair { - unsigned long int start; - unsigned long int end; - linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} + unsigned long long start; + unsigned long long end; + linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {} linePair(){} }; diff --git a/chimeraccodecommand.cpp b/chimeraccodecommand.cpp index 7c8485a..321ae77 100644 --- a/chimeraccodecommand.cpp +++ b/chimeraccodecommand.cpp @@ -299,7 +299,7 @@ int ChimeraCcodeCommand::execute(){ int pid, numSeqsPerProcessor; int tag = 2001; - vector MPIPos; + vector MPIPos; MPI_Status status; MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are @@ -390,7 +390,7 @@ int ChimeraCcodeCommand::execute(){ outHeader.close(); - vector positions = m->divideFile(fastaFileNames[s], processors); + vector positions = m->divideFile(fastaFileNames[s], processors); for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(new linePair(positions[i], positions[(i+1)])); @@ -522,7 +522,7 @@ int ChimeraCcodeCommand::driver(linePair* filePos, string outputFName, string fi delete candidateSeq; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - unsigned long int pos = inFASTA.tellg(); + unsigned long long pos = inFASTA.tellg(); if ((pos == -1) || (pos >= filePos->end)) { break; } #else if (inFASTA.eof()) { break; } @@ -547,7 +547,7 @@ int ChimeraCcodeCommand::driver(linePair* filePos, string outputFName, string fi } //********************************************************************************************************************** #ifdef USE_MPI -int ChimeraCcodeCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, vector& MPIPos){ +int ChimeraCcodeCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, vector& MPIPos){ try { MPI_Status status; diff --git a/chimeraccodecommand.h b/chimeraccodecommand.h index 87f5ee8..5b8092b 100644 --- a/chimeraccodecommand.h +++ b/chimeraccodecommand.h @@ -36,9 +36,9 @@ public: private: struct linePair { - unsigned long int start; - unsigned long int end; - linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} + unsigned long long start; + unsigned long long end; + linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {} }; vector processIDS; //processid vector lines; @@ -47,7 +47,7 @@ private: int createProcesses(string, string, string); #ifdef USE_MPI - int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector&); + int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector&); #endif bool abort, filter, save; diff --git a/chimeracheckcommand.cpp b/chimeracheckcommand.cpp index 2b88eb5..3a530fd 100644 --- a/chimeracheckcommand.cpp +++ b/chimeracheckcommand.cpp @@ -349,7 +349,7 @@ int ChimeraCheckCommand::execute(){ int pid, numSeqsPerProcessor; int tag = 2001; - vector MPIPos; + vector MPIPos; MPI_Status status; MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are @@ -424,7 +424,7 @@ int ChimeraCheckCommand::execute(){ MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case #else - vector positions = m->divideFile(fastaFileNames[i], processors); + vector positions = m->divideFile(fastaFileNames[i], processors); for (int s = 0; s < (positions.size()-1); s++) { lines.push_back(new linePair(positions[s], positions[(s+1)])); @@ -520,7 +520,7 @@ int ChimeraCheckCommand::driver(linePair* filePos, string outputFName, string fi delete candidateSeq; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - unsigned long int pos = inFASTA.tellg(); + unsigned long long pos = inFASTA.tellg(); if ((pos == -1) || (pos >= filePos->end)) { break; } #else if (inFASTA.eof()) { break; } @@ -544,7 +544,7 @@ int ChimeraCheckCommand::driver(linePair* filePos, string outputFName, string fi } //********************************************************************************************************************** #ifdef USE_MPI -int ChimeraCheckCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, vector& MPIPos){ +int ChimeraCheckCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, vector& MPIPos){ try { MPI_File outAccMPI; MPI_Status status; diff --git a/chimeracheckcommand.h b/chimeracheckcommand.h index 350e10e..918b4e5 100644 --- a/chimeracheckcommand.h +++ b/chimeracheckcommand.h @@ -38,9 +38,9 @@ public: private: struct linePair { - unsigned long int start; - unsigned long int end; - linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} + unsigned long long start; + unsigned long long end; + linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {} }; vector processIDS; //processid @@ -50,7 +50,7 @@ private: int createProcesses(string, string); #ifdef USE_MPI - int driverMPI(int, int, MPI_File&, MPI_File&, vector&); + int driverMPI(int, int, MPI_File&, MPI_File&, vector&); #endif bool abort, svg, save; diff --git a/chimerapintailcommand.cpp b/chimerapintailcommand.cpp index 33ac69a..76dd0da 100644 --- a/chimerapintailcommand.cpp +++ b/chimerapintailcommand.cpp @@ -416,7 +416,7 @@ int ChimeraPintailCommand::execute(){ #ifdef USE_MPI int pid, numSeqsPerProcessor; int tag = 2001; - vector MPIPos; + vector MPIPos; MPI_Status status; MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are @@ -486,7 +486,7 @@ int ChimeraPintailCommand::execute(){ MPI_File_close(&outMPIAccnos); MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case #else - vector positions = m->divideFile(fastaFileNames[s], processors); + vector positions = m->divideFile(fastaFileNames[s], processors); for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(new linePair(positions[i], positions[(i+1)])); @@ -610,7 +610,7 @@ int ChimeraPintailCommand::driver(linePair* filePos, string outputFName, string delete candidateSeq; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - unsigned long int pos = inFASTA.tellg(); + unsigned long long pos = inFASTA.tellg(); if ((pos == -1) || (pos >= filePos->end)) { break; } #else if (inFASTA.eof()) { break; } @@ -635,7 +635,7 @@ int ChimeraPintailCommand::driver(linePair* filePos, string outputFName, string } //********************************************************************************************************************** #ifdef USE_MPI -int ChimeraPintailCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, vector& MPIPos){ +int ChimeraPintailCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, vector& MPIPos){ try { MPI_Status status; diff --git a/chimerapintailcommand.h b/chimerapintailcommand.h index 9e1d6f8..ddbb648 100644 --- a/chimerapintailcommand.h +++ b/chimerapintailcommand.h @@ -38,9 +38,9 @@ private: ReferenceDB* rdb; struct linePair { - unsigned long int start; - unsigned long int end; - linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} + unsigned long long start; + unsigned long long end; + linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {} }; vector processIDS; //processid @@ -50,7 +50,7 @@ private: int createProcesses(string, string, string); #ifdef USE_MPI - int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector&); + int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector&); #endif bool abort, filter, save; diff --git a/chimeraslayercommand.cpp b/chimeraslayercommand.cpp index 0732bc2..0958630 100644 --- a/chimeraslayercommand.cpp +++ b/chimeraslayercommand.cpp @@ -509,7 +509,7 @@ int ChimeraSlayerCommand::execute(){ string trimFastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "slayer.fasta"; //create chimera here if you are mac or linux because fork will copy for you. Create in create processes instead if you are windows. - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || USE_MPI if (templatefile != "self") { //you want to run slayer with a reference template chimera = new ChimeraSlayer(fastaFileNames[s], templatefile, trim, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign, blastlocation, rand()); }else { @@ -546,7 +546,7 @@ int ChimeraSlayerCommand::execute(){ #ifdef USE_MPI int pid, numSeqsPerProcessor; int tag = 2001; - vector MPIPos; + vector MPIPos; MPI_Status status; MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are @@ -659,7 +659,7 @@ int ChimeraSlayerCommand::execute(){ #else //break up file - vector positions; + vector positions; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) positions = m->divideFile(fastaFileNames[s], processors); for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(new linePair(positions[i], positions[(i+1)])); } @@ -716,7 +716,7 @@ int ChimeraSlayerCommand::execute(){ #endif - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || USE_MPI delete chimera; #endif @@ -849,7 +849,7 @@ int ChimeraSlayerCommand::driver(linePair* filePos, string outputFName, string f } #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - unsigned long int pos = inFASTA.tellg(); + unsigned long long pos = inFASTA.tellg(); if ((pos == -1) || (pos >= filePos->end)) { break; } #else if (inFASTA.eof()) { break; } @@ -876,7 +876,7 @@ int ChimeraSlayerCommand::driver(linePair* filePos, string outputFName, string f } //********************************************************************************************************************** #ifdef USE_MPI -int ChimeraSlayerCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, MPI_File& outFastaMPI, vector& MPIPos){ +int ChimeraSlayerCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, MPI_File& outFastaMPI, vector& MPIPos){ try { MPI_Status status; int pid; diff --git a/chimeraslayercommand.h b/chimeraslayercommand.h index a54b24f..b37d4c0 100644 --- a/chimeraslayercommand.h +++ b/chimeraslayercommand.h @@ -36,9 +36,9 @@ public: private: struct linePair { - unsigned long int start; - unsigned long int end; - linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} + unsigned long long start; + unsigned long long end; + linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {} }; vector processIDS; //processid @@ -51,7 +51,7 @@ private: map sortFastaFile(string, string); #ifdef USE_MPI - int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector&); + int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector&); #endif bool abort, realign, trim, trimera, save; @@ -81,8 +81,8 @@ typedef struct slayerData { string blastlocation; bool trimera; bool trim, realign; - unsigned long int start; - unsigned long int end; + unsigned long long start; + unsigned long long end; int ksize, match, mismatch, window, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted; MothurOut* m; float divR; @@ -92,7 +92,7 @@ typedef struct slayerData { int threadId; slayerData(){} - slayerData(string o, string fa, string ac, string f, string te, string se, string bl, bool tri, bool trm, bool re, MothurOut* mout, unsigned long int st, unsigned long int en, int ks, int ma, int mis, int win, int minS, int minC, int miBS, int minSN, int par, int it, int inc, int numw, float div, map prior, int tid) { + slayerData(string o, string fa, string ac, string f, string te, string se, string bl, bool tri, bool trm, bool re, MothurOut* mout, unsigned long long st, unsigned long long en, int ks, int ma, int mis, int win, int minS, int minC, int miBS, int minSN, int par, int it, int inc, int numw, float div, map prior, int tid) { outputFName = o; fasta = fa; accnos = ac; diff --git a/classify.cpp b/classify.cpp index e31e8cc..eb0865c 100644 --- a/classify.cpp +++ b/classify.cpp @@ -89,7 +89,7 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me m->mothurOut("Generating search database... "); cout.flush(); #ifdef USE_MPI int pid, processors; - vector positions; + vector positions; int tag = 2001; MPI_Status status; @@ -252,7 +252,7 @@ int Classify::readTaxonomy(string file) { #ifdef USE_MPI int pid, num, processors; - vector positions; + vector positions; int tag = 2001; MPI_Status status; diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index 4291132..3b7b4a8 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -8,11 +8,6 @@ */ #include "classifyseqscommand.h" -#include "sequence.hpp" -#include "bayesian.h" -#include "phylotree.h" -#include "phylosummary.h" -#include "knn.h" @@ -383,6 +378,11 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { temp = validParameter.validFile(parameters, "save", false); if (temp == "not found"){ temp = "f"; } save = m->isTrue(temp); + //this is so the threads can quickly load the reference data + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + #else + if (processors != 1) { save = true; } + #endif rdb->save = save; if (save) { //clear out old references rdb->clearMemory(); @@ -484,7 +484,6 @@ int ClassifySeqsCommand::execute(){ } if (m->control_pressed) { delete classify; return 0; } - for (int s = 0; s < fastaFileNames.size(); s++) { @@ -518,7 +517,7 @@ int ClassifySeqsCommand::execute(){ #ifdef USE_MPI int pid, numSeqsPerProcessor; int tag = 2001; - vector MPIPos; + vector MPIPos; MPI_Status status; MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are @@ -544,7 +543,7 @@ int ClassifySeqsCommand::execute(){ MPI_File_open(MPI_COMM_WORLD, outNewTax, outMode, MPI_INFO_NULL, &outMPINewTax); MPI_File_open(MPI_COMM_WORLD, outTempTax, outMode, MPI_INFO_NULL, &outMPITempTax); - if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); delete classify; return 0; } + if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); delete classify; return 0; } if (pid == 0) { //you are the root process @@ -599,25 +598,30 @@ int ClassifySeqsCommand::execute(){ #else - vector positions = m->divideFile(fastaFileNames[s], processors); + vector positions; +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + positions = m->divideFile(fastaFileNames[s], processors); + for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(new linePair(positions[i], positions[(i+1)])); } +#else + if (processors == 1) { + lines.push_back(new linePair(0, 1000)); + }else { + positions = m->setFilePosFasta(fastaFileNames[s], numFastaSeqs); - for (int i = 0; i < (positions.size()-1); i++) { - lines.push_back(new linePair(positions[i], positions[(i+1)])); - } - - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + //figure out how many sequences you have to process + int numSeqsPerProcessor = numFastaSeqs / processors; + for (int i = 0; i < processors; i++) { + int startIndex = i * numSeqsPerProcessor; + if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; } + lines.push_back(new linePair(positions[startIndex], numSeqsPerProcessor)); + } + } +#endif if(processors == 1){ numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); - } - else{ - processIDS.resize(0); - + }else{ numFastaSeqs = createProcesses(newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); - } - #else - numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); - #endif #endif m->mothurOutEndLine(); @@ -746,6 +750,7 @@ int ClassifySeqsCommand::execute(){ } delete classify; + return 0; } catch(exception& e) { @@ -787,9 +792,12 @@ string ClassifySeqsCommand::addUnclassifieds(string tax, int maxlevel) { int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, string filename) { try { + + int num = 0; + processIDS.clear(); + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) int process = 1; - int num = 0; //loop through and create all the processes you want while (process != processors) { @@ -832,6 +840,46 @@ int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; } in.close(); m->mothurRemove(m->getFullPathName(tempFile)); } +#else + ////////////////////////////////////////////////////////////////////////////////////////////////////// + //Windows version shared memory, so be careful when passing variables through the alignData struct. + //Above fork() will clone, so memory is separate, but that's not the case with windows, + ////////////////////////////////////////////////////////////////////////////////////////////////////// + + vector pDataArray; + DWORD dwThreadIdArray[processors-1]; + HANDLE hThreadArray[processors-1]; + + //Create processor worker threads. + for( int i=0; istart, lines[i]->end, match, misMatch, gapOpen, gapExtend, cutoff, i); + pDataArray.push_back(tempclass); + + //MySeqSumThreadFunction is in header. It must be global or static to work with the threads. + //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier + hThreadArray[i] = CreateThread(NULL, 0, MyClassThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]); + + } + + //parent does its part + num = driver(lines[processors-1], taxFileName + toString(processors-1) + ".temp", tempTaxFile + toString(processors-1) + ".temp", filename); + processIDS.push_back((processors-1)); + + //Wait until all threads have terminated. + WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE); + + //Close all thread handles and free memory allocations. + for(int i=0; i < pDataArray.size(); i++){ + num += pDataArray[i]->count; + CloseHandle(hThreadArray[i]); + delete pDataArray[i]; + } + + #endif for(int i=0;ierrorOut(e, "ClassifySeqsCommand", "createProcesses"); @@ -918,7 +966,7 @@ int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempT delete candidateSeq; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - unsigned long int pos = inFASTA.tellg(); + unsigned long long pos = inFASTA.tellg(); if ((pos == -1) || (pos >= filePos->end)) { break; } #else if (inFASTA.eof()) { break; } @@ -944,7 +992,7 @@ int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempT } //********************************************************************************************************************** #ifdef USE_MPI -int ClassifySeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& newFile, MPI_File& tempFile, vector& MPIPos){ +int ClassifySeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& newFile, MPI_File& tempFile, vector& MPIPos){ try { MPI_Status statusNew; MPI_Status statusTemp; diff --git a/classifyseqscommand.h b/classifyseqscommand.h index 3dbf06e..58de969 100644 --- a/classifyseqscommand.h +++ b/classifyseqscommand.h @@ -14,6 +14,12 @@ #include "command.hpp" #include "classify.h" #include "referencedb.h" +#include "sequence.hpp" +#include "bayesian.h" +#include "phylotree.h" +#include "phylosummary.h" +#include "knn.h" + //KNN and Bayesian methods modeled from algorithms in //Naı¨ve Bayesian Classifier for Rapid Assignment of rRNA Sequences @@ -46,9 +52,9 @@ public: private: struct linePair { - unsigned long int start; - unsigned long int end; - linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} + unsigned long long start; + unsigned long long end; + linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {} }; vector processIDS; //processid @@ -75,9 +81,140 @@ private: int MPIReadNamesFile(string); #ifdef USE_MPI - int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector&); + int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector&); #endif }; +/**************************************************************************************************/ +//custom data structure for threads to use. +// This is passed by void pointer so it can be any data type +// that can be passed using a single void pointer (LPVOID). +typedef struct classifyData { + string taxFName; + string tempTFName; + string filename; + string search, taxonomyFileName, templateFileName, method; + unsigned long long start; + unsigned long long end; + MothurOut* m; + float match, misMatch, gapOpen, gapExtend; + int count, kmerSize, threadID, cutoff, iters, numWanted; + bool probs; + + classifyData(){} + classifyData(bool p, string me, string te, string tx, string a, string r, string f, string se, int ks, int i, int numW, MothurOut* mout, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int cut, int tid) { + taxonomyFileName = tx; + templateFileName = te; + taxFName = a; + tempTFName = r; + filename = f; + search = se; + method = me; + m = mout; + start = st; + end = en; + match = ma; + misMatch = misMa; + gapOpen = gapO; + gapExtend = gapE; + kmerSize = ks; + cutoff = cut; + iters = i; + numWanted = numW; + threadID = tid; + probs = p; + count = 0; + } +}; + +/**************************************************************************************************/ +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#else +static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){ + classifyData* pDataArray; + pDataArray = (classifyData*)lpParam; + + try { + ofstream outTax; + pDataArray->m->openOutputFile(pDataArray->taxFName, outTax); + + ofstream outTaxSimple; + pDataArray->m->openOutputFile(pDataArray->tempTFName, outTaxSimple); + + ifstream inFASTA; + pDataArray->m->openInputFile(pDataArray->filename, inFASTA); + + string taxonomy; + + //print header if you are process 0 + if ((pDataArray->start == 0) || (pDataArray->start == 1)) { + inFASTA.seekg(0); + }else { //this accounts for the difference in line endings. + inFASTA.seekg(pDataArray->start-1); pDataArray->m->gobble(inFASTA); + } + + pDataArray->count = pDataArray->end; + + //make classify + Classify* myclassify; + if(pDataArray->method == "bayesian"){ myclassify = new Bayesian("saved", "saved", pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID); } + else if(pDataArray->method == "knn"){ myclassify = new Knn("saved", "saved", pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID); } + else { + pDataArray->m->mothurOut(pDataArray->search + " is not a valid method option. I will run the command using bayesian."); + pDataArray->m->mothurOutEndLine(); + myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID); + } + + if (pDataArray->m->control_pressed) { delete myclassify; return 0; } + + int count = 0; + for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process + + if (pDataArray->m->control_pressed) { delete myclassify; return 0; } + + Sequence* candidateSeq = new Sequence(inFASTA); pDataArray->m->gobble(inFASTA); + + if (candidateSeq->getName() != "") { + + taxonomy = myclassify->getTaxonomy(candidateSeq); + + if (pDataArray->m->control_pressed) { delete candidateSeq; return 0; } + + if (taxonomy != "bad seq") { + //output confidence scores or not + if (pDataArray->probs) { + outTax << candidateSeq->getName() << '\t' << taxonomy << endl; + }else{ + outTax << candidateSeq->getName() << '\t' << myclassify->getSimpleTax() << endl; + } + + outTaxSimple << candidateSeq->getName() << '\t' << myclassify->getSimpleTax() << endl; + } + count++; + } + delete candidateSeq; + //report progress + if((count) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(count)); pDataArray->m->mothurOutEndLine(); } + + } + //report progress + if((count) % 100 != 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(count)); pDataArray->m->mothurOutEndLine(); } + + delete myclassify; + inFASTA.close(); + outTax.close(); + outTaxSimple.close(); + + } + catch(exception& e) { + pDataArray->m->errorOut(e, "ClassifySeqsCommand", "MyClassThreadFunction"); + exit(1); + } +} +#endif + + + + #endif diff --git a/clustercommand.cpp b/clustercommand.cpp index 059f277..678912c 100644 --- a/clustercommand.cpp +++ b/clustercommand.cpp @@ -326,7 +326,7 @@ int ClusterCommand::execute(){ cout.flush(); print_start = false; } - + if(previousDist <= 0.0000){ printData("unique"); } diff --git a/clusterfragmentscommand.cpp b/clusterfragmentscommand.cpp index 5c8d12b..8d2d3ea 100644 --- a/clusterfragmentscommand.cpp +++ b/clusterfragmentscommand.cpp @@ -226,7 +226,7 @@ int ClusterFragmentsCommand::execute(){ string fileroot = outputDir + m->getRootName(m->getSimpleName(fastafile)); string newFastaFile = fileroot + "fragclust.fasta"; - string newNamesFile = fileroot + "names"; + string newNamesFile = fileroot + "fragclust.names"; if (m->control_pressed) { return 0; } diff --git a/deconvolutecommand.cpp b/deconvolutecommand.cpp index bc2dd48..3fa622f 100644 --- a/deconvolutecommand.cpp +++ b/deconvolutecommand.cpp @@ -162,6 +162,7 @@ int DeconvoluteCommand::execute() { map::iterator itStrings; set nameInFastaFile; //for sanity checking set::iterator itname; + vector nameFileOrder; int count = 0; while (!in.eof()) { @@ -189,8 +190,9 @@ int DeconvoluteCommand::execute() { m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file, and not in your namefile, please correct."); m->mothurOutEndLine(); }else { sequenceStrings[seq.getAligned()] = itNames->second; + nameFileOrder.push_back(seq.getAligned()); } - }else { sequenceStrings[seq.getAligned()] = seq.getName(); } + }else { sequenceStrings[seq.getAligned()] = seq.getName(); nameFileOrder.push_back(seq.getAligned()); } }else { //this is a dup if (oldNameMapFName != "") { itNames = nameMap.find(seq.getName()); @@ -222,17 +224,22 @@ int DeconvoluteCommand::execute() { ofstream outNames; m->openOutputFile(outNameFile, outNames); - for (itStrings = sequenceStrings.begin(); itStrings != sequenceStrings.end(); itStrings++) { + for (int i = 0; i < nameFileOrder.size(); i++) { + //for (itStrings = sequenceStrings.begin(); itStrings != sequenceStrings.end(); itStrings++) { if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outFastaFile); outNames.close(); m->mothurRemove(outNameFile); return 0; } - //get rep name - int pos = (itStrings->second).find_first_of(','); + itStrings = sequenceStrings.find(nameFileOrder[i]); - if (pos == string::npos) { // only reps itself - outNames << itStrings->second << '\t' << itStrings->second << endl; - }else { - outNames << (itStrings->second).substr(0, pos) << '\t' << itStrings->second << endl; - } + if (itStrings != sequenceStrings.end()) { + //get rep name + int pos = (itStrings->second).find_first_of(','); + + if (pos == string::npos) { // only reps itself + outNames << itStrings->second << '\t' << itStrings->second << endl; + }else { + outNames << (itStrings->second).substr(0, pos) << '\t' << itStrings->second << endl; + } + }else{ m->mothurOut("[ERROR]: mismatch in namefile print."); m->mothurOutEndLine(); m->control_pressed = true; } } outNames.close(); diff --git a/distancecommand.cpp b/distancecommand.cpp index f7dff2e..beea1bf 100644 --- a/distancecommand.cpp +++ b/distancecommand.cpp @@ -317,7 +317,7 @@ int DistanceCommand::execute(){ //do your part string outputMyPart; - unsigned long int mySize; + unsigned long long mySize; if (output != "square"){ driverMPI(start, end, outputFile, mySize); } else { driverMPI(start, end, outputFile, mySize, output); } @@ -339,7 +339,7 @@ int DistanceCommand::execute(){ //wait on chidren for(int b = 1; b < processors; b++) { - unsigned long int fileSize; + unsigned long long fileSize; if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&outMPI); delete distCalculator; return 0; } @@ -367,7 +367,7 @@ int DistanceCommand::execute(){ MPI_File_close(&outMPI); }else { //you are a child process //do your part - unsigned long int size; + unsigned long long size; if (output != "square"){ driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size); } else { driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size, output); } @@ -387,7 +387,7 @@ int DistanceCommand::execute(){ else { driver(0, numSeqs, outputFile, "square"); } }else{ //you have multiple processors - unsigned long int numDists = 0; + unsigned long long numDists = 0; if (output == "square") { numDists = numSeqs * numSeqs; @@ -817,7 +817,7 @@ int DistanceCommand::driverMPI(int startLine, int endLine, MPI_File& outMPI, flo } /**************************************************************************************************/ /////// need to fix to work with calcs and sequencedb -int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned long int& size){ +int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned long long& size){ try { ValidCalculators validCalculator; Dist* distCalculator; @@ -913,7 +913,7 @@ int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned } /**************************************************************************************************/ /////// need to fix to work with calcs and sequencedb -int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned long int& size, string square){ +int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned long long& size, string square){ try { ValidCalculators validCalculator; Dist* distCalculator; diff --git a/distancecommand.h b/distancecommand.h index 9d5477d..66d8af1 100644 --- a/distancecommand.h +++ b/distancecommand.h @@ -213,8 +213,8 @@ private: #ifdef USE_MPI int driverMPI(int, int, MPI_File&, float); - int driverMPI(int, int, string, unsigned long int&); - int driverMPI(int, int, string, unsigned long int&, string); + int driverMPI(int, int, string, unsigned long long&); + int driverMPI(int, int, string, unsigned long long&, string); #endif //int convertMatrix(string); diff --git a/filterseqscommand.cpp b/filterseqscommand.cpp index 173227f..e96b7fd 100644 --- a/filterseqscommand.cpp +++ b/filterseqscommand.cpp @@ -341,7 +341,7 @@ int FilterSeqsCommand::filterSequences() { #ifdef USE_MPI int pid, numSeqsPerProcessor, num; int tag = 2001; - vectorMPIPos; + vectorMPIPos; MPI_Status status; MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running @@ -420,7 +420,7 @@ int FilterSeqsCommand::filterSequences() { MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case #else - vector positions = m->divideFile(fastafileNames[s], processors); + vector positions = m->divideFile(fastafileNames[s], processors); for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(new linePair(positions[i], positions[(i+1)])); @@ -462,7 +462,7 @@ int FilterSeqsCommand::filterSequences() { } #ifdef USE_MPI /**************************************************************************************/ -int FilterSeqsCommand::driverMPIRun(int start, int num, MPI_File& inMPI, MPI_File& outMPI, vector& MPIPos) { +int FilterSeqsCommand::driverMPIRun(int start, int num, MPI_File& inMPI, MPI_File& outMPI, vector& MPIPos) { try { string outputString = ""; int count = 0; @@ -568,7 +568,7 @@ int FilterSeqsCommand::driverRunFilter(string F, string outputFilename, string i } #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - unsigned long int pos = in.tellg(); + unsigned long long pos = in.tellg(); if ((pos == -1) || (pos >= filePos->end)) { break; } #else if (in.eof()) { break; } @@ -676,7 +676,7 @@ string FilterSeqsCommand::createFilter() { #ifdef USE_MPI int pid, numSeqsPerProcessor, num; int tag = 2001; - vector MPIPos; + vector MPIPos; MPI_Status status; MPI_File inMPI; @@ -736,7 +736,7 @@ string FilterSeqsCommand::createFilter() { MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case #else - vector positions = m->divideFile(fastafileNames[s], processors); + vector positions = m->divideFile(fastafileNames[s], processors); for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(new linePair(positions[i], positions[(i+1)])); @@ -877,7 +877,7 @@ int FilterSeqsCommand::driverCreateFilter(Filters& F, string filename, linePair* } #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - unsigned long int pos = in.tellg(); + unsigned long long pos = in.tellg(); if ((pos == -1) || (pos >= filePos->end)) { break; } #else if (in.eof()) { break; } @@ -899,7 +899,7 @@ int FilterSeqsCommand::driverCreateFilter(Filters& F, string filename, linePair* } #ifdef USE_MPI /**************************************************************************************/ -int FilterSeqsCommand::MPICreateFilter(int start, int num, Filters& F, MPI_File& inMPI, vector& MPIPos) { +int FilterSeqsCommand::MPICreateFilter(int start, int num, Filters& F, MPI_File& inMPI, vector& MPIPos) { try { MPI_Status status; diff --git a/filterseqscommand.h b/filterseqscommand.h index e445023..3bf36c0 100644 --- a/filterseqscommand.h +++ b/filterseqscommand.h @@ -33,9 +33,9 @@ public: private: struct linePair { - unsigned long int start; - unsigned long int end; - linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} + unsigned long long start; + unsigned long long end; + linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {} }; vector lines; @@ -59,8 +59,8 @@ private: int driverRunFilter(string, string, string, linePair*); int driverCreateFilter(Filters& F, string filename, linePair* line); #ifdef USE_MPI - int driverMPIRun(int, int, MPI_File&, MPI_File&, vector&); - int MPICreateFilter(int, int, Filters&, MPI_File&, vector&); + int driverMPIRun(int, int, MPI_File&, MPI_File&, vector&); + int MPICreateFilter(int, int, Filters&, MPI_File&, vector&); #endif }; diff --git a/flowdata.cpp b/flowdata.cpp index d1769bb..6d6c89b 100644 --- a/flowdata.cpp +++ b/flowdata.cpp @@ -42,14 +42,11 @@ FlowData::FlowData(int numFlows, float signal, float noise, int maxHomoP, string bool FlowData::getNext(ifstream& flowFile){ try { - - string lengthString; - string flowString; - - flowFile >> seqName >> endFlow; - for(int i=0;i> flowData[i]; } - - updateEndFlow(); + flowFile >> seqName >> endFlow; + //cout << "in Flowdata " + seqName << endl; + for(int i=0;i> flowData[i]; } + //cout << "in Flowdata read " << seqName + " done" << endl; + updateEndFlow(); translateFlow(); m->gobble(flowFile); diff --git a/homovacommand.cpp b/homovacommand.cpp index 22fd1bf..1869004 100644 --- a/homovacommand.cpp +++ b/homovacommand.cpp @@ -178,10 +178,16 @@ int HomovaCommand::execute(){ //link designMap to rows/columns in distance matrix map > origGroupSampleMap; for(int i=0;igetGroup(sampleNames[i])].push_back(i); + string group = designMap->getGroup(sampleNames[i]); + + if (group == "not found") { + m->mothurOut("[ERROR]: " + sampleNames[i] + " is not in your design file, please correct."); m->mothurOutEndLine(); m->control_pressed = true; + }else { origGroupSampleMap[group].push_back(i); } } int numGroups = origGroupSampleMap.size(); + if (m->control_pressed) { delete designMap; return 0; } + //create a new filename ofstream HOMOVAFile; string HOMOVAFileName = outputDir + m->getRootName(m->getSimpleName(phylipFileName)) + "homova"; diff --git a/makefile b/makefile index 0f7d098..3f64312 100644 --- a/makefile +++ b/makefile @@ -14,12 +14,12 @@ USEMPI ?= no USEREADLINE ?= yes CYGWIN_BUILD ?= no USECOMPRESSION ?= no -MOTHUR_FILES="\"Enter_your_default_path_here\"" +MOTHUR_FILES="\"/Users/Sarahswork/desktop/release\"" RELEASE_DATE = "\"7/25/2011\"" VERSION = "\"1.21.0\"" # Optimize to level 3: -CXXFLAGS += -O3 +CXXFLAGS += -03 -g ifeq ($(strip $(64BIT_VERSION)),yes) #if you are using centos uncomment the following lines diff --git a/mothurout.cpp b/mothurout.cpp index cdde507..c80bff2 100644 --- a/mothurout.cpp +++ b/mothurout.cpp @@ -1050,19 +1050,27 @@ string MothurOut::sortFile(string distFile, string outputDir){ } } /**************************************************************************************************/ -vector MothurOut::setFilePosFasta(string filename, int& num) { +vector MothurOut::setFilePosFasta(string filename, int& num) { try { - vector positions; + vector positions; ifstream inFASTA; - openInputFile(filename, inFASTA); + //openInputFile(filename, inFASTA); + inFASTA.open(filename.c_str(), ios::binary); string input; + unsigned long long count = 0; while(!inFASTA.eof()){ - input = getline(inFASTA); - if (input.length() != 0) { - if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } + //input = getline(inFASTA); + //cout << input << '\t' << inFASTA.tellg() << endl; + //if (input.length() != 0) { + // if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); cout << (pos - input.length() - 1) << endl; } + //} + //gobble(inFASTA); //has to be here since windows line endings are 2 characters and mess up the positions + char c = inFASTA.get(); count++; + if (c == '>') { + positions.push_back(count-1); + //cout << count << endl; } - gobble(inFASTA); //has to be here since windows line endings are 2 characters and mess up the positions } inFASTA.close(); @@ -1080,7 +1088,7 @@ vector MothurOut::setFilePosFasta(string filename, int& num) fclose (pFile); }*/ - unsigned long int size = positions[(positions.size()-1)]; + unsigned long long size = positions[(positions.size()-1)]; ifstream in; openInputFile(filename, in); @@ -1093,6 +1101,7 @@ vector MothurOut::setFilePosFasta(string filename, int& num) in.close(); positions.push_back(size); + positions[0] = 0; return positions; } @@ -1102,31 +1111,51 @@ vector MothurOut::setFilePosFasta(string filename, int& num) } } /**************************************************************************************************/ -vector MothurOut::setFilePosEachLine(string filename, int& num) { +vector MothurOut::setFilePosEachLine(string filename, int& num) { try { filename = getFullPathName(filename); - vector positions; + vector positions; ifstream in; - openInputFile(filename, in); - + //openInputFile(filename, in); + in.open(filename.c_str(), ios::binary); + string input; + unsigned long long count = 0; + positions.push_back(0); + while(!in.eof()){ - unsigned long int lastpos = in.tellg(); - input = getline(in); - if (input.length() != 0) { - unsigned long int pos = in.tellg(); - if (pos != -1) { positions.push_back(pos - input.length() - 1); } - else { positions.push_back(lastpos); } + //unsigned long long lastpos = in.tellg(); + //input = getline(in); + //if (input.length() != 0) { + //unsigned long long pos = in.tellg(); + //if (pos != -1) { positions.push_back(pos - input.length() - 1); } + //else { positions.push_back(lastpos); } + //} + //gobble(in); //has to be here since windows line endings are 2 characters and mess up the positions + + + //getline counting reads + char d = in.get(); count++; + while ((d != '\n') && (d != '\r') && (d != '\f') && (d != in.eof())) { + //get next character + d = in.get(); + count++; + } + + if (!in.eof()) { + d=in.get(); count++; + while(isspace(d) && (d != in.eof())) { d=in.get(); count++;} } - gobble(in); //has to be here since windows line endings are 2 characters and mess up the positions + positions.push_back(count-1); + cout << count-1 << endl; } in.close(); - num = positions.size(); + num = positions.size()-1; FILE * pFile; - unsigned long int size; + unsigned long long size; //get num bytes in file pFile = fopen (filename.c_str(),"rb"); @@ -1137,7 +1166,7 @@ vector MothurOut::setFilePosEachLine(string filename, int& nu fclose (pFile); } - positions.push_back(size); + positions[(positions.size()-1)] = size; return positions; } @@ -1148,14 +1177,14 @@ vector MothurOut::setFilePosEachLine(string filename, int& nu } /**************************************************************************************************/ -vector MothurOut::divideFile(string filename, int& proc) { +vector MothurOut::divideFile(string filename, int& proc) { try{ - vector filePos; + vector filePos; filePos.push_back(0); FILE * pFile; - unsigned long int size; + unsigned long long size; filename = getFullPathName(filename); @@ -1169,7 +1198,7 @@ vector MothurOut::divideFile(string filename, int& proc) { } //estimate file breaks - unsigned long int chunkSize = 0; + unsigned long long chunkSize = 0; chunkSize = size / proc; //file to small to divide by processors @@ -1177,21 +1206,21 @@ vector MothurOut::divideFile(string filename, int& proc) { //for each process seekg to closest file break and search for next '>' char. make that the filebreak for (int i = 0; i < proc; i++) { - unsigned long int spot = (i+1) * chunkSize; + unsigned long long spot = (i+1) * chunkSize; ifstream in; openInputFile(filename, in); in.seekg(spot); //look for next '>' - unsigned long int newSpot = spot; + unsigned long long newSpot = spot; while (!in.eof()) { char c = in.get(); if (c == '>') { in.putback(c); newSpot = in.tellg(); break; } } //there was not another sequence before the end of the file - unsigned long int sanityPos = in.tellg(); + unsigned long long sanityPos = in.tellg(); if (sanityPos == -1) { break; } else { filePos.push_back(newSpot); } @@ -1220,7 +1249,7 @@ vector MothurOut::divideFile(string filename, int& proc) { int MothurOut::divideFile(string filename, int& proc, vector& files) { try{ - vector filePos = divideFile(filename, proc); + vector filePos = divideFile(filename, proc); for (int i = 0; i < (filePos.size()-1); i++) { @@ -1228,7 +1257,7 @@ int MothurOut::divideFile(string filename, int& proc, vector& files) { ifstream in; openInputFile(filename, in); in.seekg(filePos[i]); - unsigned long int size = filePos[(i+1)] - filePos[i]; + unsigned long long size = filePos[(i+1)] - filePos[i]; char* chunk = new char[size]; in.read(chunk, size); in.close(); diff --git a/mothurout.h b/mothurout.h index 6f04f89..013aa54 100644 --- a/mothurout.h +++ b/mothurout.h @@ -57,10 +57,10 @@ class MothurOut { //functions from mothur.h //file operations - vector divideFile(string, int&); + vector divideFile(string, int&); int divideFile(string, int&, vector&); - vector setFilePosEachLine(string, int&); - vector setFilePosFasta(string, int&); + vector setFilePosEachLine(string, int&); + vector setFilePosFasta(string, int&); string sortFile(string, string); void appendFiles(string, string); int renameFile(string, string); //oldname, newname diff --git a/pairwiseseqscommand.cpp b/pairwiseseqscommand.cpp index ebbf686..2245ac9 100644 --- a/pairwiseseqscommand.cpp +++ b/pairwiseseqscommand.cpp @@ -385,7 +385,7 @@ int PairwiseSeqsCommand::execute(){ //do your part string outputMyPart; - unsigned long int mySize; + unsigned long long mySize; if (output != "square"){ driverMPI(start, end, outputFile, mySize); } else { driverMPI(start, end, outputFile, mySize, output); } @@ -403,7 +403,7 @@ int PairwiseSeqsCommand::execute(){ //wait on chidren for(int b = 1; b < processors; b++) { - unsigned long int fileSize; + unsigned long long fileSize; if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&outMPI); m->mothurRemove(outputFile); delete distCalculator; return 0; } @@ -431,7 +431,7 @@ int PairwiseSeqsCommand::execute(){ MPI_File_close(&outMPI); }else { //you are a child process //do your part - unsigned long int size; + unsigned long long size; if (output != "square"){ driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size); } else { driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size, output); } @@ -782,7 +782,7 @@ int PairwiseSeqsCommand::driverMPI(int startLine, int endLine, MPI_File& outMPI, } /**************************************************************************************************/ /////// need to fix to work with calcs and sequencedb -int PairwiseSeqsCommand::driverMPI(int startLine, int endLine, string file, unsigned long int& size){ +int PairwiseSeqsCommand::driverMPI(int startLine, int endLine, string file, unsigned long long& size){ try { MPI_Status status; @@ -858,7 +858,7 @@ int PairwiseSeqsCommand::driverMPI(int startLine, int endLine, string file, unsi } /**************************************************************************************************/ /////// need to fix to work with calcs and sequencedb -int PairwiseSeqsCommand::driverMPI(int startLine, int endLine, string file, unsigned long int& size, string square){ +int PairwiseSeqsCommand::driverMPI(int startLine, int endLine, string file, unsigned long long& size, string square){ try { MPI_Status status; diff --git a/pairwiseseqscommand.h b/pairwiseseqscommand.h index afb8e19..a3a91f7 100644 --- a/pairwiseseqscommand.h +++ b/pairwiseseqscommand.h @@ -54,8 +54,8 @@ private: #ifdef USE_MPI int driverMPI(int, int, MPI_File&, float); - int driverMPI(int, int, string, unsigned long int&); - int driverMPI(int, int, string, unsigned long int&, string); + int driverMPI(int, int, string, unsigned long long&); + int driverMPI(int, int, string, unsigned long long&, string); #endif string fastaFileName, align, calc, outputDir, output; diff --git a/phylotree.cpp b/phylotree.cpp index 891cca4..6ca3636 100644 --- a/phylotree.cpp +++ b/phylotree.cpp @@ -131,7 +131,7 @@ PhyloTree::PhyloTree(string tfile){ #ifdef USE_MPI int pid, num, processors; - vector positions; + vector positions; MPI_Status status; MPI_File inMPI; diff --git a/preclustercommand.cpp b/preclustercommand.cpp index 3f6e0c9..67b2f31 100644 --- a/preclustercommand.cpp +++ b/preclustercommand.cpp @@ -8,6 +8,8 @@ */ #include "preclustercommand.h" +#include "sequenceparser.h" +#include "deconvolutecommand.h" //********************************************************************************************************************** inline bool comparePriority(seqPNode first, seqPNode second) { return (first.numIdentical > second.numIdentical); } @@ -17,7 +19,9 @@ vector PreClusterCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter pdiffs("diffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pdiffs); + CommandParameter pbygroup("bygroup", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pbygroup); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -38,7 +42,9 @@ string PreClusterCommand::getHelpString(){ helpString += "The pre.cluster command outputs a new fasta and name file.\n"; helpString += "The pre.cluster command parameters are fasta, names and diffs. The fasta parameter is required. \n"; helpString += "The names parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n"; + helpString += "The group parameter allows you to provide a group file. \n"; helpString += "The diffs parameter allows you to specify maximum number of mismatched bases allowed between sequences in a grouping. The default is 1.\n"; + helpString += "The bygroup parameter allows you to indicate you whether you want to cluster sequences only within each group, default=T, when a groupfile is given. \n"; helpString += "The pre.cluster command should be in the following format: \n"; helpString += "pre.cluster(fasta=yourFastaFile, names=yourNamesFile, diffs=yourMaxDiffs) \n"; helpString += "Example pre.cluster(fasta=amazon.fasta, diffs=2).\n"; @@ -114,6 +120,14 @@ PreClusterCommand::PreClusterCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("group"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["group"] = inputDir + it->second; } + } } //check for required parameters @@ -137,10 +151,25 @@ PreClusterCommand::PreClusterCommand(string option) { namefile = validParameter.validFile(parameters, "name", true); if (namefile == "not found") { namefile = ""; } else if (namefile == "not open") { abort = true; } - else { readNameFile(); m->setNameFile(namefile); } + else { m->setNameFile(namefile); } + + groupfile = validParameter.validFile(parameters, "group", true); + if (groupfile == "not found") { groupfile = ""; } + else if (groupfile == "not open") { abort = true; groupfile = ""; } + else { m->setGroupFile(groupfile); } - string temp = validParameter.validFile(parameters, "diffs", false); if(temp == "not found"){ temp = "1"; } + string temp = validParameter.validFile(parameters, "diffs", false); if(temp == "not found"){ temp = "1"; } convert(temp, diffs); + + temp = validParameter.validFile(parameters, "bygroup", false); + if (temp == "not found") { + if (groupfile != "") { temp = "T"; } + else { temp = "F"; } + } + bygroup = m->isTrue(temp); + + if (bygroup && (groupfile == "")) { m->mothurOut("You cannot set bygroup=T, unless you provide a groupfile."); m->mothurOutEndLine(); abort=true; } + } } @@ -158,22 +187,133 @@ int PreClusterCommand::execute(){ int start = time(NULL); - //reads fasta file and return number of seqs - int numSeqs = readFASTA(); //fills alignSeqs and makes all seqs active + string fileroot = outputDir + m->getRootName(m->getSimpleName(fastafile)); + string newFastaFile = fileroot + "precluster" + m->getExtension(fastafile); + string newNamesFile = fileroot + "precluster.names"; - if (m->control_pressed) { return 0; } - - if (numSeqs == 0) { m->mothurOut("Error reading fasta file...please correct."); m->mothurOutEndLine(); return 0; } - if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); return 0; } + if (bygroup) { + //clear out old files + ofstream outFasta; m->openOutputFile(newFastaFile, outFasta); outFasta.close(); + ofstream outNames; m->openOutputFile(newNamesFile, outNames); outNames.close(); + + //parse fasta and name file by group + SequenceParser* parser; + if (namefile != "") { parser = new SequenceParser(groupfile, fastafile, namefile); } + else { parser = new SequenceParser(groupfile, fastafile); } + + vector groups = parser->getNamesOfGroups(); + + //precluster each group + for (int i = 0; i < groups.size(); i++) { + + start = time(NULL); + + if (m->control_pressed) { delete parser; m->mothurRemove(newFastaFile); m->mothurRemove(newNamesFile); return 0; } + + m->mothurOutEndLine(); m->mothurOut("Processing group " + groups[i] + ":"); m->mothurOutEndLine(); + + map thisNameMap; + if (namefile != "") { thisNameMap = parser->getNameMap(groups[i]); } + vector thisSeqs = parser->getSeqs(groups[i]); + + //fill alignSeqs with this groups info. + int numSeqs = loadSeqs(thisNameMap, thisSeqs); + + if (m->control_pressed) { delete parser; m->mothurRemove(newFastaFile); m->mothurRemove(newNamesFile); return 0; } + + if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); return 0; } + + int count = process(); + + if (m->control_pressed) { delete parser; m->mothurRemove(newFastaFile); m->mothurRemove(newNamesFile); return 0; } + + m->mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine(); + m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); + printData(newFastaFile, newNamesFile); + + m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); + + } + + //run unique.seqs for deconvolute results + string inputString = "fasta=" + newFastaFile; + if (namefile != "") { inputString += ", name=" + newNamesFile; } + m->mothurOutEndLine(); + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); + + Command* uniqueCommand = new DeconvoluteCommand(inputString); + uniqueCommand->execute(); + + map > filenames = uniqueCommand->getOutputFiles(); + + delete uniqueCommand; + + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + + newNamesFile = filenames["name"][0]; + newFastaFile = filenames["fasta"][0]; + + }else { + if (namefile != "") { readNameFile(); } + + //reads fasta file and return number of seqs + int numSeqs = readFASTA(); //fills alignSeqs and makes all seqs active - //clear sizes since you only needed this info to build the alignSeqs seqPNode structs -// sizes.clear(); + if (m->control_pressed) { return 0; } + if (numSeqs == 0) { m->mothurOut("Error reading fasta file...please correct."); m->mothurOutEndLine(); return 0; } + if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); return 0; } + + int count = process(); + + if (m->control_pressed) { return 0; } + + m->mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine(); + m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); + printData(newFastaFile, newNamesFile); + + m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); + } + + if (m->control_pressed) { m->mothurRemove(newFastaFile); m->mothurRemove(newNamesFile); return 0; } + + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + m->mothurOut(newFastaFile); m->mothurOutEndLine(); outputNames.push_back(newFastaFile); outputTypes["fasta"].push_back(newFastaFile); + m->mothurOut(newNamesFile); m->mothurOutEndLine(); outputNames.push_back(newNamesFile); outputTypes["name"].push_back(newNamesFile); + m->mothurOutEndLine(); + + //set fasta file as new current fastafile + string current = ""; + itTypes = outputTypes.find("fasta"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); } + } + + itTypes = outputTypes.find("name"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); } + } + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "PreClusterCommand", "execute"); + exit(1); + } +} +/**************************************************************************************************/ +int PreClusterCommand::process(){ + try { + //sort seqs by number of identical seqs sort(alignSeqs.begin(), alignSeqs.end(), comparePriority); - + int count = 0; - + int numSeqs = alignSeqs.size(); + //think about running through twice... for (int i = 0; i < numSeqs; i++) { @@ -195,70 +335,32 @@ int PreClusterCommand::execute(){ //merge alignSeqs[i].names += ',' + alignSeqs[j].names; alignSeqs[i].numIdentical += alignSeqs[j].numIdentical; - + alignSeqs[j].active = 0; alignSeqs[j].numIdentical = 0; count++; } }//end if j active }//end if i != j - - //remove from active list + + //remove from active list alignSeqs[i].active = 0; }//end if active i if(i % 100 == 0) { m->mothurOut(toString(i) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine(); } } - if(numSeqs % 100 != 0) { m->mothurOut(toString(numSeqs) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine(); } - - - string fileroot = outputDir + m->getRootName(m->getSimpleName(fastafile)); - - string newFastaFile = fileroot + "precluster" + m->getExtension(fastafile); - string newNamesFile = fileroot + "precluster.names"; - - if (m->control_pressed) { return 0; } - - m->mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine(); - m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); - printData(newFastaFile, newNamesFile); - - m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); - - if (m->control_pressed) { m->mothurRemove(newFastaFile); m->mothurRemove(newNamesFile); return 0; } - - m->mothurOutEndLine(); - m->mothurOut("Output File Names: "); m->mothurOutEndLine(); - m->mothurOut(newFastaFile); m->mothurOutEndLine(); outputNames.push_back(newFastaFile); outputTypes["fasta"].push_back(newFastaFile); - m->mothurOut(newNamesFile); m->mothurOutEndLine(); outputNames.push_back(newNamesFile); outputTypes["name"].push_back(newNamesFile); - m->mothurOutEndLine(); - - //set fasta file as new current fastafile - string current = ""; - itTypes = outputTypes.find("fasta"); - if (itTypes != outputTypes.end()) { - if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); } - } - - itTypes = outputTypes.find("name"); - if (itTypes != outputTypes.end()) { - if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); } - } + if(numSeqs % 100 != 0) { m->mothurOut(toString(numSeqs) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine(); } - return 0; + return count; } catch(exception& e) { - m->errorOut(e, "PreClusterCommand", "execute"); + m->errorOut(e, "PreClusterCommand", "process"); exit(1); } } - /**************************************************************************************************/ - -//this requires the names and fasta file to be in the same order - int PreClusterCommand::readFASTA(){ try { //ifstream inNames; @@ -313,7 +415,55 @@ int PreClusterCommand::readFASTA(){ exit(1); } } - +/**************************************************************************************************/ +int PreClusterCommand::loadSeqs(map& thisName, vector& thisSeqs){ + try { + length = 0; + alignSeqs.clear(); + map::iterator it; + bool error = false; + + for (int i = 0; i < thisSeqs.size(); i++) { + + if (m->control_pressed) { return 0; } + + if (namefile != "") { + it = thisName.find(thisSeqs[i].getName()); + + //should never be true since parser checks for this + if (it == thisName.end()) { m->mothurOut(thisSeqs[i].getName() + " is not in your names file, please correct."); m->mothurOutEndLine(); error = true; } + else{ + //get number of reps + int numReps = 0; + for(int j=0;j<(it->second).length();j++){ + if((it->second)[j] == ','){ numReps++; } + } + + seqPNode tempNode(numReps, thisSeqs[i], it->second); + alignSeqs.push_back(tempNode); + if (thisSeqs[i].getAligned().length() > length) { length = thisSeqs[i].getAligned().length(); } + } + }else { //no names file, you are identical to yourself + seqPNode tempNode(1, thisSeqs[i], thisSeqs[i].getName()); + alignSeqs.push_back(tempNode); + if (thisSeqs[i].getAligned().length() > length) { length = thisSeqs[i].getAligned().length(); } + } + } + + //sanity check + if (error) { m->control_pressed = true; } + + thisSeqs.clear(); + + return alignSeqs.size(); + } + + catch(exception& e) { + m->errorOut(e, "PreClusterCommand", "loadSeqs"); + exit(1); + } +} + /**************************************************************************************************/ int PreClusterCommand::calcMisMatches(string seq1, string seq2){ @@ -341,10 +491,14 @@ void PreClusterCommand::printData(string newfasta, string newname){ ofstream outFasta; ofstream outNames; - m->openOutputFile(newfasta, outFasta); - m->openOutputFile(newname, outNames); + if (bygroup) { + m->openOutputFileAppend(newfasta, outFasta); + m->openOutputFileAppend(newname, outNames); + }else { + m->openOutputFile(newfasta, outFasta); + m->openOutputFile(newname, outNames); + } - for (int i = 0; i < alignSeqs.size(); i++) { if (alignSeqs[i].numIdentical != 0) { alignSeqs[i].seq.printSequence(outFasta); diff --git a/preclustercommand.h b/preclustercommand.h index 3a12413..38bcd37 100644 --- a/preclustercommand.h +++ b/preclustercommand.h @@ -47,8 +47,8 @@ public: private: int diffs, length; - bool abort; - string fastafile, namefile, outputDir; + bool abort, bygroup; + string fastafile, namefile, outputDir, groupfile; vector alignSeqs; //maps the number of identical seqs to a sequence map names; //represents the names file first column maps to second column map sizes; //this map a seq name to the number of identical seqs in the names file @@ -62,6 +62,8 @@ private: //int readNamesFASTA(); int calcMisMatches(string, string); void printData(string, string); //fasta filename, names file name + int process(); + int loadSeqs(map&, vector&); }; /************************************************************/ diff --git a/rarefactcommand.cpp b/rarefactcommand.cpp index 8d883aa..bc27d6a 100644 --- a/rarefactcommand.cpp +++ b/rarefactcommand.cpp @@ -38,6 +38,7 @@ vector RareFactCommand::setParameters(){ CommandParameter pcalc("calc", "Multiple", "sobs-chao-nseqs-coverage-ace-jack-shannon-shannoneven-npshannon-heip-smithwilson-simpson-simpsoneven-invsimpson-bootstrap", "sobs", "", "", "",true,false); parameters.push_back(pcalc); CommandParameter pabund("abund", "Number", "", "10", "", "", "",false,false); parameters.push_back(pabund); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); + CommandParameter pgroupmode("groupmode", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pgroupmode); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -63,6 +64,7 @@ string RareFactCommand::getHelpString(){ helpString += "Example rarefaction.single(label=unique-.01-.03, iters=10000, freq=10, calc=sobs-rchao-race-rjack-rbootstrap-rshannon-rnpshannon-rsimpson).\n"; helpString += "The default values for iters is 1000, freq is 100, and calc is rarefaction which calculates the rarefaction curve for the observed richness.\n"; validCalculator.printCalc("rarefaction"); + helpString += "If you are running rarefaction.single with a shared file and would like your results collated in one file, set groupmode=t. (Default=true).\n"; helpString += "The label parameter is used to analyze specific labels in your input.\n"; helpString += "Note: No spaces between parameter labels (i.e. freq), '=' and parameters (i.e.yourFreq).\n"; return helpString; @@ -262,6 +264,9 @@ RareFactCommand::RareFactCommand(string option) { temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); convert(temp, processors); + + temp = validParameter.validFile(parameters, "groupmode", false); if (temp == "not found") { temp = "T"; } + groupMode = m->isTrue(temp); } } @@ -440,6 +445,11 @@ int RareFactCommand::execute(){ } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + + //create summary file containing all the groups data for each label - this function just combines the info from the files already created. + if ((sharedfile != "") && (groupMode)) { outputNames = createGroupFile(outputNames); } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } m->mothurOutEndLine(); @@ -455,6 +465,114 @@ int RareFactCommand::execute(){ } } //********************************************************************************************************************** +vector RareFactCommand::createGroupFile(vector& outputNames) { + try { + + vector newFileNames; + + //find different types of files + map > typesFiles; + for (int i = 0; i < outputNames.size(); i++) { + string extension = m->getExtension(outputNames[i]); + + ifstream in; + m->openInputFile(outputNames[i], in); + + string labels = m->getline(in); + string newLine = labels.substr(0, labels.find_first_of('\t')); + + newLine += "\tGroup" + labels.substr(labels.find_first_of('\t')); + + typesFiles[extension].push_back(outputNames[i]); + + string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + extension; + + //print headers + ofstream out; + m->openOutputFile(combineFileName, out); + out << newLine << endl; + out.close(); + + } + + //for each type create a combo file + for (map >::iterator it = typesFiles.begin(); it != typesFiles.end(); it++) { + + ofstream out; + string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + it->first; + m->openOutputFileAppend(combineFileName, out); + newFileNames.push_back(combineFileName); + + vector thisTypesFiles = it->second; + + //open each type summary file + map > files; //maps file name to lines in file + int maxLines = 0; + for (int i=0; iopenInputFile(thisTypesFiles[i], temp); + + //read through first line - labels + m->getline(temp); m->gobble(temp); + + vector thisFilesLines; + string group = m->getRootName(thisTypesFiles[i]); + group = group.substr(0, group.length()-1); + group = group.substr(group.find_last_of('.')+1); + + thisFilesLines.push_back(group); + while (!temp.eof()){ + + string thisLine = m->getline(temp); + + thisFilesLines.push_back(thisLine); + + m->gobble(temp); + } + + files[thisTypesFiles[i]] = thisFilesLines; + + //save longest file for below + if (maxLines < thisFilesLines.size()) { maxLines = thisFilesLines.size(); } + + temp.close(); + m->mothurRemove(thisTypesFiles[i]); + } + + + //for each label + for (int k = 1; k < maxLines; k++) { + + //grab data for each group + for (int i=0; ierrorOut(e, "RareFactCommand", "createGroupFile"); + exit(1); + } +} +//********************************************************************************************************************** vector RareFactCommand::parseSharedFile(string filename) { try { vector filenames; diff --git a/rarefactcommand.h b/rarefactcommand.h index b4bce3e..3628892 100644 --- a/rarefactcommand.h +++ b/rarefactcommand.h @@ -42,7 +42,7 @@ private: int nIters, abund, processors; float freq; - bool abort, allLines; + bool abort, allLines, groupMode; set labels; //holds labels to be used string label, calc, sharedfile, listfile, rabundfile, sabundfile, format, inputfile; vector Estimators; @@ -51,6 +51,7 @@ private: string outputDir; vector parseSharedFile(string); + vector createGroupFile(vector&); }; #endif diff --git a/screenseqscommand.cpp b/screenseqscommand.cpp index f751525..d5f2a6c 100644 --- a/screenseqscommand.cpp +++ b/screenseqscommand.cpp @@ -261,7 +261,7 @@ int ScreenSeqsCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } //if the user want to optimize we need to know the 90% mark - vector positions; + vector positions; if (optimize.size() != 0) { //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here //use the namefile to optimize correctly if (namefile != "") { nameMap = m->readNames(namefile); } @@ -284,7 +284,7 @@ int ScreenSeqsCommand::execute(){ #ifdef USE_MPI int pid, numSeqsPerProcessor; int tag = 2001; - vector MPIPos; + vector MPIPos; MPI_Status status; MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are @@ -629,7 +629,7 @@ int ScreenSeqsCommand::screenNameGroupFile(set badSeqNames){ } } //*************************************************************************************************************** -int ScreenSeqsCommand::getSummary(vector& positions){ +int ScreenSeqsCommand::getSummary(vector& positions){ try { vector startPosition; @@ -638,7 +638,7 @@ int ScreenSeqsCommand::getSummary(vector& positions){ vector ambigBases; vector longHomoPolymer; - vector positions = m->divideFile(fastafile, processors); + vector positions = m->divideFile(fastafile, processors); for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(new linePair(positions[i], positions[(i+1)])); @@ -724,7 +724,7 @@ int ScreenSeqsCommand::driverCreateSummary(vector& startPosition, vector= filePos->end)) { break; } #else if (in.eof()) { break; } @@ -1050,7 +1050,7 @@ int ScreenSeqsCommand::driver(linePair* filePos, string goodFName, string badAcc } #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - unsigned long int pos = inFASTA.tellg(); + unsigned long long pos = inFASTA.tellg(); if ((pos == -1) || (pos >= filePos->end)) { break; } #else if (inFASTA.eof()) { break; } @@ -1076,7 +1076,7 @@ int ScreenSeqsCommand::driver(linePair* filePos, string goodFName, string badAcc } //********************************************************************************************************************** #ifdef USE_MPI -int ScreenSeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& goodFile, MPI_File& badAccnosFile, vector& MPIPos, set& badSeqNames){ +int ScreenSeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& goodFile, MPI_File& badAccnosFile, vector& MPIPos, set& badSeqNames){ try { string outputString = ""; MPI_Status statusGood; diff --git a/screenseqscommand.h b/screenseqscommand.h index 3642295..87c8bee 100644 --- a/screenseqscommand.h +++ b/screenseqscommand.h @@ -33,9 +33,9 @@ public: private: struct linePair { - unsigned long int start; - unsigned long int end; - linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} + unsigned long long start; + unsigned long long end; + linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {} }; vector processIDS; //processid @@ -50,7 +50,7 @@ private: int createProcesses(string, string, string, set&); #ifdef USE_MPI - int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector&, set&); + int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector&, set&); #endif bool abort; @@ -61,7 +61,7 @@ private: map nameMap; int readNames(); - int getSummary(vector&); + int getSummary(vector&); int createProcessesCreateSummary(vector&, vector&, vector&, vector&, vector&, string); int driverCreateSummary(vector&, vector&, vector&, vector&, vector&, string, linePair*); }; diff --git a/seqerrorcommand.cpp b/seqerrorcommand.cpp index 5c80047..d3f6282 100644 --- a/seqerrorcommand.cpp +++ b/seqerrorcommand.cpp @@ -273,9 +273,9 @@ int SeqErrorCommand::execute(){ if(namesFileName != ""){ weights = getWeights(); } - vector fastaFilePos; - vector qFilePos; - vector reportFilePos; + vector fastaFilePos; + vector qFilePos; + vector reportFilePos; setLines(queryFileName, qualFileName, reportFileName, fastaFilePos, qFilePos, reportFilePos); @@ -731,7 +731,7 @@ int SeqErrorCommand::driver(string filename, string qFileName, string rFileName, index++; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - unsigned long int pos = queryFile.tellg(); + unsigned long long pos = queryFile.tellg(); if ((pos == -1) || (pos >= line.end)) { break; } #else if (queryFile.eof()) { break; } @@ -1204,7 +1204,7 @@ void SeqErrorCommand::printQualityFR(vector > qualForwardMap, vector } /**************************************************************************************************/ -int SeqErrorCommand::setLines(string filename, string qfilename, string rfilename, vector& fastaFilePos, vector& qfileFilePos, vector& rfileFilePos) { +int SeqErrorCommand::setLines(string filename, string qfilename, string rfilename, vector& fastaFilePos, vector& qfileFilePos, vector& rfileFilePos) { try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) //set file positions for fasta file @@ -1246,7 +1246,7 @@ int SeqErrorCommand::setLines(string filename, string qfilename, string rfilenam map::iterator it = firstSeqNames.find(sname); if(it != firstSeqNames.end()) { //this is the start of a new chunk - unsigned long int pos = inQual.tellg(); + unsigned long long pos = inQual.tellg(); qfileFilePos.push_back(pos - input.length() - 1); firstSeqNames.erase(it); } @@ -1267,7 +1267,7 @@ int SeqErrorCommand::setLines(string filename, string qfilename, string rfilenam //get last file position of qfile FILE * pFile; - unsigned long int size; + unsigned long long size; //get num bytes in file pFile = fopen (qfilename.c_str(),"rb"); @@ -1305,7 +1305,7 @@ int SeqErrorCommand::setLines(string filename, string qfilename, string rfilenam map::iterator it = firstSeqNamesReport.find(sname); if(it != firstSeqNamesReport.end()) { //this is the start of a new chunk - unsigned long int pos = inR.tellg(); + unsigned long long pos = inR.tellg(); rfileFilePos.push_back(pos - input.length() - 1); firstSeqNamesReport.erase(it); } @@ -1326,7 +1326,7 @@ int SeqErrorCommand::setLines(string filename, string qfilename, string rfilenam //get last file position of qfile FILE * rFile; - unsigned long int sizeR; + unsigned long long sizeR; //get num bytes in file rFile = fopen (rfilename.c_str(),"rb"); @@ -1346,7 +1346,7 @@ int SeqErrorCommand::setLines(string filename, string qfilename, string rfilenam fastaFilePos.push_back(0); qfileFilePos.push_back(0); //get last file position of fastafile FILE * pFile; - unsigned long int size; + unsigned long long size; //get num bytes in file pFile = fopen (filename.c_str(),"rb"); diff --git a/seqerrorcommand.h b/seqerrorcommand.h index 590c554..cc904ec 100644 --- a/seqerrorcommand.h +++ b/seqerrorcommand.h @@ -62,9 +62,9 @@ private: ReferenceDB* rdb; struct linePair { - unsigned long int start; - unsigned long int end; - linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} + unsigned long long start; + unsigned long long end; + linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {} }; vector processIDS; //processid @@ -82,7 +82,7 @@ private: void printErrorQuality(map >); void printQualityFR(vector >, vector >); - int setLines(string, string, string, vector&, vector&, vector&); + int setLines(string, string, string, vector&, vector&, vector&); int driver(string, string, string, string, string, string, linePair, linePair, linePair); int createProcesses(string, string, string, string, string, string); diff --git a/seqsummarycommand.cpp b/seqsummarycommand.cpp index ee88353..be75e4f 100644 --- a/seqsummarycommand.cpp +++ b/seqsummarycommand.cpp @@ -170,7 +170,7 @@ int SeqSummaryCommand::execute(){ int tag = 2001; int startTag = 1; int endTag = 2; int lengthTag = 3; int baseTag = 4; int lhomoTag = 5; int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; - vector MPIPos; + vector MPIPos; MPI_Status status; MPI_Status statusOut; @@ -281,7 +281,7 @@ int SeqSummaryCommand::execute(){ MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case #else - vector positions; + vector positions; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) positions = m->divideFile(fastafile, processors); for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(new linePair(positions[i], positions[(i+1)])); } @@ -332,14 +332,14 @@ int SeqSummaryCommand::execute(){ if (m->control_pressed) { m->mothurRemove(summaryFile); return 0; } m->mothurOutEndLine(); - m->mothurOut("\t\tStart\tEnd\tNBases\tAmbigs\tPolymer"); m->mothurOutEndLine(); - m->mothurOut("Minimum:\t" + toString(startPosition[0]) + "\t" + toString(endPosition[0]) + "\t" + toString(seqLength[0]) + "\t" + toString(ambigBases[0]) + "\t" + toString(longHomoPolymer[0])); m->mothurOutEndLine(); - m->mothurOut("2.5%-tile:\t" + toString(startPosition[ptile0_25]) + "\t" + toString(endPosition[ptile0_25]) + "\t" + toString(seqLength[ptile0_25]) + "\t" + toString(ambigBases[ptile0_25]) + "\t"+ toString(longHomoPolymer[ptile0_25])); m->mothurOutEndLine(); - m->mothurOut("25%-tile:\t" + toString(startPosition[ptile25]) + "\t" + toString(endPosition[ptile25]) + "\t" + toString(seqLength[ptile25]) + "\t" + toString(ambigBases[ptile25]) + "\t" + toString(longHomoPolymer[ptile25])); m->mothurOutEndLine(); - m->mothurOut("Median: \t" + toString(startPosition[ptile50]) + "\t" + toString(endPosition[ptile50]) + "\t" + toString(seqLength[ptile50]) + "\t" + toString(ambigBases[ptile50]) + "\t" + toString(longHomoPolymer[ptile50])); m->mothurOutEndLine(); - m->mothurOut("75%-tile:\t" + toString(startPosition[ptile75]) + "\t" + toString(endPosition[ptile75]) + "\t" + toString(seqLength[ptile75]) + "\t" + toString(ambigBases[ptile75]) + "\t" + toString(longHomoPolymer[ptile75])); m->mothurOutEndLine(); - m->mothurOut("97.5%-tile:\t" + toString(startPosition[ptile97_5]) + "\t" + toString(endPosition[ptile97_5]) + "\t" + toString(seqLength[ptile97_5]) + "\t" + toString(ambigBases[ptile97_5]) + "\t" + toString(longHomoPolymer[ptile97_5])); m->mothurOutEndLine(); - m->mothurOut("Maximum:\t" + toString(startPosition[ptile100]) + "\t" + toString(endPosition[ptile100]) + "\t" + toString(seqLength[ptile100]) + "\t" + toString(ambigBases[ptile100]) + "\t" + toString(longHomoPolymer[ptile100])); m->mothurOutEndLine(); + m->mothurOut("\t\tStart\tEnd\tNBases\tAmbigs\tPolymer\tNumSeqs"); m->mothurOutEndLine(); + m->mothurOut("Minimum:\t" + toString(startPosition[0]) + "\t" + toString(endPosition[0]) + "\t" + toString(seqLength[0]) + "\t" + toString(ambigBases[0]) + "\t" + toString(longHomoPolymer[0]) + "\t" + toString(1)); m->mothurOutEndLine(); + m->mothurOut("2.5%-tile:\t" + toString(startPosition[ptile0_25]) + "\t" + toString(endPosition[ptile0_25]) + "\t" + toString(seqLength[ptile0_25]) + "\t" + toString(ambigBases[ptile0_25]) + "\t"+ toString(longHomoPolymer[ptile0_25]) + "\t" + toString(ptile0_25+1)); m->mothurOutEndLine(); + m->mothurOut("25%-tile:\t" + toString(startPosition[ptile25]) + "\t" + toString(endPosition[ptile25]) + "\t" + toString(seqLength[ptile25]) + "\t" + toString(ambigBases[ptile25]) + "\t" + toString(longHomoPolymer[ptile25]) + "\t" + toString(ptile25+1)); m->mothurOutEndLine(); + m->mothurOut("Median: \t" + toString(startPosition[ptile50]) + "\t" + toString(endPosition[ptile50]) + "\t" + toString(seqLength[ptile50]) + "\t" + toString(ambigBases[ptile50]) + "\t" + toString(longHomoPolymer[ptile50]) + "\t" + toString(ptile50+1)); m->mothurOutEndLine(); + m->mothurOut("75%-tile:\t" + toString(startPosition[ptile75]) + "\t" + toString(endPosition[ptile75]) + "\t" + toString(seqLength[ptile75]) + "\t" + toString(ambigBases[ptile75]) + "\t" + toString(longHomoPolymer[ptile75]) + "\t" + toString(ptile75+1)); m->mothurOutEndLine(); + m->mothurOut("97.5%-tile:\t" + toString(startPosition[ptile97_5]) + "\t" + toString(endPosition[ptile97_5]) + "\t" + toString(seqLength[ptile97_5]) + "\t" + toString(ambigBases[ptile97_5]) + "\t" + toString(longHomoPolymer[ptile97_5]) + "\t" + toString(ptile97_5+1)); m->mothurOutEndLine(); + m->mothurOut("Maximum:\t" + toString(startPosition[ptile100]) + "\t" + toString(endPosition[ptile100]) + "\t" + toString(seqLength[ptile100]) + "\t" + toString(ambigBases[ptile100]) + "\t" + toString(longHomoPolymer[ptile100]) + "\t" + toString(ptile100+1)); m->mothurOutEndLine(); if (namefile == "") { m->mothurOut("# of Seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); } else { m->mothurOut("# of unique seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); m->mothurOut("total # of seqs:\t" + toString(startPosition.size())); m->mothurOutEndLine(); } @@ -415,7 +415,7 @@ int SeqSummaryCommand::driverCreateSummary(vector& startPosition, vector= filePos->end)) { break; } #else if (in.eof()) { break; } @@ -438,7 +438,7 @@ int SeqSummaryCommand::driverCreateSummary(vector& startPosition, vector& startPosition, vector& endPosition, vector& seqLength, vector& ambigBases, vector& longHomoPolymer, MPI_File& inMPI, MPI_File& outMPI, vector& MPIPos) { +int SeqSummaryCommand::MPICreateSummary(int start, int num, vector& startPosition, vector& endPosition, vector& seqLength, vector& ambigBases, vector& longHomoPolymer, MPI_File& inMPI, MPI_File& outMPI, vector& MPIPos) { try { int pid; diff --git a/seqsummarycommand.h b/seqsummarycommand.h index a7ccb57..32947ca 100644 --- a/seqsummarycommand.h +++ b/seqsummarycommand.h @@ -39,9 +39,9 @@ private: map nameMap; struct linePair { - unsigned long int start; - unsigned long int end; - linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} + unsigned long long start; + unsigned long long end; + linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {} }; vector lines; @@ -51,7 +51,7 @@ private: int driverCreateSummary(vector&, vector&, vector&, vector&, vector&, string, string, linePair*); #ifdef USE_MPI - int MPICreateSummary(int, int, vector&, vector&, vector&, vector&, vector&, MPI_File&, MPI_File&, vector&); + int MPICreateSummary(int, int, vector&, vector&, vector&, vector&, vector&, MPI_File&, MPI_File&, vector&); #endif @@ -69,8 +69,8 @@ typedef struct seqSumData { vector* longHomoPolymer; string filename; string sumFile; - unsigned long int start; - unsigned long int end; + unsigned long long start; + unsigned long long end; int count; MothurOut* m; string namefile; @@ -78,7 +78,7 @@ typedef struct seqSumData { seqSumData(){} - seqSumData(vector* s, vector* e, vector* l, vector* a, vector* h, string f, string sf, MothurOut* mout, unsigned long int st, unsigned long int en, string na, map nam) { + seqSumData(vector* s, vector* e, vector* l, vector* a, vector* h, string f, string sf, MothurOut* mout, unsigned long long st, unsigned long long en, string na, map nam) { startPosition = s; endPosition = e; seqLength = l; diff --git a/sequenceparser.cpp b/sequenceparser.cpp new file mode 100644 index 0000000..44012d8 --- /dev/null +++ b/sequenceparser.cpp @@ -0,0 +1,273 @@ +/* + * sequenceParser.cpp + * Mothur + * + * Created by westcott on 9/9/11. + * Copyright 2011 Schloss Lab. All rights reserved. + * + */ + +#include "sequenceParser.h" + + +/************************************************************/ +SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFile) { + try { + + m = MothurOut::getInstance(); + int error; + + //read group file + groupMap = new GroupMap(groupFile); + error = groupMap->readMap(); + + if (error == 1) { m->control_pressed = true; } + + //initialize maps + vector namesOfGroups = groupMap->getNamesOfGroups(); + for (int i = 0; i < namesOfGroups.size(); i++) { + vector temp; + map tempMap; + seqs[namesOfGroups[i]] = temp; + nameMapPerGroup[namesOfGroups[i]] = tempMap; + } + + //read fasta file making sure each sequence is in the group file + ifstream in; + m->openInputFile(fastaFile, in); + + map seqName; //stores name -> sequence string so we can make new "unique" sequences when we parse the name file + while (!in.eof()) { + + if (m->control_pressed) { break; } + + Sequence seq(in); m->gobble(in); + + if (seq.getName() != "") { + + string group = groupMap->getGroup(seq.getName()); + if (group == "not found") { error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine(); } + else { + seqs[group].push_back(seq); + seqName[seq.getName()] = seq.getAligned(); + } + } + } + in.close(); + + if (error == 1) { m->control_pressed = true; } + + //read name file + ifstream inName; + m->openInputFile(nameFile, inName); + + string first, second; + int countName = 0; + while(!inName.eof()) { + + if (m->control_pressed) { break; } + + inName >> first; m->gobble(inName); + inName >> second; m->gobble(inName); + + vector names; + m->splitAtChar(second, names, ','); + + //get aligned string for these seqs from the fasta file + string alignedString = ""; + map::iterator itAligned = seqName.find(names[0]); + if (itAligned == seqName.end()) { + error = 1; m->mothurOut("[ERROR]: " + names[0] + " is in your name file and not in your fasta file, please correct."); m->mothurOutEndLine(); + }else { + alignedString = itAligned->second; + } + + //separate by group - parse one line in name file + map splitMap; //group -> name1,name2,... + map::iterator it; + for (int i = 0; i < names.size(); i++) { + + string group = groupMap->getGroup(names[i]); + if (group == "not found") { error = 1; m->mothurOut("[ERROR]: " + names[i] + " is in your name file and not in your groupfile, please correct."); m->mothurOutEndLine(); } + else { + + it = splitMap.find(group); + if (it != splitMap.end()) { //adding seqs to this group + (it->second) += "," + names[i]; + countName++; + }else { //first sighting of this group + splitMap[group] = names[i]; + countName++; + + //is this seq in the fasta file? + if (i != 0) { //if not then we need to add a duplicate sequence to the seqs for this group so the new "fasta" and "name" files will match + Sequence tempSeq(names[i], alignedString); //get the first guys sequence string since he's in the fasta file. + seqs[group].push_back(tempSeq); + } + } + } + } + + + //fill nameMapPerGroup - holds all lines in namefile separated by group + for (it = splitMap.begin(); it != splitMap.end(); it++) { + //grab first name + string firstName = ""; + for(int i = 0; i < (it->second).length(); i++) { + if (((it->second)[i]) != ',') { + firstName += ((it->second)[i]); + }else { break; } + } + + //group1 -> seq1 -> seq1,seq2,seq3 + nameMapPerGroup[it->first][firstName] = it->second; + } + } + + inName.close(); + + if (error == 1) { m->control_pressed = true; } + + if (countName != (groupMap->getNumSeqs())) { + m->mothurOutEndLine(); + m->mothurOut("[ERROR]: Your name file contains " + toString(countName) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct."); + m->mothurOutEndLine(); + m->control_pressed = true; + } + + } + catch(exception& e) { + m->errorOut(e, "SequenceParser", "SequenceParser"); + exit(1); + } +} +/************************************************************/ +SequenceParser::SequenceParser(string groupFile, string fastaFile) { + try { + + m = MothurOut::getInstance(); + int error; + + //read group file + groupMap = new GroupMap(groupFile); + error = groupMap->readMap(); + + if (error == 1) { m->control_pressed = true; } + + //initialize maps + vector namesOfGroups = groupMap->getNamesOfGroups(); + for (int i = 0; i < namesOfGroups.size(); i++) { + vector temp; + seqs[namesOfGroups[i]] = temp; + } + + //read fasta file making sure each sequence is in the group file + ifstream in; + m->openInputFile(fastaFile, in); + int count = 0; + + while (!in.eof()) { + + if (m->control_pressed) { break; } + + Sequence seq(in); m->gobble(in); + + if (seq.getName() != "") { + + string group = groupMap->getGroup(seq.getName()); + if (group == "not found") { error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine(); } + else { seqs[group].push_back(seq); count++; } + } + } + in.close(); + + if (error == 1) { m->control_pressed = true; } + + if (count != (groupMap->getNumSeqs())) { + m->mothurOutEndLine(); + m->mothurOut("[ERROR]: Your fasta file contains " + toString(count) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct."); + if (count < (groupMap->getNumSeqs())) { m->mothurOut(" Did you forget to include the name file?"); } + m->mothurOutEndLine(); + m->control_pressed = true; + } + + } + catch(exception& e) { + m->errorOut(e, "SequenceParser", "SequenceParser"); + exit(1); + } +} +/************************************************************/ +SequenceParser::~SequenceParser(){ delete groupMap; } +/************************************************************/ +int SequenceParser::getNumGroups(){ return groupMap->getNumGroups(); } +/************************************************************/ +vector SequenceParser::getNamesOfGroups(){ return groupMap->getNamesOfGroups(); } +/************************************************************/ +bool SequenceParser::isValidGroup(string g){ return groupMap->isValidGroup(g); } +/************************************************************/ +string SequenceParser::getGroup(string g){ return groupMap->getGroup(g); } +/************************************************************/ +int SequenceParser::getNumSeqs(string g){ + try { + map >::iterator it; + int num = 0; + + it = seqs.find(g); + if(it == seqs.end()) { + m->mothurOut("[ERROR]: " + g + " is not a valid group, please correct."); m->mothurOutEndLine(); + }else { + num = (it->second).size(); + } + + return num; + } + catch(exception& e) { + m->errorOut(e, "SequenceParser", "getNumSeqs"); + exit(1); + } +} +/************************************************************/ +vector SequenceParser::getSeqs(string g){ + try { + map >::iterator it; + vector seqForThisGroup; + + it = seqs.find(g); + if(it == seqs.end()) { + m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine(); + }else { + seqForThisGroup = it->second; + } + + return seqForThisGroup; + } + catch(exception& e) { + m->errorOut(e, "SequenceParser", "getSeqs"); + exit(1); + } +} +/************************************************************/ +map SequenceParser::getNameMap(string g){ + try { + map >::iterator it; + map nameMapForThisGroup; + + it = nameMapPerGroup.find(g); + if(it == nameMapPerGroup.end()) { + m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine(); + }else { + nameMapForThisGroup = it->second; + } + + return nameMapForThisGroup; + } + catch(exception& e) { + m->errorOut(e, "SequenceParser", "getNameMap"); + exit(1); + } +} +/************************************************************/ + + + diff --git a/sequenceparser.h b/sequenceparser.h new file mode 100644 index 0000000..fa838f0 --- /dev/null +++ b/sequenceparser.h @@ -0,0 +1,56 @@ +#ifndef SEQUENCEPARSER_H +#define SEQUENCEPARSER_H + +/* + * sequenceParser.h + * Mothur + * + * Created by westcott on 9/9/11. + * Copyright 2011 Schloss Lab. All rights reserved. + * + */ + + +#include "mothur.h" +#include "mothurout.h" +#include "sequence.hpp" +#include "groupmap.h" + +/* This class reads a fasta and group file with a namesfile as optional and parses the data by group. + + Note: The sum of all the groups unique sequences will be larger than the original number of unique sequences. + This is because when we parse the name file we make a unique for each group instead of 1 unique for all + groups. + + */ + +class SequenceParser { + + public: + + SequenceParser(string, string); //group, fasta - file mismatches will set m->control_pressed = true + SequenceParser(string, string, string); //group, fasta, name - file mismatches will set m->control_pressed = true + ~SequenceParser(); + + //general operations + int getNumGroups(); + vector getNamesOfGroups(); + bool isValidGroup(string); //return true if string is a valid group + string getGroup(string); //returns group of a specific sequence + + int getNumSeqs(string); //returns the number of unique sequences in a specific group + vector getSeqs(string); //returns unique sequences in a specific group + map getNameMap(string); //returns seqName -> namesOfRedundantSeqs separated by commas for a specific group - the name file format, but each line is parsed by group. + + private: + + GroupMap* groupMap; + MothurOut* m; + + int numSeqs; + map > seqs; //a vector for each group + map > nameMapPerGroup; //nameMap for each group +}; + +#endif + diff --git a/sffinfocommand.cpp b/sffinfocommand.cpp index 341dd18..4965cfd 100644 --- a/sffinfocommand.cpp +++ b/sffinfocommand.cpp @@ -466,7 +466,7 @@ int SffInfoCommand::readCommonHeader(ifstream& in, CommonHeader& header){ //read offset char buffer2 [8]; in.read(buffer2, 8); - header.indexOffset = be_int8(*(unsigned long int *)(&buffer2)); + header.indexOffset = be_int8(*(unsigned long long *)(&buffer2)); //read index length char buffer3 [4]; @@ -513,8 +513,8 @@ int SffInfoCommand::readCommonHeader(ifstream& in, CommonHeader& header){ delete[] tempBuffer2; /* Pad to 8 chars */ - unsigned long int spotInFile = in.tellg(); - unsigned long int spot = (spotInFile + 7)& ~7; // ~ inverts + unsigned long long spotInFile = in.tellg(); + unsigned long long spot = (spotInFile + 7)& ~7; // ~ inverts in.seekg(spot); }else{ @@ -581,8 +581,8 @@ int SffInfoCommand::readHeader(ifstream& in, Header& header){ decodeName(header.timestamp, header.region, header.xy, header.name); /* Pad to 8 chars */ - unsigned long int spotInFile = in.tellg(); - unsigned long int spot = (spotInFile + 7)& ~7; + unsigned long long spotInFile = in.tellg(); + unsigned long long spot = (spotInFile + 7)& ~7; in.seekg(spot); }else{ @@ -634,8 +634,8 @@ int SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, i } /* Pad to 8 chars */ - unsigned long int spotInFile = in.tellg(); - unsigned long int spot = (spotInFile + 7)& ~7; + unsigned long long spotInFile = in.tellg(); + unsigned long long spot = (spotInFile + 7)& ~7; in.seekg(spot); }else{ diff --git a/sffinfocommand.h b/sffinfocommand.h index f18a3f5..903a589 100644 --- a/sffinfocommand.h +++ b/sffinfocommand.h @@ -17,7 +17,7 @@ struct CommonHeader { unsigned int magicNumber; string version; - unsigned long int indexOffset; + unsigned long long indexOffset; unsigned int indexLength; unsigned int numReads; unsigned short headerLength; diff --git a/sharedrabundvector.cpp b/sharedrabundvector.cpp index 56b7d09..51e277a 100644 --- a/sharedrabundvector.cpp +++ b/sharedrabundvector.cpp @@ -395,7 +395,7 @@ int SharedRAbundVector::getGroupIndex() { return index; } void SharedRAbundVector::setGroupIndex(int vIndex) { index = vIndex; } /***********************************************************************/ int SharedRAbundVector::getNumBins(){ - return numBins; + return numBins; } /***********************************************************************/ @@ -423,6 +423,7 @@ vector SharedRAbundVector::getSharedRAbundVectors(){ vector Groups = m->getGroups(); vector allGroups = m->getAllGroups(); util->setGroups(Groups, allGroups); + m->setGroups(Groups); bool remove = false; for (int i = 0; i < lookup.size(); i++) { diff --git a/shhhercommand.cpp b/shhhercommand.cpp index 5017f88..a40d774 100644 --- a/shhhercommand.cpp +++ b/shhhercommand.cpp @@ -18,14 +18,6 @@ #include "sparsematrix.hpp" #include -//********************************************************************************************************************** - -#define NUMBINS 1000 -#define HOMOPS 10 -#define MIN_COUNT 0.1 -#define MIN_WEIGHT 0.1 -#define MIN_TAU 0.0001 -#define MIN_ITER 10 //********************************************************************************************************************** vector ShhherCommand::setParameters(){ try { @@ -310,8 +302,8 @@ int ShhherCommand::execute(){ processors = ncpus; m->mothurOut("\nGetting preliminary data...\n"); - getSingleLookUp(); - getJointLookUp(); + getSingleLookUp(); if (m->control_pressed) { return 0; } + getJointLookUp(); if (m->control_pressed) { return 0; } vector flowFileVector; if(flowFilesFileName != "not found"){ @@ -320,7 +312,7 @@ int ShhherCommand::execute(){ ifstream flowFilesFile; m->openInputFile(flowFilesFileName, flowFilesFile); while(flowFilesFile){ - flowFilesFile >> fName; + fName = m->getline(flowFilesFile); flowFileVector.push_back(fName); m->gobble(flowFilesFile); } @@ -335,17 +327,24 @@ int ShhherCommand::execute(){ } for(int i=0;icontrol_pressed) { break; } + double begClock = clock(); - unsigned long int begTime = time(NULL); + unsigned long long begTime = time(NULL); flowFileName = flowFileVector[i]; m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n"); m->mothurOut("Reading flowgrams...\n"); getFlowData(); + + if (m->control_pressed) { break; } m->mothurOut("Identifying unique flowgrams...\n"); getUniques(); + + if (m->control_pressed) { break; } m->mothurOut("Calculating distances between flowgrams...\n"); char fileName[1024]; @@ -368,6 +367,8 @@ int ShhherCommand::execute(){ string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques)); + if (m->control_pressed) { break; } + int done; for(int i=1;icontrol_pressed) { break; } + m->mothurOut("\nClustering flowgrams...\n"); string listFileName = cluster(distFileName, namesFileName); - + + if (m->control_pressed) { break; } + getOTUData(listFileName); m->mothurRemove(distFileName); m->mothurRemove(namesFileName); m->mothurRemove(listFileName); + if (m->control_pressed) { break; } + initPyroCluster(); + if (m->control_pressed) { break; } + for(int i=1;icontrol_pressed) { break; } double maxDelta = 0; int iter = 0; @@ -406,10 +416,12 @@ int ShhherCommand::execute(){ m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n"); while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){ - + double cycClock = clock(); - unsigned long int cycTime = time(NULL); + unsigned long long cycTime = time(NULL); fill(); + + if (m->control_pressed) { break; } int total = singleTau.size(); for(int i=1;icontrol_pressed) { break; } + double nLL = getLikelihood(); if (m->control_pressed) { break; } + checkCentroids(); if (m->control_pressed) { break; } for(int i=1;icontrol_pressed) { break; } + m->mothurOut("\nFinalizing...\n"); fill(); + + if (m->control_pressed) { break; } + setOTUs(); vector otuCounts(numOTUs, 0); for(int i=0;icontrol_pressed) { break; } + + writeQualities(otuCounts); if (m->control_pressed) { break; } + writeSequences(otuCounts); if (m->control_pressed) { break; } + writeNames(otuCounts); if (m->control_pressed) { break; } + writeClusters(otuCounts); if (m->control_pressed) { break; } + writeGroups(); if (m->control_pressed) { break; } m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n'); @@ -549,6 +568,9 @@ int ShhherCommand::execute(){ MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); for(int i=0;icontrol_pressed) { break; } + //Now into the pyrodist part bool live = 1; @@ -578,7 +600,9 @@ int ShhherCommand::execute(){ int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques); string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd); - + + if (m->control_pressed) { break; } + int done = 1; MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD); @@ -608,6 +632,8 @@ int ShhherCommand::execute(){ while(live){ + if (m->control_pressed) { break; } + MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); singleTau.assign(total, 0.0000); seqNumber.assign(total, 0); @@ -643,7 +669,10 @@ int ShhherCommand::execute(){ MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); } } - } + } + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + MPI_Barrier(MPI_COMM_WORLD); @@ -680,6 +709,9 @@ string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){ double begClock = clock(); for(int i=startSeq;icontrol_pressed) { break; } + for(int j=0;jmothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n'); - string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist"; if(pid != 0){ fDistFileName += ".temp." + toString(pid); } + + if (m->control_pressed) { return fDistFileName; } + + m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n'); ofstream distFile(fDistFileName.c_str()); distFile << outStream.str(); @@ -719,9 +753,10 @@ int ShhherCommand::execute(){ try { if (abort == true) { return 0; } - getSingleLookUp(); - getJointLookUp(); + getSingleLookUp(); if (m->control_pressed) { return 0; } + getJointLookUp(); if (m->control_pressed) { return 0; } + vector flowFileVector; if(flowFilesFileName != "not found"){ string fName; @@ -729,7 +764,7 @@ int ShhherCommand::execute(){ ifstream flowFilesFile; m->openInputFile(flowFilesFileName, flowFilesFile); while(flowFilesFile){ - flowFilesFile >> fName; + fName = m->getline(flowFilesFile); flowFileVector.push_back(fName); m->gobble(flowFilesFile); } @@ -741,76 +776,111 @@ int ShhherCommand::execute(){ for(int i=0;icontrol_pressed) { break; } + flowFileName = flowFileVector[i]; m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n"); m->mothurOut("Reading flowgrams...\n"); getFlowData(); + if (m->control_pressed) { break; } + m->mothurOut("Identifying unique flowgrams...\n"); getUniques(); + if (m->control_pressed) { break; } m->mothurOut("Calculating distances between flowgrams...\n"); string distFileName = createDistFile(processors); string namesFileName = createNamesFile(); - + + if (m->control_pressed) { break; } + m->mothurOut("\nClustering flowgrams...\n"); string listFileName = cluster(distFileName, namesFileName); + + if (m->control_pressed) { break; } + getOTUData(listFileName); + if (m->control_pressed) { break; } + m->mothurRemove(distFileName); m->mothurRemove(namesFileName); m->mothurRemove(listFileName); initPyroCluster(); + if (m->control_pressed) { break; } + double maxDelta = 0; int iter = 0; double begClock = clock(); - unsigned long int begTime = time(NULL); + unsigned long long begTime = time(NULL); m->mothurOut("\nDenoising flowgrams...\n"); m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n"); while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){ + if (m->control_pressed) { break; } + double cycClock = clock(); - unsigned long int cycTime = time(NULL); + unsigned long long cycTime = time(NULL); fill(); + if (m->control_pressed) { break; } + calcCentroids(); - maxDelta = getNewWeights(); - double nLL = getLikelihood(); + if (m->control_pressed) { break; } + + maxDelta = getNewWeights(); if (m->control_pressed) { break; } + double nLL = getLikelihood(); if (m->control_pressed) { break; } checkCentroids(); + if (m->control_pressed) { break; } + calcNewDistances(); - + + if (m->control_pressed) { break; } + iter++; m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n'); } + if (m->control_pressed) { break; } + m->mothurOut("\nFinalizing...\n"); fill(); + + if (m->control_pressed) { break; } + setOTUs(); + if (m->control_pressed) { break; } + vector otuCounts(numOTUs, 0); for(int i=0;icontrol_pressed) { break; } + writeQualities(otuCounts); if (m->control_pressed) { break; } + writeSequences(otuCounts); if (m->control_pressed) { break; } + writeNames(otuCounts); if (m->control_pressed) { break; } + writeClusters(otuCounts); if (m->control_pressed) { break; } + writeGroups(); if (m->control_pressed) { break; } m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n'); } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + + if(compositeFASTAFileName != ""){ outputNames.push_back(compositeFASTAFileName); outputNames.push_back(compositeNamesFileName); @@ -850,6 +920,9 @@ void ShhherCommand::getFlowData(){ flowFile >> numFlowCells; int index = 0;//pcluster while(!flowFile.eof()){ + + if (m->control_pressed) { break; } + flowFile >> seqName >> currentNumFlowCells; lengths.push_back(currentNumFlowCells); @@ -869,6 +942,9 @@ void ShhherCommand::getFlowData(){ numSeqs = seqNameVector.size(); for(int i=0;icontrol_pressed) { break; } + int iNumFlowCells = i * numFlowCells; for(int j=lengths[i];jopenInputFile(lookupFileName, lookUpFile); for(int i=0;icontrol_pressed) { break; } + float logFracFreq; lookUpFile >> logFracFreq; @@ -919,6 +998,9 @@ void ShhherCommand::getJointLookUp(){ jointLookUp.resize(NUMBINS * NUMBINS, 0); for(int i=0;icontrol_pressed) { break; } + for(int j=0;jcontrol_pressed) { break; } + float negLogProb = singleLookUp[i * NUMBINS + intIntensity]; if(negLogProb < minNegLogProb) { minNegLogProb = negLogProb; } } @@ -975,6 +1060,9 @@ void ShhherCommand::getUniques(){ vector uniqueFlowDataIntI(numFlowCells * numSeqs, -1); for(int i=0;icontrol_pressed) { break; } + int index = 0; vector current(numFlowCells); @@ -1025,7 +1113,7 @@ void ShhherCommand::getUniques(){ uniqueLengths.resize(numUniques); flowDataPrI.resize(numSeqs * numFlowCells, 0); - for(int i=0;icontrol_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]); } } catch(exception& e) { m->errorOut(e, "ShhherCommand", "getUniques"); @@ -1046,6 +1134,9 @@ float ShhherCommand::calcPairwiseDist(int seqA, int seqB){ float dist = 0; for(int i=0;icontrol_pressed) { break; } + int flowAIntI = flowDataIntI[ANumFlowCells + i]; float flowAPrI = flowDataPrI[ANumFlowCells + i]; @@ -1078,6 +1169,9 @@ void ShhherCommand::flowDistParentFork(string distFileName, int startSeq, int st double begClock = clock(); for(int i=startSeq;icontrol_pressed) { break; } + for(int j=0;jmothurOutEndLine(); } } - m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime)); - m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)); - m->mothurOutEndLine(); ofstream distFile(distFileName.c_str()); distFile << outStream.str(); distFile.close(); + + if (m->control_pressed) {} + else { + m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime)); + m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)); + m->mothurOutEndLine(); + } } catch(exception& e) { m->errorOut(e, "ShhherCommand", "flowDistParentFork"); @@ -1112,31 +1210,37 @@ void ShhherCommand::flowDistParentFork(string distFileName, int startSeq, int st string ShhherCommand::createDistFile(int processors){ try{ +//////////////////////// until I figure out the shared memory issue ////////////////////// +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#else + processors=1; +#endif +//////////////////////// until I figure out the shared memory issue ////////////////////// + string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist"; - unsigned long int begTime = time(NULL); + unsigned long long begTime = time(NULL); double begClock = clock(); - - vector start; - vector end; -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + if (numSeqs < processors){ processors = 1; } + if(processors == 1) { flowDistParentFork(fDistFileName, 0, numUniques); } + else{ //you have multiple processors - if (numSeqs < processors){ processors = 1; } - vector start(processors, 0); vector end(processors, 0); + int process = 1; + vector processIDs; + for (int i = 0; i < processors; i++) { start[i] = int(sqrt(float(i)/float(processors)) * numUniques); end[i] = int(sqrt(float(i+1)/float(processors)) * numUniques); } - int process = 1; - vector processIDs; - +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + //loop through and create all the processes you want while (process != processors) { int pid = fork(); @@ -1163,6 +1267,43 @@ string ShhherCommand::createDistFile(int processors){ int temp = processIDs[i]; wait(&temp); } +#else + ////////////////////////////////////////////////////////////////////////////////////////////////////// + //Windows version shared memory, so be careful when passing variables through the flowDistParentForkData struct. + //Above fork() will clone, so memory is separate, but that's not the case with windows, + ////////////////////////////////////////////////////////////////////////////////////////////////////// + + vector pDataArray; + DWORD dwThreadIdArray[processors-1]; + HANDLE hThreadArray[processors-1]; + + //Create processor worker threads. + for(int i = 0; i < processors-1; i++){ + // Allocate memory for thread data. + string extension = extension = toString(i) + ".temp"; + + flowDistParentForkData* tempdist = new flowDistParentForkData((fDistFileName + extension), mapUniqueToSeq, mapSeqToUnique, lengths, flowDataIntI, flowDataPrI, jointLookUp, m, start[i+1], end[i+1], numFlowCells, cutoff, i); + pDataArray.push_back(tempdist); + processIDs.push_back(i); + + //MySeqSumThreadFunction is in header. It must be global or static to work with the threads. + //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier + hThreadArray[i] = CreateThread(NULL, 0, MyflowDistParentForkThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]); + } + + //parent does its part + flowDistParentFork(fDistFileName, start[0], end[0]); + + //Wait until all threads have terminated. + WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE); + + //Close all thread handles and free memory allocations. + for(int i=0; i < pDataArray.size(); i++){ + CloseHandle(hThreadArray[i]); + delete pDataArray[i]; + } + +#endif //append and remove temp files for (int i=0;imothurOutEndLine(); - m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n'); - return fDistFileName; } catch(exception& e) { @@ -1206,6 +1341,9 @@ string ShhherCommand::createNamesFile(){ m->openOutputFile(nameFileName, nameFile); for(int i=0;icontrol_pressed) { break; } + // nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl; nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl; } @@ -1244,6 +1382,9 @@ string ShhherCommand::cluster(string distFileName, string namesFileName){ double clusterCutoff = cutoff; while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){ + + if (m->control_pressed) { break; } + cluster->update(clusterCutoff); } @@ -1288,6 +1429,8 @@ void ShhherCommand::getOTUData(string listFileName){ string singleOTU = ""; for(int i=0;icontrol_pressed) { break; } listFile >> singleOTU; @@ -1358,6 +1501,9 @@ void ShhherCommand::getOTUData(string listFileName){ void ShhherCommand::initPyroCluster(){ try{ + + if (numOTUs < processors) { processors = 1; } + dist.assign(numSeqs * numOTUs, 0); change.assign(numOTUs, 1); centroids.assign(numOTUs, -1); @@ -1387,6 +1533,9 @@ void ShhherCommand::fill(){ try { int index = 0; for(int i=0;icontrol_pressed) { break; } + cumNumSeqs[i] = index; for(int j=0;jcontrol_pressed) { break; } + double count = 0; int position = 0; int minFlowGram = 100000000; @@ -1545,7 +1695,7 @@ double ShhherCommand::getDistToCentroid(int cent, int flow, int length){ int flowBValue = flow * numFlowCells; double dist = 0; - + for(int i=0;icontrol_pressed) { break; } + double difference = weight[i]; weight[i] = 0; @@ -1606,6 +1758,9 @@ double ShhherCommand::getLikelihood(){ string hold; for(int i=0;icontrol_pressed) { break; } + for(int j=0;jcontrol_pressed) { break; } + if(unique[i] == 1){ for(int j=i+1;jcontrol_pressed) { break; } + double offset = 1e8; int indexOffset = i * numOTUs; @@ -1818,6 +1977,9 @@ void ShhherCommand::calcNewDistancesChild(int startSeq, int stopSeq, vector child_singleTau.resize(0); for(int i=startSeq;icontrol_pressed) { break; } + double offset = 1e8; int indexOffset = i * numOTUs; @@ -1869,22 +2031,26 @@ void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){ vector newTau(numOTUs,0); vector norms(numSeqs, 0); nSeqsPerOTU.assign(numOTUs, 0); - + for(int i=startSeq;icontrol_pressed) { break; } + + int indexOffset = i * numOTUs; + double offset = 1e8; for(int j=0;j MIN_WEIGHT && change[j] == 1){ dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]); } - + if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){ offset = dist[indexOffset + j]; } } - + for(int j=0;j MIN_WEIGHT){ newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j]; @@ -1894,11 +2060,11 @@ void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){ newTau[j] = 0.0; } } - + for(int j=0;j MIN_TAU){ @@ -1917,7 +2083,9 @@ void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){ nSeqsPerOTU[j]++; } } + } + } catch(exception& e) { m->errorOut(e, "ShhherCommand", "calcNewDistancesParent"); @@ -1933,6 +2101,9 @@ void ShhherCommand::setOTUs(){ vector bigTauMatrix(numOTUs * numSeqs, 0.0000); for(int i=0;icontrol_pressed) { break; } + for(int j=0;j otuCounts){ for(int i=0;icontrol_pressed) { break; } + int index = 0; int base = 0; @@ -2090,6 +2264,9 @@ void ShhherCommand::writeSequences(vector otuCounts){ vector names(numOTUs, ""); for(int i=0;icontrol_pressed) { break; } + int index = centroids[i]; if(otuCounts[i] > 0){ @@ -2131,6 +2308,9 @@ void ShhherCommand::writeNames(vector otuCounts){ m->openOutputFile(nameFileName, nameFile); for(int i=0;icontrol_pressed) { break; } + if(otuCounts[i] > 0){ nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]]; @@ -2165,6 +2345,7 @@ void ShhherCommand::writeGroups(){ m->openOutputFile(groupFileName, groupFile); for(int i=0;icontrol_pressed) { break; } groupFile << seqNameVector[i] << '\t' << fileRoot << endl; } groupFile.close(); @@ -2188,6 +2369,10 @@ void ShhherCommand::writeClusters(vector otuCounts){ string bases = flowOrder; for(int i=0;icontrol_pressed) { + break; + } //output the translated version of the centroid sequence for the otu if(otuCounts[i] > 0){ int index = centroids[i]; diff --git a/shhhercommand.h b/shhhercommand.h index bdc8503..00bd41a 100644 --- a/shhhercommand.h +++ b/shhhercommand.h @@ -13,6 +13,15 @@ #include "mothur.h" #include "command.hpp" +//********************************************************************************************************************** + +#define NUMBINS 1000 +#define HOMOPS 10 +#define MIN_COUNT 0.1 +#define MIN_WEIGHT 0.1 +#define MIN_TAU 0.0001 +#define MIN_ITER 10 +//********************************************************************************************************************** class ShhherCommand : public Command { @@ -116,6 +125,125 @@ private: }; +/**************************************************************************************************/ +//custom data structure for threads to use. +// This is passed by void pointer so it can be any data type +// that can be passed using a single void pointer (LPVOID). +typedef struct flowDistParentForkData { + string distFileName; + vector mapUniqueToSeq; + vector mapSeqToUnique; + vector lengths; + vector flowDataIntI; + vector flowDataPrI; + vector jointLookUp; + MothurOut* m; + int threadID, startSeq, stopSeq, numFlowCells; + float cutoff; + + flowDistParentForkData(){} + flowDistParentForkData(string d, vector mapU, vector mapS, vector l, vector flowD, vector flowDa, vector j, MothurOut* mout, int st, int sp, int n, float cut, int tid) { + distFileName = d; + mapUniqueToSeq = mapU; + mapSeqToUnique = mapS; + lengths = l; + flowDataIntI = flowD; + flowDataPrI = flowDa; + jointLookUp = j; + m = mout; + startSeq = st; + stopSeq = sp; + numFlowCells = n; + cutoff= cut; + threadID = tid; + } +}; + +/**************************************************************************************************/ +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#else +static DWORD WINAPI MyflowDistParentForkThreadFunction(LPVOID lpParam){ + flowDistParentForkData* pDataArray; + pDataArray = (flowDistParentForkData*)lpParam; + + try { + ostringstream outStream; + outStream.setf(ios::fixed, ios::floatfield); + outStream.setf(ios::dec, ios::basefield); + outStream.setf(ios::showpoint); + outStream.precision(6); + + int begTime = time(NULL); + double begClock = clock(); + string tempOut = "start and end = " + toString(pDataArray->startSeq) +'\t' + toString(pDataArray->stopSeq) + "-"; + cout << tempOut << endl; + + for(int i=pDataArray->startSeq;istopSeq;i++){ + + if (pDataArray->m->control_pressed) { break; } + cout << "thread i = " << i << endl; + for(int j=0;jm->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)); + pDataArray->m->mothurOutEndLine(); + } + } + + ofstream distFile(pDataArray->distFileName.c_str()); + distFile << outStream.str(); + distFile.close(); + + if (pDataArray->m->control_pressed) {} + else { + pDataArray->m->mothurOut(toString(pDataArray->stopSeq-1) + "\t" + toString(time(NULL) - begTime)); + pDataArray->m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)); + pDataArray->m->mothurOutEndLine(); + } + + } + catch(exception& e) { + pDataArray->m->errorOut(e, "ShhherCommand", "MyflowDistParentForkThreadFunction"); + exit(1); + } +} +#endif + #endif diff --git a/simpson.cpp b/simpson.cpp index bf3365d..edc35bd 100644 --- a/simpson.cpp +++ b/simpson.cpp @@ -28,13 +28,13 @@ EstOutput Simpson::getValues(SAbundVector* rank){ if(sobs != 0){ double simnum=0.0000; - for(unsigned long int i=1;i<=maxRank;i++){ + for(unsigned long long i=1;i<=maxRank;i++){ simnum += (double)(rank->get(i)*i*(i-1)); } simpson = simnum / (sampled*(sampled-1)); - for(unsigned long int i=1;i<=maxRank;i++){ + for(unsigned long long i=1;i<=maxRank;i++){ double piI = (double) i / (double)sampled; firstTerm += rank->get(i) * pow(piI, 3); secondTerm += rank->get(i) * pow(piI, 2); diff --git a/subsamplecommand.cpp b/subsamplecommand.cpp index 651ee02..28cfc34 100644 --- a/subsamplecommand.cpp +++ b/subsamplecommand.cpp @@ -9,6 +9,7 @@ #include "subsamplecommand.h" #include "sharedutilities.h" +#include "deconvolutecommand.h" //********************************************************************************************************************** vector SubSampleCommand::setParameters(){ @@ -423,60 +424,45 @@ int SubSampleCommand::getSubSampleFasta() { set subset; //dont want repeat sequence names added if (persample) { - for (int i = 0; i < Groups.size(); i++) { - - //randomly select a subset of those names from this group to include in the subsample - for (int j = 0; j < size; j++) { - - if (m->control_pressed) { return 0; } + //initialize counts + map groupCounts; + for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; } + + for (int j = 0; j < names.size(); j++) { - //get random sequence to add, making sure we have not already added it - bool done = false; - int myrand; - while (!done) { - myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0)); - - if (subset.count(names[myrand]) == 0) { - - string group = groupMap->getGroup(names[myrand]); - if (group == "not found") { m->mothurOut("[ERROR]: " + names[myrand] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } - - if (group == Groups[i]) { subset.insert(names[myrand]); break; } - } - } - } + if (m->control_pressed) { return 0; } + + string group = groupMap->getGroup(names[j]); + if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } + else{ + if (groupCounts[group] < size) { subset.insert(names[j]); } + } } }else { //randomly select a subset of those names to include in the subsample - for (int j = 0; j < size; j++) { + //since names was randomly shuffled just grab the next one + for (int j = 0; j < names.size(); j++) { if (m->control_pressed) { return 0; } - //get random sequence to add, making sure we have not already added it - bool done = false; - int myrand; - while (!done) { - myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0)); + if (groupfile != "") { //if there is a groupfile given fill in group info + string group = groupMap->getGroup(names[j]); + if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } - if (subset.count(names[myrand]) == 0) { - - if (groupfile != "") { //if there is a groupfile given fill in group info - string group = groupMap->getGroup(names[myrand]); - if (group == "not found") { m->mothurOut("[ERROR]: " + names[myrand] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } - - if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups - if (m->inUsersGroups(group, Groups)) { - subset.insert(names[myrand]); break; - } - }else{ - subset.insert(names[myrand]); break; - } - }else{ //save everyone, group - subset.insert(names[myrand]); break; - } + if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups + if (m->inUsersGroups(group, Groups)) { + subset.insert(names[j]); + } + }else{ + subset.insert(names[j]); } - } + }else{ //save everyone, group + subset.insert(names[j]); + } + + //do we have enough?? + if (subset.size() == size) { break; } } } @@ -488,7 +474,6 @@ int SubSampleCommand::getSubSampleFasta() { ofstream out; m->openOutputFile(outputFileName, out); - outputTypes["fasta"].push_back(outputFileName); outputNames.push_back(outputFileName); //read through fasta file outputting only the names on the subsample list ifstream in; @@ -531,6 +516,32 @@ int SubSampleCommand::getSubSampleFasta() { m->mothurOut("[ERROR]: The subset selected contained " + toString(subset.size()) + " sequences, but I only found " + toString(count) + " of those in the fastafile."); m->mothurOutEndLine(); } + if (namefile != "") { + m->mothurOut("Deconvoluting subsampled fasta file... "); m->mothurOutEndLine(); + + //use unique.seqs to create new name and fastafile + string inputString = "fasta=" + outputFileName; + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); + + Command* uniqueCommand = new DeconvoluteCommand(inputString); + uniqueCommand->execute(); + + map > filenames = uniqueCommand->getOutputFiles(); + + delete uniqueCommand; + + outputTypes["name"].push_back(filenames["name"][0]); outputNames.push_back(filenames["name"][0]); + m->mothurRemove(outputFileName); + outputFileName = filenames["fasta"][0]; + + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + + m->mothurOut("Done."); m->mothurOutEndLine(); + } + + outputTypes["fasta"].push_back(outputFileName); outputNames.push_back(outputFileName); + //if a groupfile is provided read through the group file only outputting the names on the subsample list if (groupfile != "") { @@ -815,9 +826,10 @@ int SubSampleCommand::processShared(vector& thislookup) { if (m->control_pressed) { delete order; out.close(); return 0; } //get random number to sample from order between 0 and thisSize-1. - int myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0)); + //don't need this because of the random shuffle above + //int myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0)); - int bin = order->get(myrand); + int bin = order->get(j); int abund = thislookup[i]->getAbundance(bin); thislookup[i]->set(bin, (abund+1), thisgroup); @@ -1011,37 +1023,26 @@ int SubSampleCommand::getSubSampleList() { //randomly select a subset of those names to include in the subsample set subset; //dont want repeat sequence names added if (persample) { - for (int i = 0; i < Groups.size(); i++) { + //initialize counts + map groupCounts; + for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; } + + for (int j = 0; j < names.size(); j++) { - for (int j = 0; j < size; j++) { - - if (m->control_pressed) { break; } - - //get random sequence to add, making sure we have not already added it - bool done = false; - int myrand; - while (!done) { - myrand = int((float)(names.size()) * (float)(rand()) / ((float)RAND_MAX+1.0)); - - if (subset.count(names[myrand]) == 0) { //you are not already added - if (groupMap->getGroup(names[myrand]) == Groups[i]) { subset.insert(names[myrand]); break; } - } - } - } + if (m->control_pressed) { return 0; } + + string group = groupMap->getGroup(names[j]); + if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; } + else{ + if (groupCounts[group] < size) { subset.insert(names[j]); } + } } }else{ for (int j = 0; j < size; j++) { if (m->control_pressed) { break; } - //get random sequence to add, making sure we have not already added it - bool done = false; - int myrand; - while (!done) { - myrand = int((float)(names.size()) * (float)(rand()) / ((float)RAND_MAX+1.0)); - - if (subset.count(names[myrand]) == 0) { subset.insert(names[myrand]); break; } - } + subset.insert(names[j]); } } @@ -1322,10 +1323,7 @@ int SubSampleCommand::processRabund(RAbundVector*& rabund, ofstream& out) { if (m->control_pressed) { delete order; return 0; } - //get random number to sample from order between 0 and thisSize-1. - int myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0)); - - int bin = order->get(myrand); + int bin = order->get(j); int abund = rabund->get(bin); rabund->set(bin, (abund+1)); @@ -1484,10 +1482,7 @@ int SubSampleCommand::processSabund(SAbundVector*& sabund, ofstream& out) { if (m->control_pressed) { delete order; return 0; } - //get random number to sample from order between 0 and thisSize-1. - int myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0)); - - int bin = order->get(myrand); + int bin = order->get(j); int abund = rabund->get(bin); rabund->set(bin, (abund+1)); diff --git a/systemcommand.cpp b/systemcommand.cpp index d0057e8..c07deff 100644 --- a/systemcommand.cpp +++ b/systemcommand.cpp @@ -91,23 +91,31 @@ int SystemCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } - command += " > ./commandScreen.output 2>&1"; - system(command.c_str()); + //if command contains a redirect don't add the redirect + bool usedRedirect = false; + if ((command.find('>')) == string::npos) { + command += " > ./commandScreen.output 2>&1"; + usedRedirect = true; + } - ifstream in; - string filename = "./commandScreen.output"; - m->openInputFile(filename, in, "no error"); + system(command.c_str()); - string output = ""; - while(char c = in.get()){ - if(in.eof()) { break; } - else { output += c; } + if (usedRedirect) { + ifstream in; + string filename = "./commandScreen.output"; + m->openInputFile(filename, in, "no error"); + + string output = ""; + while(char c = in.get()){ + if(in.eof()) { break; } + else { output += c; } + } + in.close(); + + m->mothurOut(output); m->mothurOutEndLine(); + m->mothurRemove(filename); } - in.close(); - m->mothurOut(output); m->mothurOutEndLine(); - m->mothurRemove(filename); - return 0; } diff --git a/trimflowscommand.cpp b/trimflowscommand.cpp index d17e000..6d69735 100644 --- a/trimflowscommand.cpp +++ b/trimflowscommand.cpp @@ -10,6 +10,7 @@ #include "trimflowscommand.h" #include "needlemanoverlap.hpp" + //********************************************************************************************************************** vector TrimFlowsCommand::setParameters(){ try { @@ -224,25 +225,46 @@ int TrimFlowsCommand::execute(){ outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName); } - vector flowFilePos = getFlowFileBreaks(); + vector flowFilePos; + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + flowFilePos = getFlowFileBreaks(); for (int i = 0; i < (flowFilePos.size()-1); i++) { lines.push_back(new linePair(flowFilePos[i], flowFilePos[(i+1)])); } - + #else + ifstream in; m->openInputFile(flowFileName, in); in >> numFlows; in.close(); + ///////////////////////////////////////// until I fix multiple processors for windows ////////////////// + processors = 1; + ///////////////////////////////////////// until I fix multiple processors for windows ////////////////// + if (processors == 1) { + lines.push_back(new linePair(0, 1000)); + }else { + int numFlowLines; + flowFilePos = m->setFilePosEachLine(flowFileName, numFlowLines); + flowFilePos.erase(flowFilePos.begin() + 1); numFlowLines--; + + //figure out how many sequences you have to process + int numSeqsPerProcessor = numFlowLines / processors; + cout << numSeqsPerProcessor << '\t' << numFlowLines << endl; + for (int i = 0; i < processors; i++) { + int startIndex = i * numSeqsPerProcessor; + if(i == (processors - 1)){ numSeqsPerProcessor = numFlowLines - i * numSeqsPerProcessor; } + lines.push_back(new linePair(flowFilePos[startIndex], numSeqsPerProcessor)); + cout << flowFilePos[startIndex] << '\t' << numSeqsPerProcessor << endl; + } + } + #endif + vector > barcodePrimerComboFileNames; if(oligoFileName != ""){ getOligos(barcodePrimerComboFileNames); } -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) if(processors == 1){ driverCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames, lines[0]); }else{ createProcessesCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames); } -#else - driverCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames, lines[0]); -#endif if (m->control_pressed) { return 0; } @@ -258,7 +280,7 @@ int TrimFlowsCommand::execute(){ for(int j=0;j > barcodePrimerComboFileNames, linePair* line){ +int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileName, string scrapFlowFileName, string fastaFileName, vector > thisBarcodePrimerComboFileNames, linePair* line){ try { - ofstream trimFlowFile; m->openOutputFile(trimFlowFileName, trimFlowFile); trimFlowFile.setf(ios::fixed, ios::floatfield); trimFlowFile.setf(ios::showpoint); @@ -325,16 +346,24 @@ int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileN ofstream scrapFlowFile; m->openOutputFile(scrapFlowFileName, scrapFlowFile); scrapFlowFile.setf(ios::fixed, ios::floatfield); scrapFlowFile.setf(ios::showpoint); - - if(line->start == 4){ + + ofstream fastaFile; + if(fasta){ m->openOutputFile(fastaFileName, fastaFile); } + + ifstream flowFile; + m->openInputFile(flowFileName, flowFile); + + flowFile.seekg(line->start); + + if(line->start == 0){ + flowFile >> numFlows; m->gobble(flowFile); scrapFlowFile << maxFlows << endl; trimFlowFile << maxFlows << endl; if(allFiles){ - for(int i=0;iopenOutputFile(barcodePrimerComboFileNames[i][j], temp); + m->openOutputFile(thisBarcodePrimerComboFileNames[i][j], temp); temp << maxFlows << endl; temp.close(); } @@ -343,28 +372,28 @@ int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileN } FlowData flowData(numFlows, signal, noise, maxHomoP, flowOrder); - - ofstream fastaFile; - if(fasta){ m->openOutputFile(fastaFileName, fastaFile); } - - ifstream flowFile; - m->openInputFile(flowFileName, flowFile); - - flowFile.seekg(line->start); - + //cout << " driver flowdata address " << &flowData << &flowFile << endl; int count = 0; bool moreSeqs = 1; - + + TrimOligos trimOligos(pdiffs, bdiffs, primers, barcodes, revPrimer); + while(moreSeqs) { + //cout << "driver " << count << endl; + + + if (m->control_pressed) { break; } int success = 1; int currentSeqDiffs = 0; string trashCode = ""; - flowData.getNext(flowFile); + flowData.getNext(flowFile); + //cout << "driver good bit " << flowFile.good() << endl; flowData.capFlows(maxFlows); Sequence currSeq = flowData.getSequence(); + if(!flowData.hasMinFlows(minFlows)){ //screen to see if sequence is of a minimum number of flows success = 0; trashCode += 'l'; @@ -374,13 +403,13 @@ int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileN int barcodeIndex = 0; if(barcodes.size() != 0){ - success = stripBarcode(currSeq, barcodeIndex); + success = trimOligos.stripBarcode(currSeq, barcodeIndex); if(success > bdiffs) { trashCode += 'b'; } else{ currentSeqDiffs += success; } } if(numFPrimers != 0){ - success = stripForward(currSeq, primerIndex); + success = trimOligos.stripForward(currSeq, primerIndex); if(success > pdiffs) { trashCode += 'f'; } else{ currentSeqDiffs += success; } } @@ -388,10 +417,10 @@ int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileN if (currentSeqDiffs > tdiffs) { trashCode += 't'; } if(numRPrimers != 0){ - success = stripReverse(currSeq); + success = trimOligos.stripReverse(currSeq); if(!success) { trashCode += 'r'; } } - + if(trashCode.length() == 0){ flowData.printFlows(trimFlowFile); @@ -400,7 +429,7 @@ int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileN if(allFiles){ ofstream output; - m->openOutputFileAppend(barcodePrimerComboFileNames[barcodeIndex][primerIndex], output); + m->openOutputFileAppend(thisBarcodePrimerComboFileNames[barcodeIndex][primerIndex], output); output.setf(ios::fixed, ios::floatfield); trimFlowFile.setf(ios::showpoint); flowData.printFlows(output); @@ -412,12 +441,12 @@ int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileN } count++; - + //cout << "driver" << '\t' << currSeq.getName() << endl; //report progress if((count) % 10000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - unsigned long int pos = flowFile.tellg(); + unsigned long long pos = flowFile.tellg(); if ((pos == -1) || (pos >= line->end)) { break; } #else @@ -575,352 +604,16 @@ void TrimFlowsCommand::getOligos(vector >& outFlowFileNames){ exit(1); } } - -//*************************************************************************************************************** - -int TrimFlowsCommand::stripBarcode(Sequence& seq, int& group){ - try { - - string rawSequence = seq.getUnaligned(); - int success = bdiffs + 1; //guilty until proven innocent - - //can you find the barcode - for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ - string oligo = it->first; - if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length - success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out - break; - } - - if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ - group = it->second; - seq.setUnaligned(rawSequence.substr(oligo.length())); - - success = 0; - break; - } - } - - //if you found the barcode or if you don't want to allow for diffs - if ((bdiffs == 0) || (success == 0)) { return success; } - - else { //try aligning and see if you can find it - - int maxLength = 0; - - Alignment* alignment; - if (barcodes.size() > 0) { - map::iterator it=barcodes.begin(); - - for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ - if(it->first.length() > maxLength){ - maxLength = it->first.length(); - } - } - alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1)); - - }else{ alignment = NULL; } - - //can you find the barcode - int minDiff = 1e6; - int minCount = 1; - int minGroup = -1; - int minPos = 0; - - for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ - string oligo = it->first; - // int length = oligo.length(); - - if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length - success = bdiffs + 10; - break; - } - - //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs)); - oligo = alignment->getSeqAAln(); - string temp = alignment->getSeqBAln(); - - int alnLength = oligo.length(); - - for(int i=oligo.length()-1;i>=0;i--){ - if(oligo[i] != '-'){ alnLength = i+1; break; } - } - oligo = oligo.substr(0,alnLength); - temp = temp.substr(0,alnLength); - - int numDiff = countDiffs(oligo, temp); - - if(numDiff < minDiff){ - minDiff = numDiff; - minCount = 1; - minGroup = it->second; - minPos = 0; - for(int i=0;i bdiffs) { success = minDiff; } //no good matches - else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes - else{ //use the best match - group = minGroup; - seq.setUnaligned(rawSequence.substr(minPos)); - success = minDiff; - } - - if (alignment != NULL) { delete alignment; } - - } - - return success; - - } - catch(exception& e) { - m->errorOut(e, "TrimFlowsCommand", "stripBarcode"); - exit(1); - } - -} - -//*************************************************************************************************************** - -int TrimFlowsCommand::stripForward(Sequence& seq, int& group){ - try { - - string rawSequence = seq.getUnaligned(); - int success = pdiffs + 1; //guilty until proven innocent - - //can you find the primer - for(map::iterator it=primers.begin();it!=primers.end();it++){ - string oligo = it->first; - if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length - success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out - break; - } - - if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ - group = it->second; - seq.setUnaligned(rawSequence.substr(oligo.length())); - success = 0; - break; - } - } - - //if you found the barcode or if you don't want to allow for diffs - if ((pdiffs == 0) || (success == 0)) { return success; } - - else { //try aligning and see if you can find it - - int maxLength = 0; - - Alignment* alignment; - if (primers.size() > 0) { - map::iterator it=primers.begin(); - - for(it;it!=primers.end();it++){ - if(it->first.length() > maxLength){ - maxLength = it->first.length(); - } - } - alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1)); - - }else{ alignment = NULL; } - - //can you find the barcode - int minDiff = 1e6; - int minCount = 1; - int minGroup = -1; - int minPos = 0; - - for(map::iterator it=primers.begin();it!=primers.end();it++){ - string oligo = it->first; - // int length = oligo.length(); - - if(rawSequence.length() < maxLength){ - success = pdiffs + 100; - break; - } - - //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs)); - oligo = alignment->getSeqAAln(); - string temp = alignment->getSeqBAln(); - - int alnLength = oligo.length(); - - for(int i=oligo.length()-1;i>=0;i--){ - if(oligo[i] != '-'){ alnLength = i+1; break; } - } - oligo = oligo.substr(0,alnLength); - temp = temp.substr(0,alnLength); - - int numDiff = countDiffs(oligo, temp); - - if(numDiff < minDiff){ - minDiff = numDiff; - minCount = 1; - minGroup = it->second; - minPos = 0; - for(int i=0;i pdiffs) { success = minDiff; } //no good matches - else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers - else{ //use the best match - group = minGroup; - seq.setUnaligned(rawSequence.substr(minPos)); - success = minDiff; - } - - if (alignment != NULL) { delete alignment; } - - } - - return success; - - } - catch(exception& e) { - m->errorOut(e, "TrimFlowsCommand", "stripForward"); - exit(1); - } -} - -//*************************************************************************************************************** - -bool TrimFlowsCommand::stripReverse(Sequence& seq){ - try { - - string rawSequence = seq.getUnaligned(); - bool success = 0; //guilty until proven innocent - - for(int i=0;ierrorOut(e, "TrimFlowsCommand", "stripReverse"); - exit(1); - } -} - - -//*************************************************************************************************************** - -bool TrimFlowsCommand::compareDNASeq(string oligo, string seq){ - try { - bool success = 1; - int length = oligo.length(); - - for(int i=0;ierrorOut(e, "TrimFlowsCommand", "compareDNASeq"); - exit(1); - } - -} - -//*************************************************************************************************************** - -int TrimFlowsCommand::countDiffs(string oligo, string seq){ - try { - - int length = oligo.length(); - int countDiffs = 0; - - for(int i=0;ierrorOut(e, "TrimFlowsCommand", "countDiffs"); - exit(1); - } - -} - /**************************************************************************************************/ - -vector TrimFlowsCommand::getFlowFileBreaks() { +vector TrimFlowsCommand::getFlowFileBreaks() { try{ - vector filePos; + vector filePos; filePos.push_back(0); FILE * pFile; - unsigned long int size; + unsigned long long size; //get num bytes in file pFile = fopen (flowFileName.c_str(),"rb"); @@ -932,7 +625,7 @@ vector TrimFlowsCommand::getFlowFileBreaks() { } //estimate file breaks - unsigned long int chunkSize = 0; + unsigned long long chunkSize = 0; chunkSize = size / processors; //file too small to divide by processors @@ -940,7 +633,7 @@ vector TrimFlowsCommand::getFlowFileBreaks() { //for each process seekg to closest file break and search for next '>' char. make that the filebreak for (int i = 0; i < processors; i++) { - unsigned long int spot = (i+1) * chunkSize; + unsigned long long spot = (i+1) * chunkSize; ifstream in; m->openInputFile(flowFileName, in); @@ -949,7 +642,7 @@ vector TrimFlowsCommand::getFlowFileBreaks() { string dummy = m->getline(in); //there was not another sequence before the end of the file - unsigned long int sanityPos = in.tellg(); + unsigned long long sanityPos = in.tellg(); // if (sanityPos == -1) { break; } // else { filePos.push_back(newSpot); } @@ -971,8 +664,8 @@ vector TrimFlowsCommand::getFlowFileBreaks() { m->openInputFile(flowFileName, in); in >> numFlows; m->gobble(in); - unsigned long int spot = in.tellg(); - filePos[0] = spot; + //unsigned long long spot = in.tellg(); + //filePos[0] = spot; in.close(); processors = (filePos.size() - 1); @@ -990,10 +683,11 @@ vector TrimFlowsCommand::getFlowFileBreaks() { int TrimFlowsCommand::createProcessesCreateTrim(string flowFileName, string trimFlowFileName, string scrapFlowFileName, string fastaFileName, vector > barcodePrimerComboFileNames){ try { + processIDS.clear(); + int exitCommand = 1; + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) int process = 1; - int exitCommand = 1; - processIDS.clear(); //loop through and create all the processes you want while (process != processors) { @@ -1050,7 +744,85 @@ int TrimFlowsCommand::createProcessesCreateTrim(string flowFileName, string trim int temp = processIDS[i]; wait(&temp); } +#else + ////////////////////////////////////////////////////////////////////////////////////////////////////// + //Windows version shared memory, so be careful when passing variables through the trimFlowData struct. + //Above fork() will clone, so memory is separate, but that's not the case with windows, + ////////////////////////////////////////////////////////////////////////////////////////////////////// + + vector pDataArray; + DWORD dwThreadIdArray[processors-1]; + HANDLE hThreadArray[processors-1]; + + //Create processor worker threads. + for( int i=0; i > tempBarcodePrimerComboFileNames = barcodePrimerComboFileNames; + if(allFiles){ + for(int i=0;iopenOutputFile(tempBarcodePrimerComboFileNames[i][j], temp); + temp.close(); + + } + } + } + + trimFlowData* tempflow = new trimFlowData(flowFileName, (trimFlowFileName + extension), (scrapFlowFileName + extension), fastaFileName, flowOrder, tempBarcodePrimerComboFileNames, barcodes, primers, revPrimer, fasta, allFiles, lines[i]->start, lines[i]->end, m, signal, noise, numFlows, maxFlows, minFlows, maxHomoP, tdiffs, bdiffs, pdiffs, i); + pDataArray.push_back(tempflow); + + //MyTrimFlowThreadFunction is in header. It must be global or static to work with the threads. + //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier + hThreadArray[i] = CreateThread(NULL, 0, MyTrimFlowThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]); + } + + //using the main process as a worker saves time and memory + ofstream temp; + m->openOutputFile(trimFlowFileName, temp); + temp.close(); + + m->openOutputFile(scrapFlowFileName, temp); + temp.close(); + + if(fasta){ + m->openOutputFile(fastaFileName, temp); + temp.close(); + } + vector > tempBarcodePrimerComboFileNames = barcodePrimerComboFileNames; + if(allFiles){ + for(int i=0;iopenOutputFile(tempBarcodePrimerComboFileNames[i][j], temp); + temp.close(); + + } + } + } + + //do my part - do last piece because windows is looking for eof + int num = driverCreateTrim(flowFileName, (trimFlowFileName + toString(processors-1) + ".temp"), (scrapFlowFileName + toString(processors-1) + ".temp"), (fastaFileName + toString(processors-1) + ".temp"), tempBarcodePrimerComboFileNames, lines[processors-1]); + processIDS.push_back((processors-1)); + + //Wait until all threads have terminated. + WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE); + + //Close all thread handles and free memory allocations. + for(int i=0; i < pDataArray.size(); i++){ + num += pDataArray[i]->count; + CloseHandle(hThreadArray[i]); + delete pDataArray[i]; + } + + +#endif //append files m->mothurOutEndLine(); for(int i=0;ierrorOut(e, "TrimFlowsCommand", "createProcessesCreateTrim"); diff --git a/trimflowscommand.h b/trimflowscommand.h index 5affc3f..ab8ca91 100644 --- a/trimflowscommand.h +++ b/trimflowscommand.h @@ -15,6 +15,7 @@ #include "sequence.hpp" #include "flowdata.h" #include "groupmap.h" +#include "trimoligos.h" class TrimFlowsCommand : public Command { public: @@ -37,15 +38,15 @@ private: bool abort; struct linePair { - unsigned long int start; - unsigned long int end; - linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} + unsigned long long start; + unsigned long long end; + linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {} }; int comboStarts; vector processIDS; //processid vector lines; - vector getFlowFileBreaks(); + vector getFlowFileBreaks(); int createProcessesCreateTrim(string, string, string, string, vector >); int driverCreateTrim(string, string, string, string, vector >, linePair*); @@ -53,11 +54,6 @@ private: set filesToRemove; void getOligos(vector >&); //a rewrite of what is in trimseqscommand.h - int stripBarcode(Sequence&, int&); //largely redundant with trimseqscommand.h - int stripForward(Sequence&, int&); //largely redundant with trimseqscommand.h - bool stripReverse(Sequence&); //largely redundant with trimseqscommand.h - bool compareDNASeq(string, string); //largely redundant with trimseqscommand.h - int countDiffs(string, string); //largely redundant with trimseqscommand.h bool allFiles; int processors; @@ -83,5 +79,183 @@ private: }; +/**************************************************************************************************/ +//custom data structure for threads to use. +// This is passed by void pointer so it can be any data type +// that can be passed using a single void pointer (LPVOID). +typedef struct trimFlowData { + string flowFileName; + string trimFlowFileName; + string scrapFlowFileName; + string fastaFileName; + string flowOrder; + vector > barcodePrimerComboFileNames; + map barcodes; + map primers; + vector revPrimer; + bool fasta, allFiles; + unsigned long long start; + unsigned long long end; + MothurOut* m; + float signal, noise; + int numFlows, maxFlows, minFlows, maxHomoP, tdiffs, bdiffs, pdiffs, threadID, count; + + trimFlowData(){} + trimFlowData(string ff, string tf, string sf, string f, string fo, vector > bfn, map bar, map pri, vector rev, bool fa, bool al, unsigned long long st, unsigned long long en, MothurOut* mout, float sig, float n, int numF, int maxF, int minF, int maxH, int td, int bd, int pd, int tid) { + flowFileName = ff; + trimFlowFileName = tf; + scrapFlowFileName = sf; + fastaFileName = f; + flowOrder = fo; + barcodePrimerComboFileNames = bfn; + barcodes = bar; + primers = pri; + revPrimer = rev; + fasta = fa; + allFiles = al; + start = st; + end = en; + m = mout; + signal = sig; + noise = n; + numFlows = numF; + maxFlows = maxF; + minFlows = minF; + maxHomoP = maxH; + tdiffs = td; + bdiffs = bd; + pdiffs = pd; + threadID = tid; + } +}; + +/**************************************************************************************************/ +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#else +static DWORD WINAPI MyTrimFlowThreadFunction(LPVOID lpParam){ + trimFlowData* pDataArray; + pDataArray = (trimFlowData*)lpParam; + + try { + ofstream trimFlowFile; + pDataArray->m->openOutputFile(pDataArray->trimFlowFileName, trimFlowFile); + trimFlowFile.setf(ios::fixed, ios::floatfield); trimFlowFile.setf(ios::showpoint); + + ofstream scrapFlowFile; + pDataArray->m->openOutputFile(pDataArray->scrapFlowFileName, scrapFlowFile); + scrapFlowFile.setf(ios::fixed, ios::floatfield); scrapFlowFile.setf(ios::showpoint); + + ofstream fastaFile; + if(pDataArray->fasta){ pDataArray->m->openOutputFile(pDataArray->fastaFileName, fastaFile); } + + ifstream flowFile; + pDataArray->m->openInputFile(pDataArray->flowFileName, flowFile); + + flowFile.seekg(pDataArray->start); + + if(pDataArray->start == 0){ + flowFile >> pDataArray->numFlows; pDataArray->m->gobble(flowFile); + scrapFlowFile << pDataArray->maxFlows << endl; + trimFlowFile << pDataArray->maxFlows << endl; + if(pDataArray->allFiles){ + for(int i=0;ibarcodePrimerComboFileNames.size();i++){ + for(int j=0;jbarcodePrimerComboFileNames[0].size();j++){ + ofstream temp; + pDataArray->m->openOutputFile(pDataArray->barcodePrimerComboFileNames[i][j], temp); + temp << pDataArray->maxFlows << endl; + temp.close(); + } + } + } + } + + FlowData flowData(pDataArray->numFlows, pDataArray->signal, pDataArray->noise, pDataArray->maxHomoP, pDataArray->flowOrder); + cout << " thread flowdata address " << &flowData << '\t' << &flowFile << endl; + TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer); + + pDataArray->count = pDataArray->end; + cout << pDataArray->threadID << '\t' << pDataArray->count << endl; + int count = 0; + for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process + + if (pDataArray->m->control_pressed) { break; } + cout << pDataArray->threadID << '\t' << count << endl; + int success = 1; + int currentSeqDiffs = 0; + string trashCode = ""; + + flowData.getNext(flowFile); + cout << "thread good bit " << flowFile.good() << endl; + flowData.capFlows(pDataArray->maxFlows); + + Sequence currSeq = flowData.getSequence(); + if(!flowData.hasMinFlows(pDataArray->minFlows)){ //screen to see if sequence is of a minimum number of flows + success = 0; + trashCode += 'l'; + } + + int primerIndex = 0; + int barcodeIndex = 0; + + if(pDataArray->barcodes.size() != 0){ + success = trimOligos.stripBarcode(currSeq, barcodeIndex); + if(success > pDataArray->bdiffs) { trashCode += 'b'; } + else{ currentSeqDiffs += success; } + } + + if(pDataArray->primers.size() != 0){ + success = trimOligos.stripForward(currSeq, primerIndex); + if(success > pDataArray->pdiffs) { trashCode += 'f'; } + else{ currentSeqDiffs += success; } + } + + if (currentSeqDiffs > pDataArray->tdiffs) { trashCode += 't'; } + + if(pDataArray->revPrimer.size() != 0){ + success = trimOligos.stripReverse(currSeq); + if(!success) { trashCode += 'r'; } + } + + if(trashCode.length() == 0){ + + flowData.printFlows(trimFlowFile); + + if(pDataArray->fasta) { currSeq.printSequence(fastaFile); } + + if(pDataArray->allFiles){ + ofstream output; + pDataArray->m->openOutputFileAppend(pDataArray->barcodePrimerComboFileNames[barcodeIndex][primerIndex], output); + output.setf(ios::fixed, ios::floatfield); trimFlowFile.setf(ios::showpoint); + + flowData.printFlows(output); + output.close(); + } + } + else{ + flowData.printFlows(scrapFlowFile, trashCode); + } + + count++; + cout << pDataArray->threadID << '\t' << currSeq.getName() << endl; + //report progress + if((count) % 10000 == 0){ pDataArray->m->mothurOut(toString(count)); pDataArray->m->mothurOutEndLine(); } + + } + //report progress + if((count) % 10000 != 0){ pDataArray->m->mothurOut(toString(count)); pDataArray->m->mothurOutEndLine(); } + + trimFlowFile.close(); + scrapFlowFile.close(); + flowFile.close(); + if(pDataArray->fasta){ fastaFile.close(); } + + } + catch(exception& e) { + pDataArray->m->errorOut(e, "TrimFlowsCommand", "MyTrimFlowsThreadFunction"); + exit(1); + } +} +#endif + #endif diff --git a/trimoligos.cpp b/trimoligos.cpp new file mode 100644 index 0000000..f0b5a80 --- /dev/null +++ b/trimoligos.cpp @@ -0,0 +1,627 @@ +/* + * trimoligos.cpp + * Mothur + * + * Created by westcott on 9/1/11. + * Copyright 2011 Schloss Lab. All rights reserved. + * + */ + +#include "trimoligos.h" +#include "alignment.hpp" +#include "needlemanoverlap.hpp" + + +/********************************************************************/ +//strip, pdiffs, bdiffs, primers, barcodes, revPrimers +TrimOligos::TrimOligos(int p, int b, map pr, map br, vector r){ + try { + m = MothurOut::getInstance(); + + pdiffs = p; + bdiffs = b; + + barcodes = br; + primers = pr; + revPrimer = r; + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "TrimOligos"); + exit(1); + } +} +/********************************************************************/ +TrimOligos::~TrimOligos() {} +//*******************************************************************/ +int TrimOligos::stripBarcode(Sequence& seq, QualityScores& qual, int& group){ + try { + + string rawSequence = seq.getUnaligned(); + int success = bdiffs + 1; //guilty until proven innocent + + //can you find the barcode + for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ + string oligo = it->first; + if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } + + if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ + group = it->second; + seq.setUnaligned(rawSequence.substr(oligo.length())); + + if(qual.getName() != ""){ + qual.trimQScores(oligo.length(), -1); + } + + success = 0; + break; + } + } + + //if you found the barcode or if you don't want to allow for diffs + if ((bdiffs == 0) || (success == 0)) { return success; } + + else { //try aligning and see if you can find it + + int maxLength = 0; + + Alignment* alignment; + if (barcodes.size() > 0) { + map::iterator it=barcodes.begin(); + + for(it;it!=barcodes.end();it++){ + if(it->first.length() > maxLength){ + maxLength = it->first.length(); + } + } + alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1)); + + }else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minGroup = -1; + int minPos = 0; + + for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ + string oligo = it->first; + // int length = oligo.length(); + + if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + + int numDiff = countDiffs(oligo, temp); + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minGroup = it->second; + minPos = 0; + for(int i=0;i bdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes + else{ //use the best match + group = minGroup; + seq.setUnaligned(rawSequence.substr(minPos)); + + if(qual.getName() != ""){ + qual.trimQScores(minPos, -1); + } + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripBarcode"); + exit(1); + } + +} +//*******************************************************************/ +int TrimOligos::stripBarcode(Sequence& seq, int& group){ + try { + + string rawSequence = seq.getUnaligned(); + int success = bdiffs + 1; //guilty until proven innocent + + //can you find the barcode + for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ + string oligo = it->first; + if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } + + if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ + group = it->second; + seq.setUnaligned(rawSequence.substr(oligo.length())); + + success = 0; + break; + } + } + + //if you found the barcode or if you don't want to allow for diffs + if ((bdiffs == 0) || (success == 0)) { return success; } + + else { //try aligning and see if you can find it + + int maxLength = 0; + + Alignment* alignment; + if (barcodes.size() > 0) { + map::iterator it=barcodes.begin(); + + for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ + if(it->first.length() > maxLength){ + maxLength = it->first.length(); + } + } + alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1)); + + }else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minGroup = -1; + int minPos = 0; + + for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ + string oligo = it->first; + // int length = oligo.length(); + + if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + + int numDiff = countDiffs(oligo, temp); + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minGroup = it->second; + minPos = 0; + for(int i=0;i bdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes + else{ //use the best match + group = minGroup; + seq.setUnaligned(rawSequence.substr(minPos)); + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripBarcode"); + exit(1); + } + +} +//********************************************************************/ +int TrimOligos::stripForward(Sequence& seq, int& group){ + try { + + string rawSequence = seq.getUnaligned(); + int success = pdiffs + 1; //guilty until proven innocent + + //can you find the primer + for(map::iterator it=primers.begin();it!=primers.end();it++){ + string oligo = it->first; + if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length + success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } + + if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ + group = it->second; + seq.setUnaligned(rawSequence.substr(oligo.length())); + success = 0; + break; + } + } + + //if you found the barcode or if you don't want to allow for diffs + if ((pdiffs == 0) || (success == 0)) { return success; } + + else { //try aligning and see if you can find it + + int maxLength = 0; + + Alignment* alignment; + if (primers.size() > 0) { + map::iterator it=primers.begin(); + + for(it;it!=primers.end();it++){ + if(it->first.length() > maxLength){ + maxLength = it->first.length(); + } + } + alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1)); + + }else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minGroup = -1; + int minPos = 0; + + for(map::iterator it=primers.begin();it!=primers.end();it++){ + string oligo = it->first; + // int length = oligo.length(); + + if(rawSequence.length() < maxLength){ + success = pdiffs + 100; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + + int numDiff = countDiffs(oligo, temp); + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minGroup = it->second; + minPos = 0; + for(int i=0;i pdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers + else{ //use the best match + group = minGroup; + seq.setUnaligned(rawSequence.substr(minPos)); + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripForward"); + exit(1); + } +} +//*******************************************************************/ +int TrimOligos::stripForward(Sequence& seq, QualityScores& qual, int& group){ + try { + string rawSequence = seq.getUnaligned(); + int success = pdiffs + 1; //guilty until proven innocent + + //can you find the primer + for(map::iterator it=primers.begin();it!=primers.end();it++){ + string oligo = it->first; + if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length + success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } + + if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ + group = it->second; + seq.setUnaligned(rawSequence.substr(oligo.length())); + if(qual.getName() != ""){ + qual.trimQScores(oligo.length(), -1); + } + success = 0; + break; + } + } + + //if you found the barcode or if you don't want to allow for diffs + if ((pdiffs == 0) || (success == 0)) { return success; } + + else { //try aligning and see if you can find it + + int maxLength = 0; + + Alignment* alignment; + if (primers.size() > 0) { + map::iterator it=primers.begin(); + + for(it;it!=primers.end();it++){ + if(it->first.length() > maxLength){ + maxLength = it->first.length(); + } + } + alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1)); + + }else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minGroup = -1; + int minPos = 0; + + for(map::iterator it=primers.begin();it!=primers.end();it++){ + string oligo = it->first; + // int length = oligo.length(); + + if(rawSequence.length() < maxLength){ + success = pdiffs + 100; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + + int numDiff = countDiffs(oligo, temp); + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minGroup = it->second; + minPos = 0; + for(int i=0;i pdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers + else{ //use the best match + group = minGroup; + seq.setUnaligned(rawSequence.substr(minPos)); + if(qual.getName() != ""){ + qual.trimQScores(minPos, -1); + } + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripForward"); + exit(1); + } +} +//******************************************************************/ +bool TrimOligos::stripReverse(Sequence& seq, QualityScores& qual){ + try { + string rawSequence = seq.getUnaligned(); + bool success = 0; //guilty until proven innocent + + for(int i=0;ierrorOut(e, "TrimOligos", "stripReverse"); + exit(1); + } +} +//******************************************************************/ +bool TrimOligos::stripReverse(Sequence& seq){ + try { + + string rawSequence = seq.getUnaligned(); + bool success = 0; //guilty until proven innocent + + for(int i=0;ierrorOut(e, "TrimOligos", "stripReverse"); + exit(1); + } +} + +//******************************************************************/ +bool TrimOligos::compareDNASeq(string oligo, string seq){ + try { + bool success = 1; + int length = oligo.length(); + + for(int i=0;ierrorOut(e, "TrimOligos", "compareDNASeq"); + exit(1); + } + +} +//********************************************************************/ +int TrimOligos::countDiffs(string oligo, string seq){ + try { + + int length = oligo.length(); + int countDiffs = 0; + + for(int i=0;ierrorOut(e, "TrimOligos", "countDiffs"); + exit(1); + } +} +/********************************************************************/ + + + diff --git a/trimoligos.h b/trimoligos.h new file mode 100644 index 0000000..8830dff --- /dev/null +++ b/trimoligos.h @@ -0,0 +1,49 @@ +#ifndef TRIMOLIGOS_H +#define TRIMOLIGOS_H + +/* + * trimoligos.h + * Mothur + * + * Created by westcott on 9/1/11. + * Copyright 2011 Schloss Lab. All rights reserved. + * + */ + +#include "mothur.h" +#include "mothurout.h" +#include "sequence.hpp" +#include "qualityscores.h" + + +class TrimOligos { + + public: + TrimOligos(int,int, map, map, vector); //pdiffs, bdiffs, primers, barcodes, revPrimers + ~TrimOligos(); + + int stripBarcode(Sequence&, int&); + int stripBarcode(Sequence&, QualityScores&, int&); + + int stripForward(Sequence&, int&); + int stripForward(Sequence&, QualityScores&, int&); + + bool stripReverse(Sequence&); + bool stripReverse(Sequence&, QualityScores&); + + + private: + int pdiffs, bdiffs; + + map barcodes; + map primers; + vector revPrimer; + + MothurOut* m; + + bool compareDNASeq(string, string); + int countDiffs(string, string); +}; + +#endif + diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index a8c3fc6..1cbb91a 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -9,6 +9,7 @@ #include "trimseqscommand.h" #include "needlemanoverlap.hpp" +#include "trimoligos.h" //********************************************************************************************************************** vector TrimSeqsCommand::setParameters(){ @@ -351,8 +352,8 @@ int TrimSeqsCommand::execute(){ } } - vector fastaFilePos; - vector qFilePos; + vector fastaFilePos; + vector qFilePos; setLines(fastaFile, qFileName, fastaFilePos, qFilePos); @@ -539,6 +540,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string int count = 0; bool moreSeqs = 1; + TrimOligos trimOligos(pdiffs, bdiffs, primers, barcodes, revPrimer); while (moreSeqs) { @@ -572,13 +574,13 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string int primerIndex = 0; if(barcodes.size() != 0){ - success = stripBarcode(currSeq, currQual, barcodeIndex); + success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex); if(success > bdiffs) { trashCode += 'b'; } else{ currentSeqsDiffs += success; } } if(numFPrimers != 0){ - success = stripForward(currSeq, currQual, primerIndex); + success = trimOligos.stripForward(currSeq, currQual, primerIndex); if(success > pdiffs) { trashCode += 'f'; } else{ currentSeqsDiffs += success; } } @@ -586,7 +588,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if (currentSeqsDiffs > tdiffs) { trashCode += 't'; } if(numRPrimers != 0){ - success = stripReverse(currSeq, currQual); + success = trimOligos.stripReverse(currSeq, currQual); if(!success) { trashCode += 'r'; } } @@ -722,7 +724,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string } #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - unsigned long int pos = inFASTA.tellg(); + unsigned long long pos = inFASTA.tellg(); if ((pos == -1) || (pos >= line->end)) { break; } #else @@ -938,7 +940,7 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName /**************************************************************************************************/ -int TrimSeqsCommand::setLines(string filename, string qfilename, vector& fastaFilePos, vector& qfileFilePos) { +int TrimSeqsCommand::setLines(string filename, string qfilename, vector& fastaFilePos, vector& qfileFilePos) { try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) //set file positions for fasta file @@ -977,7 +979,7 @@ int TrimSeqsCommand::setLines(string filename, string qfilename, vector::iterator it = firstSeqNames.find(sname); if(it != firstSeqNames.end()) { //this is the start of a new chunk - unsigned long int pos = inQual.tellg(); + unsigned long long pos = inQual.tellg(); qfileFilePos.push_back(pos - input.length() - 1); firstSeqNames.erase(it); } @@ -999,7 +1001,7 @@ int TrimSeqsCommand::setLines(string filename, string qfilename, vector >& fastaFileNames, vector< exit(1); } } - -//*************************************************************************************************************** - -int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){ - try { - - string rawSequence = seq.getUnaligned(); - int success = bdiffs + 1; //guilty until proven innocent - - //can you find the barcode - for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ - string oligo = it->first; - if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length - success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out - break; - } - - if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ - group = it->second; - seq.setUnaligned(rawSequence.substr(oligo.length())); - - if(qual.getName() != ""){ - qual.trimQScores(oligo.length(), -1); - } - - success = 0; - break; - } - } - - //if you found the barcode or if you don't want to allow for diffs - if ((bdiffs == 0) || (success == 0)) { return success; } - - else { //try aligning and see if you can find it - - int maxLength = 0; - - Alignment* alignment; - if (barcodes.size() > 0) { - map::iterator it=barcodes.begin(); - - for(it;it!=barcodes.end();it++){ - if(it->first.length() > maxLength){ - maxLength = it->first.length(); - } - } - alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1)); - - }else{ alignment = NULL; } - - //can you find the barcode - int minDiff = 1e6; - int minCount = 1; - int minGroup = -1; - int minPos = 0; - - for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ - string oligo = it->first; -// int length = oligo.length(); - - if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length - success = bdiffs + 10; - break; - } - - //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs)); - oligo = alignment->getSeqAAln(); - string temp = alignment->getSeqBAln(); - - int alnLength = oligo.length(); - - for(int i=oligo.length()-1;i>=0;i--){ - if(oligo[i] != '-'){ alnLength = i+1; break; } - } - oligo = oligo.substr(0,alnLength); - temp = temp.substr(0,alnLength); - - int numDiff = countDiffs(oligo, temp); - - if(numDiff < minDiff){ - minDiff = numDiff; - minCount = 1; - minGroup = it->second; - minPos = 0; - for(int i=0;i bdiffs) { success = minDiff; } //no good matches - else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes - else{ //use the best match - group = minGroup; - seq.setUnaligned(rawSequence.substr(minPos)); - - if(qual.getName() != ""){ - qual.trimQScores(minPos, -1); - } - success = minDiff; - } - - if (alignment != NULL) { delete alignment; } - - } - - return success; - - } - catch(exception& e) { - m->errorOut(e, "TrimSeqsCommand", "stripBarcode"); - exit(1); - } - -} - -//*************************************************************************************************************** - -int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){ - try { - string rawSequence = seq.getUnaligned(); - int success = pdiffs + 1; //guilty until proven innocent - - //can you find the primer - for(map::iterator it=primers.begin();it!=primers.end();it++){ - string oligo = it->first; - if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length - success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out - break; - } - - if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ - group = it->second; - seq.setUnaligned(rawSequence.substr(oligo.length())); - if(qual.getName() != ""){ - qual.trimQScores(oligo.length(), -1); - } - success = 0; - break; - } - } - - //if you found the barcode or if you don't want to allow for diffs - if ((pdiffs == 0) || (success == 0)) { return success; } - - else { //try aligning and see if you can find it - - int maxLength = 0; - - Alignment* alignment; - if (primers.size() > 0) { - map::iterator it=primers.begin(); - - for(it;it!=primers.end();it++){ - if(it->first.length() > maxLength){ - maxLength = it->first.length(); - } - } - alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1)); - - }else{ alignment = NULL; } - - //can you find the barcode - int minDiff = 1e6; - int minCount = 1; - int minGroup = -1; - int minPos = 0; - - for(map::iterator it=primers.begin();it!=primers.end();it++){ - string oligo = it->first; -// int length = oligo.length(); - - if(rawSequence.length() < maxLength){ - success = pdiffs + 100; - break; - } - - //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs)); - oligo = alignment->getSeqAAln(); - string temp = alignment->getSeqBAln(); - - int alnLength = oligo.length(); - - for(int i=oligo.length()-1;i>=0;i--){ - if(oligo[i] != '-'){ alnLength = i+1; break; } - } - oligo = oligo.substr(0,alnLength); - temp = temp.substr(0,alnLength); - - int numDiff = countDiffs(oligo, temp); - - if(numDiff < minDiff){ - minDiff = numDiff; - minCount = 1; - minGroup = it->second; - minPos = 0; - for(int i=0;i pdiffs) { success = minDiff; } //no good matches - else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers - else{ //use the best match - group = minGroup; - seq.setUnaligned(rawSequence.substr(minPos)); - if(qual.getName() != ""){ - qual.trimQScores(minPos, -1); - } - success = minDiff; - } - - if (alignment != NULL) { delete alignment; } - - } - - return success; - - } - catch(exception& e) { - m->errorOut(e, "TrimSeqsCommand", "stripForward"); - exit(1); - } -} - -//*************************************************************************************************************** - -bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){ - try { - string rawSequence = seq.getUnaligned(); - bool success = 0; //guilty until proven innocent - - for(int i=0;ierrorOut(e, "TrimSeqsCommand", "stripReverse"); - exit(1); - } -} - //*************************************************************************************************************** bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){ @@ -1618,80 +1347,4 @@ bool TrimSeqsCommand::cullAmbigs(Sequence& seq){ } } - -//*************************************************************************************************************** - -bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){ - try { - bool success = 1; - int length = oligo.length(); - - for(int i=0;ierrorOut(e, "TrimSeqsCommand", "compareDNASeq"); - exit(1); - } - -} - -//*************************************************************************************************************** - -int TrimSeqsCommand::countDiffs(string oligo, string seq){ - try { - - int length = oligo.length(); - int countDiffs = 0; - - for(int i=0;ierrorOut(e, "TrimSeqsCommand", "countDiffs"); - exit(1); - } - -} - //*************************************************************************************************************** diff --git a/trimseqscommand.h b/trimseqscommand.h index 13d1a41..137cb73 100644 --- a/trimseqscommand.h +++ b/trimseqscommand.h @@ -37,24 +37,17 @@ private: GroupMap* groupMap; struct linePair { - unsigned long int start; - unsigned long int end; - linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} + unsigned long long start; + unsigned long long end; + linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {} }; bool getOligos(vector >&, vector >&, vector >&); - int stripBarcode(Sequence&, QualityScores&, int&); - int stripForward(Sequence&, QualityScores&, int&); - bool stripReverse(Sequence&, QualityScores&); - bool keepFirstTrim(Sequence&, QualityScores&); bool removeLastTrim(Sequence&, QualityScores&); - bool cullLength(Sequence&); bool cullHomoP(Sequence&); bool cullAmbigs(Sequence&); - bool compareDNASeq(string, string); - int countDiffs(string, string); bool abort, createGroup; string fastaFile, oligoFile, qFileName, groupfile, nameFile, outputDir; @@ -81,7 +74,7 @@ private: int driverCreateTrim(string, string, string, string, string, string, string, string, string, vector >, vector >, vector >, linePair*, linePair*); int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, vector >, vector >, vector >); - int setLines(string, string, vector&, vector&); + int setLines(string, string, vector&, vector&); }; #endif