From: westcott Date: Thu, 19 Aug 2010 16:28:18 +0000 (+0000) Subject: changed how we break up the files on parallelized commands to avoid scanning file. X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=284fd95c611ccc3b1a7875c4dacfca06d1f50ed6 changed how we break up the files on parallelized commands to avoid scanning file. --- diff --git a/aligncommand.cpp b/aligncommand.cpp index 4641ed7..3f70e2d 100644 --- a/aligncommand.cpp +++ b/aligncommand.cpp @@ -292,7 +292,6 @@ int AlignCommand::execute(){ int startIndex = pid * numSeqsPerProcessor; if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; } - //align your part driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIAlign, outMPIReport, outMPIAccnos, MPIPos); @@ -347,24 +346,16 @@ int AlignCommand::execute(){ } #else - + vector positions = divideFile(candidateFileNames[s], processors); + + for (int i = 0; i < (positions.size()-1); i++) { + lines.push_back(new linePair(positions[i], positions[(i+1)])); + } #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) if(processors == 1){ - ifstream inFASTA; - openInputFile(candidateFileNames[s], inFASTA); - getNumSeqs(inFASTA, numFastaSeqs); - inFASTA.close(); - - lines.push_back(new linePair(0, numFastaSeqs)); + numFastaSeqs = driver(lines[0], alignFileName, reportFileName, accnosFileName, candidateFileNames[s]); - driver(lines[0], alignFileName, reportFileName, accnosFileName, candidateFileNames[s]); - - if (m->control_pressed) { - remove(accnosFileName.c_str()); - remove(alignFileName.c_str()); - remove(reportFileName.c_str()); - return 0; - } + if (m->control_pressed) { remove(accnosFileName.c_str()); remove(alignFileName.c_str()); remove(reportFileName.c_str()); return 0; } //delete accnos file if its blank else report to user if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); hasAccnos = false; } @@ -375,36 +366,10 @@ int AlignCommand::execute(){ }else{ m->mothurOut(" If the reverse compliment proved to be better it was reported."); } m->mothurOutEndLine(); } - } - else{ - vector positions; + }else{ processIDS.resize(0); - ifstream inFASTA; - openInputFile(candidateFileNames[s], inFASTA); - - string input; - while(!inFASTA.eof()){ - input = getline(inFASTA); - if (input.length() != 0) { - if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } - } - } - inFASTA.close(); - - numFastaSeqs = positions.size(); - - int numSeqsPerProcessor = numFastaSeqs / processors; - - for (int i = 0; i < processors; i++) { - unsigned long int startPos = positions[ i * numSeqsPerProcessor ]; - if(i == processors - 1){ - numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; - } - lines.push_back(new linePair(startPos, numSeqsPerProcessor)); - } - - createProcesses(alignFileName, reportFileName, accnosFileName, candidateFileNames[s]); + numFastaSeqs = createProcesses(alignFileName, reportFileName, accnosFileName, candidateFileNames[s]); rename((alignFileName + toString(processIDS[0]) + ".temp").c_str(), alignFileName.c_str()); rename((reportFileName + toString(processIDS[0]) + ".temp").c_str(), reportFileName.c_str()); @@ -441,29 +406,12 @@ int AlignCommand::execute(){ m->mothurOutEndLine(); }else{ hasAccnos = false; } - if (m->control_pressed) { - remove(accnosFileName.c_str()); - remove(alignFileName.c_str()); - remove(reportFileName.c_str()); - return 0; - } + if (m->control_pressed) { remove(accnosFileName.c_str()); remove(alignFileName.c_str()); remove(reportFileName.c_str()); return 0; } } #else - ifstream inFASTA; - openInputFile(candidateFileNames[s], inFASTA); - getNumSeqs(inFASTA, numFastaSeqs); - inFASTA.close(); - - lines.push_back(new linePair(0, numFastaSeqs)); - - driver(lines[0], alignFileName, reportFileName, accnosFileName, candidateFileNames[s]); + numFastaSeqs = driver(lines[0], alignFileName, reportFileName, accnosFileName, candidateFileNames[s]); - if (m->control_pressed) { - remove(accnosFileName.c_str()); - remove(alignFileName.c_str()); - remove(reportFileName.c_str()); - return 0; - } + if (m->control_pressed) { remove(accnosFileName.c_str()); remove(alignFileName.c_str()); remove(reportFileName.c_str()); return 0; } //delete accnos file if its blank else report to user if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); hasAccnos = false; } @@ -515,7 +463,7 @@ int AlignCommand::execute(){ //********************************************************************************************************************** -int AlignCommand::driver(linePair* line, string alignFName, string reportFName, string accnosFName, string filename){ +int AlignCommand::driver(linePair* filePos, string alignFName, string reportFName, string accnosFName, string filename){ try { ofstream alignmentFile; openOutputFile(alignFName, alignmentFile); @@ -528,9 +476,12 @@ int AlignCommand::driver(linePair* line, string alignFName, string reportFName, ifstream inFASTA; openInputFile(filename, inFASTA); - inFASTA.seekg(line->start); + inFASTA.seekg(filePos->start); + + bool done = false; + int count = 0; - for(int i=0;inumSeqs;i++){ + while (!done) { if (m->control_pressed) { return 0; } @@ -607,20 +558,26 @@ int AlignCommand::driver(linePair* line, string alignFName, string reportFName, report.print(); delete nast; if (needToDeleteCopy) { delete copy; } + + count++; } delete candidateSeq; + unsigned long int pos = inFASTA.tellg(); + if ((pos == -1) || (pos >= filePos->end)) { break; } + //report progress - if((i+1) % 100 == 0){ m->mothurOut(toString(i+1)); m->mothurOutEndLine(); } + if((count) % 100 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } + } //report progress - if((line->numSeqs) % 100 != 0){ m->mothurOut(toString(line->numSeqs)); m->mothurOutEndLine(); } + if((count) % 100 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } alignmentFile.close(); inFASTA.close(); accnosFile.close(); - return 1; + return count; } catch(exception& e) { m->errorOut(e, "AlignCommand", "driver"); @@ -796,7 +753,7 @@ int AlignCommand::createProcesses(string alignFileName, string reportFileName, s try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) int process = 0; - int exitCommand = 1; + int num = 0; // processIDS.resize(0); //loop through and create all the processes you want @@ -807,7 +764,15 @@ int AlignCommand::createProcesses(string alignFileName, string reportFileName, s processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - exitCommand = driver(lines[process], alignFileName + toString(getpid()) + ".temp", reportFileName + toString(getpid()) + ".temp", accnosFName + toString(getpid()) + ".temp", filename); + num = driver(lines[process], alignFileName + toString(getpid()) + ".temp", reportFileName + toString(getpid()) + ".temp", accnosFName + toString(getpid()) + ".temp", filename); + + //pass numSeqs to parent + ofstream out; + string tempFile = toString(getpid()) + ".temp"; + openOutputFile(tempFile, out); + out << num << endl; + out.close(); + exit(0); }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); } } @@ -818,7 +783,15 @@ int AlignCommand::createProcesses(string alignFileName, string reportFileName, s wait(&temp); } - return exitCommand; + for (int i = 0; i < processIDS.size(); i++) { + ifstream in; + string tempFile = toString(processIDS[i]) + ".temp"; + openInputFile(tempFile, in); + if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; } + in.close(); remove(tempFile.c_str()); + } + + return num; #endif } catch(exception& e) { @@ -826,7 +799,6 @@ int AlignCommand::createProcesses(string alignFileName, string reportFileName, s exit(1); } } - /**************************************************************************************************/ void AlignCommand::appendAlignFiles(string temp, string filename) { diff --git a/aligncommand.h b/aligncommand.h index fb47874..b58c3f2 100644 --- a/aligncommand.h +++ b/aligncommand.h @@ -27,8 +27,8 @@ public: private: struct linePair { unsigned long int start; - int numSeqs; - linePair(unsigned long int i, int j) : start(i), numSeqs(j) {} + unsigned long int end; + linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} }; vector processIDS; //processid vector lines; @@ -43,7 +43,7 @@ private: void appendReportFiles(string, string); #ifdef USE_MPI - int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector&); + int driverMPI(MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector&); #endif string candidateFileName, templateFileName, distanceFileName, search, align, outputDir; diff --git a/chimeraccodecommand.cpp b/chimeraccodecommand.cpp index 4c8848d..8ea91e5 100644 --- a/chimeraccodecommand.cpp +++ b/chimeraccodecommand.cpp @@ -300,58 +300,24 @@ int ChimeraCcodeCommand::execute(){ outHeader.close(); + vector positions = divideFile(fastaFileNames[s], processors); + + for (int i = 0; i < (positions.size()-1); i++) { + lines.push_back(new linePair(positions[i], positions[(i+1)])); + } + //break up file #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) if(processors == 1){ - ifstream inFASTA; - openInputFile(fastaFileNames[s], inFASTA); - getNumSeqs(inFASTA, numSeqs); - inFASTA.close(); - - lines.push_back(new linePair(0, numSeqs)); + + numSeqs = driver(lines[0], outputFileName, fastaFileNames[s], accnosFileName); - driver(lines[0], outputFileName, fastaFileNames[s], accnosFileName); - - if (m->control_pressed) { - remove(outputFileName.c_str()); - remove(tempHeader.c_str()); - remove(accnosFileName.c_str()); - for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } - for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); - delete chimera; - return 0; - } + if (m->control_pressed) { remove(outputFileName.c_str()); remove(tempHeader.c_str()); remove(accnosFileName.c_str()); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); delete chimera; return 0; } }else{ - vector positions; processIDS.resize(0); - ifstream inFASTA; - openInputFile(fastaFileNames[s], inFASTA); - - string input; - while(!inFASTA.eof()){ - input = getline(inFASTA); - if (input.length() != 0) { - if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } - } - } - inFASTA.close(); - - numSeqs = positions.size(); - - int numSeqsPerProcessor = numSeqs / processors; - - for (int i = 0; i < processors; i++) { - unsigned long int startPos = positions[ i * numSeqsPerProcessor ]; - if(i == processors - 1){ - numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor; - } - lines.push_back(new linePair(startPos, numSeqsPerProcessor)); - } - - - createProcesses(outputFileName, fastaFileNames[s], accnosFileName); + numSeqs = createProcesses(outputFileName, fastaFileNames[s], accnosFileName); rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str()); rename((accnosFileName + toString(processIDS[0]) + ".temp").c_str(), accnosFileName.c_str()); @@ -380,23 +346,9 @@ int ChimeraCcodeCommand::execute(){ } #else - ifstream inFASTA; - openInputFile(fastaFileNames[s], inFASTA); - getNumSeqs(inFASTA, numSeqs); - inFASTA.close(); - lines.push_back(new linePair(0, numSeqs)); - - driver(lines[0], outputFileName, fastaFileNames[s], accnosFileName); + numSeqs = driver(lines[0], outputFileName, fastaFileNames[s], accnosFileName); - if (m->control_pressed) { - remove(outputFileName.c_str()); - remove(tempHeader.c_str()); - remove(accnosFileName.c_str()); - for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } - for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); - delete chimera; - return 0; - } + if (m->control_pressed) { remove(outputFileName.c_str()); remove(tempHeader.c_str()); remove(accnosFileName.c_str()); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); delete chimera; return 0; } #endif @@ -432,7 +384,7 @@ int ChimeraCcodeCommand::execute(){ } //********************************************************************************************************************** -int ChimeraCcodeCommand::driver(linePair* line, string outputFName, string filename, string accnos){ +int ChimeraCcodeCommand::driver(linePair* filePos, string outputFName, string filename, string accnos){ try { ofstream out; openOutputFile(outputFName, out); @@ -443,9 +395,12 @@ int ChimeraCcodeCommand::driver(linePair* line, string outputFName, string filen ifstream inFASTA; openInputFile(filename, inFASTA); - inFASTA.seekg(line->start); - - for(int i=0;inumSeqs;i++){ + inFASTA.seekg(filePos->start); + + bool done = false; + int count = 0; + + while (!done) { if (m->control_pressed) { return 1; } @@ -464,20 +419,24 @@ int ChimeraCcodeCommand::driver(linePair* line, string outputFName, string filen //print results chimera->print(out, out2); } + count++; } delete candidateSeq; + unsigned long int pos = inFASTA.tellg(); + if ((pos == -1) || (pos >= filePos->end)) { break; } + //report progress - if((i+1) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(i+1)); m->mothurOutEndLine(); } + if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } } //report progress - if((line->numSeqs) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(line->numSeqs)); m->mothurOutEndLine(); } + if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } out.close(); out2.close(); inFASTA.close(); - return 0; + return count; } catch(exception& e) { m->errorOut(e, "ChimeraCcodeCommand", "driver"); @@ -549,7 +508,7 @@ int ChimeraCcodeCommand::createProcesses(string outputFileName, string filename, try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) int process = 0; - // processIDS.resize(0); + int num = 0; //loop through and create all the processes you want while (process != processors) { @@ -559,7 +518,15 @@ int ChimeraCcodeCommand::createProcesses(string outputFileName, string filename, processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename, accnos + toString(getpid()) + ".temp"); + num = driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename, accnos + toString(getpid()) + ".temp"); + + //pass numSeqs to parent + ofstream out; + string tempFile = toString(getpid()) + ".temp"; + openOutputFile(tempFile, out); + out << num << endl; + out.close(); + exit(0); }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); } } @@ -570,7 +537,15 @@ int ChimeraCcodeCommand::createProcesses(string outputFileName, string filename, wait(&temp); } - return 0; + for (int i = 0; i < processIDS.size(); i++) { + ifstream in; + string tempFile = toString(processIDS[i]) + ".temp"; + openInputFile(tempFile, in); + if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; } + in.close(); remove(tempFile.c_str()); + } + + return num; #endif } catch(exception& e) { diff --git a/chimeraccodecommand.h b/chimeraccodecommand.h index 5989eb2..d980288 100644 --- a/chimeraccodecommand.h +++ b/chimeraccodecommand.h @@ -26,11 +26,10 @@ public: private: - struct linePair { unsigned long int start; - int numSeqs; - linePair(unsigned long int i, int j) : start(i), numSeqs(j) {} + unsigned long int end; + linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} }; vector processIDS; //processid vector lines; diff --git a/chimeracheckcommand.cpp b/chimeracheckcommand.cpp index 6cea5d5..0ec5f4b 100644 --- a/chimeracheckcommand.cpp +++ b/chimeracheckcommand.cpp @@ -299,55 +299,23 @@ int ChimeraCheckCommand::execute(){ MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case #else + vector positions = divideFile(fastaFileNames[i], processors); + + for (int s = 0; s < (positions.size()-1); s++) { + lines.push_back(new linePair(positions[s], positions[(s+1)])); + } + //break up file #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) if(processors == 1){ - ifstream inFASTA; - openInputFile(fastaFileNames[i], inFASTA); - getNumSeqs(inFASTA, numSeqs); - inFASTA.close(); - - lines.push_back(new linePair(0, numSeqs)); + numSeqs = driver(lines[0], outputFileName, fastaFileNames[i]); - driver(lines[0], outputFileName, fastaFileNames[i]); - - if (m->control_pressed) { - for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } - for (int j = 0; j < lines.size(); j++) { delete lines[j]; } lines.clear(); - delete chimera; - return 0; - } + if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } for (int j = 0; j < lines.size(); j++) { delete lines[j]; } lines.clear(); delete chimera; return 0; } }else{ - vector positions; processIDS.resize(0); - ifstream inFASTA; - openInputFile(fastaFileNames[i], inFASTA); - - string input; - while(!inFASTA.eof()){ - input = getline(inFASTA); - if (input.length() != 0) { - if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } - } - } - inFASTA.close(); - - numSeqs = positions.size(); - - int numSeqsPerProcessor = numSeqs / processors; - - for (int j = 0; j < processors; j++) { - unsigned long int startPos = positions[ j * numSeqsPerProcessor ]; - if(j == processors - 1){ - numSeqsPerProcessor = numSeqs - j * numSeqsPerProcessor; - } - lines.push_back(new linePair(startPos, numSeqsPerProcessor)); - } - - - createProcesses(outputFileName, fastaFileNames[i]); + numSeqs = createProcesses(outputFileName, fastaFileNames[i]); rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str()); @@ -366,20 +334,9 @@ int ChimeraCheckCommand::execute(){ } #else - ifstream inFASTA; - openInputFile(fastaFileNames[i], inFASTA); - getNumSeqs(inFASTA, numSeqs); - inFASTA.close(); - lines.push_back(new linePair(0, numSeqs)); - - driver(lines[0], outputFileName, fastaFileNames[i]); + numSeqs = driver(lines[0], outputFileName, fastaFileNames[i]); - if (m->control_pressed) { - for (int j = 0; j < lines.size(); j++) { delete lines[j]; } lines.clear(); - for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } - delete chimera; - return 0; - } + if (m->control_pressed) { for (int j = 0; j < lines.size(); j++) { delete lines[j]; } lines.clear(); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } delete chimera; return 0; } #endif #endif delete chimera; @@ -405,7 +362,7 @@ int ChimeraCheckCommand::execute(){ } //********************************************************************************************************************** -int ChimeraCheckCommand::driver(linePair* line, string outputFName, string filename){ +int ChimeraCheckCommand::driver(linePair* filePos, string outputFName, string filename){ try { ofstream out; openOutputFile(outputFName, out); @@ -415,10 +372,13 @@ int ChimeraCheckCommand::driver(linePair* line, string outputFName, string filen ifstream inFASTA; openInputFile(filename, inFASTA); - inFASTA.seekg(line->start); - - for(int i=0;inumSeqs;i++){ - + inFASTA.seekg(filePos->start); + + bool done = false; + int count = 0; + + while (!done) { + if (m->control_pressed) { return 1; } Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA); @@ -434,16 +394,19 @@ int ChimeraCheckCommand::driver(linePair* line, string outputFName, string filen } delete candidateSeq; + unsigned long int pos = inFASTA.tellg(); + if ((pos == -1) || (pos >= filePos->end)) { break; } + //report progress - if((i+1) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(i+1)); m->mothurOutEndLine(); } + if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } } //report progress - if((line->numSeqs) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(line->numSeqs)); m->mothurOutEndLine(); } + if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } out.close(); inFASTA.close(); - return 0; + return count; } catch(exception& e) { m->errorOut(e, "ChimeraCheckCommand", "driver"); @@ -506,7 +469,7 @@ int ChimeraCheckCommand::createProcesses(string outputFileName, string filename) try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) int process = 0; - // processIDS.resize(0); + int num = 0; //loop through and create all the processes you want while (process != processors) { @@ -516,7 +479,15 @@ int ChimeraCheckCommand::createProcesses(string outputFileName, string filename) processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename); + num = driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename); + + //pass numSeqs to parent + ofstream out; + string tempFile = toString(getpid()) + ".temp"; + openOutputFile(tempFile, out); + out << num << endl; + out.close(); + exit(0); }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); } } @@ -527,7 +498,15 @@ int ChimeraCheckCommand::createProcesses(string outputFileName, string filename) wait(&temp); } - return 0; + for (int i = 0; i < processIDS.size(); i++) { + ifstream in; + string tempFile = toString(processIDS[i]) + ".temp"; + openInputFile(tempFile, in); + if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; } + in.close(); remove(tempFile.c_str()); + } + + return num; #endif } catch(exception& e) { diff --git a/chimeracheckcommand.h b/chimeracheckcommand.h index 84c75ac..71e1f0c 100644 --- a/chimeracheckcommand.h +++ b/chimeracheckcommand.h @@ -30,9 +30,10 @@ private: struct linePair { unsigned long int start; - int numSeqs; - linePair(unsigned long int i, int j) : start(i), numSeqs(j) {} + unsigned long int end; + linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} }; + vector processIDS; //processid vector lines; diff --git a/chimerapintailcommand.cpp b/chimerapintailcommand.cpp index fd8724d..d9368cc 100644 --- a/chimerapintailcommand.cpp +++ b/chimerapintailcommand.cpp @@ -337,57 +337,24 @@ int ChimeraPintailCommand::execute(){ MPI_File_close(&outMPIAccnos); MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case #else - + vector positions = divideFile(fastaFileNames[s], processors); + + for (int i = 0; i < (positions.size()-1); i++) { + lines.push_back(new linePair(positions[i], positions[(i+1)])); + } + //break up file #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) if(processors == 1){ - ifstream inFASTA; - openInputFile(fastaFileNames[s], inFASTA); - getNumSeqs(inFASTA, numSeqs); - inFASTA.close(); - - lines.push_back(new linePair(0, numSeqs)); - - driver(lines[0], outputFileName, fastaFileNames[s], accnosFileName); + + numSeqs = driver(lines[0], outputFileName, fastaFileNames[s], accnosFileName); - if (m->control_pressed) { - remove(outputFileName.c_str()); - remove(accnosFileName.c_str()); - for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } - for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); - delete chimera; - return 0; - } + if (m->control_pressed) { remove(outputFileName.c_str()); remove(accnosFileName.c_str()); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); delete chimera; return 0; } }else{ - vector positions; processIDS.resize(0); - ifstream inFASTA; - openInputFile(fastaFileNames[s], inFASTA); - - string input; - while(!inFASTA.eof()){ - input = getline(inFASTA); - if (input.length() != 0) { - if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } - } - } - inFASTA.close(); - - numSeqs = positions.size(); - - int numSeqsPerProcessor = numSeqs / processors; - - for (int i = 0; i < processors; i++) { - unsigned long int startPos = positions[ i * numSeqsPerProcessor ]; - if(i == processors - 1){ - numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor; - } - lines.push_back(new linePair(startPos, numSeqsPerProcessor)); - } - - createProcesses(outputFileName, fastaFileNames[s], accnosFileName); + numSeqs = createProcesses(outputFileName, fastaFileNames[s], accnosFileName); rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str()); rename((accnosFileName + toString(processIDS[0]) + ".temp").c_str(), accnosFileName.c_str()); @@ -415,22 +382,9 @@ int ChimeraPintailCommand::execute(){ } #else - ifstream inFASTA; - openInputFile(fastaFileNames[s], inFASTA); - getNumSeqs(inFASTA, numSeqs); - inFASTA.close(); - lines.push_back(new linePair(0, numSeqs)); - - driver(lines[0], outputFileName, fastaFileNames[s], accnosFileName); + numSeqs = driver(lines[0], outputFileName, fastaFileNames[s], accnosFileName); - if (m->control_pressed) { - remove(outputFileName.c_str()); - remove(accnosFileName.c_str()); - for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } - for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); - delete chimera; - return 0; - } + if (m->control_pressed) { remove(outputFileName.c_str()); remove(accnosFileName.c_str()); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); delete chimera; return 0; } #endif #endif @@ -460,7 +414,7 @@ int ChimeraPintailCommand::execute(){ } //********************************************************************************************************************** -int ChimeraPintailCommand::driver(linePair* line, string outputFName, string filename, string accnos){ +int ChimeraPintailCommand::driver(linePair* filePos, string outputFName, string filename, string accnos){ try { ofstream out; openOutputFile(outputFName, out); @@ -471,10 +425,13 @@ int ChimeraPintailCommand::driver(linePair* line, string outputFName, string fil ifstream inFASTA; openInputFile(filename, inFASTA); - inFASTA.seekg(line->start); - - for(int i=0;inumSeqs;i++){ - + inFASTA.seekg(filePos->start); + + bool done = false; + int count = 0; + + while (!done) { + if (m->control_pressed) { return 1; } Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA); @@ -492,20 +449,24 @@ int ChimeraPintailCommand::driver(linePair* line, string outputFName, string fil //print results chimera->print(out, out2); } + count++; } delete candidateSeq; + unsigned long int pos = inFASTA.tellg(); + if ((pos == -1) || (pos >= filePos->end)) { break; } + //report progress - if((i+1) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(i+1)); m->mothurOutEndLine(); } + if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } } //report progress - if((line->numSeqs) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(line->numSeqs)); m->mothurOutEndLine(); } + if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } out.close(); out2.close(); inFASTA.close(); - return 0; + return count; } catch(exception& e) { m->errorOut(e, "ChimeraPintailCommand", "driver"); @@ -576,7 +537,7 @@ int ChimeraPintailCommand::createProcesses(string outputFileName, string filenam try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) int process = 0; - // processIDS.resize(0); + int num = 0; //loop through and create all the processes you want while (process != processors) { @@ -586,7 +547,15 @@ int ChimeraPintailCommand::createProcesses(string outputFileName, string filenam processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename, accnos + toString(getpid()) + ".temp"); + num = driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename, accnos + toString(getpid()) + ".temp"); + + //pass numSeqs to parent + ofstream out; + string tempFile = toString(getpid()) + ".temp"; + openOutputFile(tempFile, out); + out << num << endl; + out.close(); + exit(0); }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); } } @@ -597,7 +566,15 @@ int ChimeraPintailCommand::createProcesses(string outputFileName, string filenam wait(&temp); } - return 0; + for (int i = 0; i < processIDS.size(); i++) { + ifstream in; + string tempFile = toString(processIDS[i]) + ".temp"; + openInputFile(tempFile, in); + if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; } + in.close(); remove(tempFile.c_str()); + } + + return num; #endif } catch(exception& e) { diff --git a/chimerapintailcommand.h b/chimerapintailcommand.h index 9370745..9082204 100644 --- a/chimerapintailcommand.h +++ b/chimerapintailcommand.h @@ -30,9 +30,10 @@ private: struct linePair { unsigned long int start; - int numSeqs; - linePair(unsigned long int i, int j) : start(i), numSeqs(j) {} + unsigned long int end; + linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} }; + vector processIDS; //processid vector lines; diff --git a/chimeraslayercommand.cpp b/chimeraslayercommand.cpp index e675c00..1de7021 100644 --- a/chimeraslayercommand.cpp +++ b/chimeraslayercommand.cpp @@ -325,57 +325,23 @@ int ChimeraSlayerCommand::execute(){ chimera->printHeader(outHeader); outHeader.close(); + vector positions = divideFile(fastaFileNames[s], processors); + + for (int i = 0; i < (positions.size()-1); i++) { + lines.push_back(new linePair(positions[i], positions[(i+1)])); + } + //break up file #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) if(processors == 1){ - ifstream inFASTA; - openInputFile(fastaFileNames[s], inFASTA); - getNumSeqs(inFASTA, numSeqs); - inFASTA.close(); + numSeqs = driver(lines[0], outputFileName, fastaFileNames[s], accnosFileName); - lines.push_back(new linePair(0, numSeqs)); - - driver(lines[0], outputFileName, fastaFileNames[s], accnosFileName); - - if (m->control_pressed) { - remove(outputFileName.c_str()); - remove(tempHeader.c_str()); - remove(accnosFileName.c_str()); - for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } - for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); - delete chimera; - return 0; - } + if (m->control_pressed) { remove(outputFileName.c_str()); remove(tempHeader.c_str()); remove(accnosFileName.c_str()); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); delete chimera; return 0; } }else{ - vector positions; processIDS.resize(0); - ifstream inFASTA; - openInputFile(fastaFileNames[s], inFASTA); - - string input; - while(!inFASTA.eof()){ - input = getline(inFASTA); - if (input.length() != 0) { - if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } - } - } - inFASTA.close(); - - numSeqs = positions.size(); - - int numSeqsPerProcessor = numSeqs / processors; - - for (int i = 0; i < processors; i++) { - unsigned long int startPos = positions[ i * numSeqsPerProcessor ]; - if(i == processors - 1){ - numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor; - } - lines.push_back(new linePair(startPos, numSeqsPerProcessor)); - } - - createProcesses(outputFileName, fastaFileNames[s], accnosFileName); + numSeqs = createProcesses(outputFileName, fastaFileNames[s], accnosFileName); rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str()); rename((accnosFileName + toString(processIDS[0]) + ".temp").c_str(), accnosFileName.c_str()); @@ -392,35 +358,13 @@ int ChimeraSlayerCommand::execute(){ remove((accnosFileName + toString(processIDS[i]) + ".temp").c_str()); } - if (m->control_pressed) { - remove(outputFileName.c_str()); - remove(accnosFileName.c_str()); - for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } - for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); - delete chimera; - return 0; - } - + if (m->control_pressed) { remove(outputFileName.c_str()); remove(accnosFileName.c_str()); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); delete chimera; return 0; } } #else - ifstream inFASTA; - openInputFile(fastaFileNames[s], inFASTA); - getNumSeqs(inFASTA, numSeqs); - inFASTA.close(); - lines.push_back(new linePair(0, numSeqs)); + numSeqs = driver(lines[0], outputFileName, fastaFileNames[s], accnosFileName); - driver(lines[0], outputFileName, fastaFileNames[s], accnosFileName); - - if (m->control_pressed) { - remove(outputFileName.c_str()); - remove(tempHeader.c_str()); - remove(accnosFileName.c_str()); - for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } - for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); - delete chimera; - return 0; - } + if (m->control_pressed) { remove(outputFileName.c_str()); remove(tempHeader.c_str()); remove(accnosFileName.c_str()); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); delete chimera; return 0; } #endif @@ -456,7 +400,7 @@ int ChimeraSlayerCommand::execute(){ } //********************************************************************************************************************** -int ChimeraSlayerCommand::driver(linePair* line, string outputFName, string filename, string accnos){ +int ChimeraSlayerCommand::driver(linePair* filePos, string outputFName, string filename, string accnos){ try { ofstream out; openOutputFile(outputFName, out); @@ -467,9 +411,12 @@ int ChimeraSlayerCommand::driver(linePair* line, string outputFName, string file ifstream inFASTA; openInputFile(filename, inFASTA); - inFASTA.seekg(line->start); - - for(int i=0;inumSeqs;i++){ + inFASTA.seekg(filePos->start); + + bool done = false; + int count = 0; + + while (!done) { if (m->control_pressed) { return 1; } @@ -488,20 +435,24 @@ int ChimeraSlayerCommand::driver(linePair* line, string outputFName, string file //print results chimera->print(out, out2); } + count++; } delete candidateSeq; + unsigned long int pos = inFASTA.tellg(); + if ((pos == -1) || (pos >= filePos->end)) { break; } + //report progress - if((i+1) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(i+1)); m->mothurOutEndLine(); } + if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } } //report progress - if((line->numSeqs) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(line->numSeqs)); m->mothurOutEndLine(); } + if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } out.close(); out2.close(); inFASTA.close(); - return 0; + return count; } catch(exception& e) { m->errorOut(e, "ChimeraSlayerCommand", "driver"); @@ -573,7 +524,7 @@ int ChimeraSlayerCommand::createProcesses(string outputFileName, string filename try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) int process = 0; - // processIDS.resize(0); + int num = 0; //loop through and create all the processes you want while (process != processors) { @@ -583,7 +534,15 @@ int ChimeraSlayerCommand::createProcesses(string outputFileName, string filename processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename, accnos + toString(getpid()) + ".temp"); + num = driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename, accnos + toString(getpid()) + ".temp"); + + //pass numSeqs to parent + ofstream out; + string tempFile = toString(getpid()) + ".temp"; + openOutputFile(tempFile, out); + out << num << endl; + out.close(); + exit(0); }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); } } @@ -594,7 +553,15 @@ int ChimeraSlayerCommand::createProcesses(string outputFileName, string filename wait(&temp); } - return 0; + for (int i = 0; i < processIDS.size(); i++) { + ifstream in; + string tempFile = toString(processIDS[i]) + ".temp"; + openInputFile(tempFile, in); + if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; } + in.close(); remove(tempFile.c_str()); + } + + return num; #endif } catch(exception& e) { diff --git a/chimeraslayercommand.h b/chimeraslayercommand.h index 95541b2..c5ef888 100644 --- a/chimeraslayercommand.h +++ b/chimeraslayercommand.h @@ -29,9 +29,10 @@ private: struct linePair { unsigned long int start; - int numSeqs; - linePair(unsigned long int i, int j) : start(i), numSeqs(j) {} + unsigned long int end; + linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} }; + vector processIDS; //processid vector lines; diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index 128fd01..4c1a025 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -460,45 +460,21 @@ int ClassifySeqsCommand::execute(){ MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case #else + + vector positions = divideFile(fastaFileNames[s], processors); + + for (int i = 0; i < (positions.size()-1); i++) { + lines.push_back(new linePair(positions[i], positions[(i+1)])); + } + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) if(processors == 1){ - ifstream inFASTA; - openInputFile(fastaFileNames[s], inFASTA); - getNumSeqs(inFASTA, numFastaSeqs); - inFASTA.close(); - - lines.push_back(new linePair(0, numFastaSeqs)); - - driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); + numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); } else{ - vector positions; processIDS.resize(0); - ifstream inFASTA; - openInputFile(fastaFileNames[s], inFASTA); - - string input; - while(!inFASTA.eof()){ - input = getline(inFASTA); - if (input.length() != 0) { - if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } - } - } - inFASTA.close(); - - numFastaSeqs = positions.size(); - - int numSeqsPerProcessor = numFastaSeqs / processors; - - for (int i = 0; i < processors; i++) { - unsigned long int startPos = positions[ i * numSeqsPerProcessor ]; - if(i == processors - 1){ - numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; - } - lines.push_back(new linePair(startPos, numSeqsPerProcessor)); - } - createProcesses(newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); + numFastaSeqs = createProcesses(newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); rename((newTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), newTaxonomyFile.c_str()); rename((tempTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), tempTaxonomyFile.c_str()); @@ -512,14 +488,7 @@ int ClassifySeqsCommand::execute(){ } #else - ifstream inFASTA; - openInputFile(fastaFileNames[s], inFASTA); - getNumSeqs(inFASTA, numFastaSeqs); - inFASTA.close(); - - lines.push_back(new linePair(0, numFastaSeqs)); - - driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); + numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); #endif #endif @@ -681,11 +650,11 @@ string ClassifySeqsCommand::addUnclassifieds(string tax, int maxlevel) { /**************************************************************************************************/ -void ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, string filename) { +int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, string filename) { try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) int process = 0; - // processIDS.resize(0); + int num = 0; //loop through and create all the processes you want while (process != processors) { @@ -695,7 +664,15 @@ void ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp", filename); + num = driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp", filename); + + //pass numSeqs to parent + ofstream out; + string tempFile = toString(getpid()) + ".temp"; + openOutputFile(tempFile, out); + out << num << endl; + out.close(); + exit(0); }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); } } @@ -705,6 +682,16 @@ void ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile int temp = processIDS[i]; wait(&temp); } + + for (int i = 0; i < processIDS.size(); i++) { + ifstream in; + string tempFile = toString(processIDS[i]) + ".temp"; + openInputFile(tempFile, in); + if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; } + in.close(); remove(tempFile.c_str()); + } + + return num; #endif } catch(exception& e) { @@ -738,7 +725,7 @@ void ClassifySeqsCommand::appendTaxFiles(string temp, string filename) { //********************************************************************************************************************** -int ClassifySeqsCommand::driver(linePair* line, string taxFName, string tempTFName, string filename){ +int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempTFName, string filename){ try { ofstream outTax; openOutputFile(taxFName, outTax); @@ -748,12 +735,15 @@ int ClassifySeqsCommand::driver(linePair* line, string taxFName, string tempTFNa ifstream inFASTA; openInputFile(filename, inFASTA); - - inFASTA.seekg(line->start); string taxonomy; - for(int i=0;inumSeqs;i++){ + inFASTA.seekg(filePos->start); + + bool done = false; + int count = 0; + + while (!done) { if (m->control_pressed) { return 0; } Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA); @@ -773,19 +763,24 @@ int ClassifySeqsCommand::driver(linePair* line, string taxFName, string tempTFNa outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl; } - } + count++; + } delete candidateSeq; - if((i+1) % 100 == 0){ - m->mothurOut("Classifying sequence " + toString(i+1)); m->mothurOutEndLine(); - } + unsigned long int pos = inFASTA.tellg(); + if ((pos == -1) || (pos >= filePos->end)) { break; } + + //report progress + if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } } - + //report progress + if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } + inFASTA.close(); outTax.close(); outTaxSimple.close(); - return 1; + return count; } catch(exception& e) { m->errorOut(e, "ClassifySeqsCommand", "driver"); diff --git a/classifyseqscommand.h b/classifyseqscommand.h index 03e6826..84f5f9f 100644 --- a/classifyseqscommand.h +++ b/classifyseqscommand.h @@ -36,9 +36,10 @@ public: private: struct linePair { unsigned long int start; - int numSeqs; - linePair(unsigned long int i, int j) : start(i), numSeqs(j) {} + unsigned long int end; + linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} }; + vector processIDS; //processid vector lines; vector fastaFileNames; @@ -56,7 +57,7 @@ private: int driver(linePair*, string, string, string); void appendTaxFiles(string, string); - void createProcesses(string, string, string); + int createProcesses(string, string, string); string addUnclassifieds(string, int); int MPIReadNamesFile(string); diff --git a/filterseqscommand.cpp b/filterseqscommand.cpp index e6e616a..1ebac0e 100644 --- a/filterseqscommand.cpp +++ b/filterseqscommand.cpp @@ -338,22 +338,18 @@ int FilterSeqsCommand::filterSequences() { MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case #else + vector positions = divideFile(fastafileNames[s], processors); + + for (int i = 0; i < (positions.size()-1); i++) { + lines.push_back(new linePair(positions[i], positions[(i+1)])); + } #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) if(processors == 1){ - ifstream inFASTA; - int numFastaSeqs; - openInputFile(fastafileNames[s], inFASTA); - getNumSeqs(inFASTA, numFastaSeqs); - inFASTA.close(); - - lines.push_back(new linePair(0, numFastaSeqs)); - + int numFastaSeqs = driverRunFilter(filter, filteredFasta, fastafileNames[s], lines[0]); numSeqs += numFastaSeqs; - - driverRunFilter(filter, filteredFasta, fastafileNames[s], lines[0]); }else{ - setLines(fastafileNames[s]); - createProcessesRunFilter(filter, fastafileNames[s]); + int numFastaSeqs = createProcessesRunFilter(filter, fastafileNames[s]); + numSeqs += numFastaSeqs; rename((fastafileNames[s] + toString(processIDS[0]) + ".temp").c_str(), filteredFasta.c_str()); @@ -366,17 +362,8 @@ int FilterSeqsCommand::filterSequences() { if (m->control_pressed) { return 1; } #else - ifstream inFASTA; - int numFastaSeqs; - openInputFile(fastafileNames[s], inFASTA); - getNumSeqs(inFASTA, numFastaSeqs); - inFASTA.close(); - - lines.push_back(new linePair(0, numFastaSeqs)); - + numFastaSeqs = driverRunFilter(filter, filteredFasta, fastafileNames[s], lines[0]); numSeqs += numFastaSeqs; - - driverRunFilter(filter, filteredFasta, fastafileNames[s], lines[0]); if (m->control_pressed) { return 1; } #endif @@ -466,7 +453,7 @@ int FilterSeqsCommand::driverMPIRun(int start, int num, MPI_File& inMPI, MPI_Fil } #endif /**************************************************************************************/ -int FilterSeqsCommand::driverRunFilter(string F, string outputFilename, string inputFilename, linePair* line) { +int FilterSeqsCommand::driverRunFilter(string F, string outputFilename, string inputFilename, linePair* filePos) { try { ofstream out; openOutputFile(outputFilename, out); @@ -474,13 +461,16 @@ int FilterSeqsCommand::driverRunFilter(string F, string outputFilename, string i ifstream in; openInputFile(inputFilename, in); - in.seekg(line->start); - - for(int i=0;inum;i++){ + in.seekg(filePos->start); + + bool done = false; + int count = 0; + + while (!done) { if (m->control_pressed) { in.close(); out.close(); return 0; } - Sequence seq(in); + Sequence seq(in); gobble(in); if (seq.getName() != "") { string align = seq.getAligned(); string filterSeq = ""; @@ -492,20 +482,23 @@ int FilterSeqsCommand::driverRunFilter(string F, string outputFilename, string i } out << '>' << seq.getName() << endl << filterSeq << endl; - } - gobble(in); - + count++; + } + + unsigned long int pos = in.tellg(); + if ((pos == -1) || (pos >= filePos->end)) { break; } + //report progress - if((i+1) % 100 == 0){ m->mothurOut(toString(i+1)); m->mothurOutEndLine(); } + if((count) % 100 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } } - //report progress - if((line->num) % 100 != 0){ m->mothurOut(toString(line->num)); m->mothurOutEndLine(); } + if((count) % 100 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } + out.close(); in.close(); - return 0; + return count; } catch(exception& e) { m->errorOut(e, "FilterSeqsCommand", "driverRunFilter"); @@ -518,7 +511,7 @@ int FilterSeqsCommand::createProcessesRunFilter(string F, string filename) { try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) int process = 0; - int exitCommand = 1; + int num = 0; processIDS.clear(); //loop through and create all the processes you want @@ -530,7 +523,15 @@ int FilterSeqsCommand::createProcessesRunFilter(string F, string filename) { process++; }else if (pid == 0){ string filteredFasta = filename + toString(getpid()) + ".temp"; - driverRunFilter(F, filteredFasta, filename, lines[process]); + num = driverRunFilter(F, filteredFasta, filename, lines[process]); + + //pass numSeqs to parent + ofstream out; + string tempFile = toString(getpid()) + ".temp"; + openOutputFile(tempFile, out); + out << num << endl; + out.close(); + exit(0); }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); } } @@ -539,9 +540,18 @@ int FilterSeqsCommand::createProcessesRunFilter(string F, string filename) { for (int i=0;i> tempNum; num += tempNum; } + in.close(); remove(tempFile.c_str()); } + - return exitCommand; + return num; #endif } catch(exception& e) { @@ -636,37 +646,24 @@ string FilterSeqsCommand::createFilter() { MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case #else + vector positions = divideFile(fastafileNames[s], processors); + + for (int i = 0; i < (positions.size()-1); i++) { + lines.push_back(new linePair(positions[i], positions[(i+1)])); + } #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) if(processors == 1){ - ifstream inFASTA; - int numFastaSeqs; - openInputFile(fastafileNames[s], inFASTA); - getNumSeqs(inFASTA, numFastaSeqs); - inFASTA.close(); - + int numFastaSeqs = driverCreateFilter(F, fastafileNames[s], lines[0]); numSeqs += numFastaSeqs; - - lines.push_back(new linePair(0, numFastaSeqs)); - - driverCreateFilter(F, fastafileNames[s], lines[0]); }else{ - setLines(fastafileNames[s]); - createProcessesCreateFilter(F, fastafileNames[s]); + int numFastaSeqs = createProcessesCreateFilter(F, fastafileNames[s]); + numSeqs += numFastaSeqs; } if (m->control_pressed) { return filterString; } #else - ifstream inFASTA; - int numFastaSeqs; - openInputFile(fastafileNames[s], inFASTA); - getNumSeqs(inFASTA, numFastaSeqs); - inFASTA.close(); - + numFastaSeqs = driverCreateFilter(F, fastafileNames[s], lines[0]); numSeqs += numFastaSeqs; - - lines.push_back(new linePair(0, numFastaSeqs)); - - driverCreateFilter(F, fastafileNames[s], lines[0]); if (m->control_pressed) { return filterString; } #endif #endif @@ -765,37 +762,42 @@ string FilterSeqsCommand::createFilter() { } } /**************************************************************************************/ -int FilterSeqsCommand::driverCreateFilter(Filters& F, string filename, linePair* line) { +int FilterSeqsCommand::driverCreateFilter(Filters& F, string filename, linePair* filePos) { try { ifstream in; openInputFile(filename, in); - in.seekg(line->start); - - for(int i=0;inum;i++){ + in.seekg(filePos->start); + + bool done = false; + int count = 0; + + while (!done) { if (m->control_pressed) { in.close(); return 1; } - Sequence seq(in); + Sequence seq(in); gobble(in); if (seq.getName() != "") { if (seq.getAligned().length() != alignmentLength) { m->mothurOut("Sequences are not all the same length, please correct."); m->mothurOutEndLine(); m->control_pressed = true; } if(trump != '*'){ F.doTrump(seq); } if(isTrue(vertical) || soft != 0){ F.getFreqs(seq); } cout.flush(); + count++; } + unsigned long int pos = in.tellg(); + if ((pos == -1) || (pos >= filePos->end)) { break; } + //report progress - if((i+1) % 100 == 0){ m->mothurOut(toString(i+1)); m->mothurOutEndLine(); } + if((count) % 100 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } } - //report progress - if((line->num) % 100 != 0){ m->mothurOut(toString(line->num)); m->mothurOutEndLine(); } - + if((count) % 100 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } in.close(); - return 0; + return count; } catch(exception& e) { m->errorOut(e, "FilterSeqsCommand", "driverCreateFilter"); @@ -855,7 +857,7 @@ int FilterSeqsCommand::createProcessesCreateFilter(Filters& F, string filename) try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) int process = 0; - int exitCommand = 1; + int num = 0; processIDS.clear(); //loop through and create all the processes you want @@ -866,13 +868,14 @@ int FilterSeqsCommand::createProcessesCreateFilter(Filters& F, string filename) processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - driverCreateFilter(F, filename, lines[process]); + num = driverCreateFilter(F, filename, lines[process]); //write out filter counts to file filename += toString(getpid()) + "filterValues.temp"; ofstream out; openOutputFile(filename, out); + out << num << endl; for (int k = 0; k < alignmentLength; k++) { out << F.a[k] << '\t'; } out << endl; for (int k = 0; k < alignmentLength; k++) { out << F.t[k] << '\t'; } out << endl; for (int k = 0; k < alignmentLength; k++) { out << F.g[k] << '\t'; } out << endl; @@ -897,7 +900,8 @@ int FilterSeqsCommand::createProcessesCreateFilter(Filters& F, string filename) ifstream in; openInputFile(tempFilename, in); - int temp; + int temp, tempNum; + in >> tempNum; gobble(in); num += tempNum; for (int k = 0; k < alignmentLength; k++) { in >> temp; F.a[k] += temp; } gobble(in); for (int k = 0; k < alignmentLength; k++) { in >> temp; F.t[k] += temp; } gobble(in); for (int k = 0; k < alignmentLength; k++) { in >> temp; F.g[k] += temp; } gobble(in); @@ -908,7 +912,7 @@ int FilterSeqsCommand::createProcessesCreateFilter(Filters& F, string filename) remove(tempFilename.c_str()); } - return exitCommand; + return num; #endif } catch(exception& e) { @@ -916,63 +920,4 @@ int FilterSeqsCommand::createProcessesCreateFilter(Filters& F, string filename) exit(1); } } -/**************************************************************************************************/ - -int FilterSeqsCommand::setLines(string filename) { - try { - - vector positions; - bufferSizes.clear(); - - ifstream inFASTA; - openInputFile(filename, inFASTA); - - string input; - while(!inFASTA.eof()){ - input = getline(inFASTA); - - if (input.length() != 0) { - if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } - } - } - inFASTA.close(); - - int numFastaSeqs = positions.size(); - - FILE * pFile; - unsigned long int size; - - //get num bytes in file - pFile = fopen (filename.c_str(),"rb"); - if (pFile==NULL) perror ("Error opening file"); - else{ - fseek (pFile, 0, SEEK_END); - size=ftell (pFile); - fclose (pFile); - } - - numSeqs += numFastaSeqs; - - int numSeqsPerProcessor = numFastaSeqs / processors; - - for (int i = 0; i < processors; i++) { - - unsigned long int startPos = positions[ i * numSeqsPerProcessor ]; - if(i == processors - 1){ - numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; - bufferSizes.push_back(size - startPos); - }else{ - unsigned long int myEnd = positions[ (i+1) * numSeqsPerProcessor ]; - bufferSizes.push_back(myEnd-startPos); - } - lines.push_back(new linePair(startPos, numSeqsPerProcessor)); - } - - return 0; - } - catch(exception& e) { - m->errorOut(e, "FilterSeqsCommand", "setLines"); - exit(1); - } -} /**************************************************************************************/ diff --git a/filterseqscommand.h b/filterseqscommand.h index e068405..cb02732 100644 --- a/filterseqscommand.h +++ b/filterseqscommand.h @@ -25,9 +25,10 @@ public: private: struct linePair { unsigned long int start; - int num; - linePair(unsigned long int i, long int j) : start(i), num(j) {} + unsigned long int end; + linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} }; + vector lines; vector processIDS; @@ -52,8 +53,6 @@ private: int driverMPIRun(int, int, MPI_File&, MPI_File&, vector&); int MPICreateFilter(int, int, Filters&, MPI_File&, vector&); #endif - int setLines(string); - }; diff --git a/globaldata.cpp b/globaldata.cpp index affa463..74daaa2 100644 --- a/globaldata.cpp +++ b/globaldata.cpp @@ -29,6 +29,7 @@ string GlobalData::getOrderFile() { return orderfile; } string GlobalData::getOrderGroupFile() { return ordergroup; } string GlobalData::getTreeFile() { return treefile; } string GlobalData::getSharedFile() { return sharedfile; } +string GlobalData::getRelAbundFile() { return relAbundfile; } string GlobalData::getFormat() { return format; } void GlobalData::setListFile(string file) { listfile = file; inputFileName = file; } @@ -39,6 +40,7 @@ void GlobalData::setPhylipFile(string file) { phylipfile = file; inputFileNa void GlobalData::setColumnFile(string file) { columnfile = file; inputFileName = file; } void GlobalData::setGroupFile(string file) { groupfile = file; } void GlobalData::setSharedFile(string file) { sharedfile = file; inputFileName = file; } +void GlobalData::setRelAbundFile(string file) { relAbundfile = file; inputFileName = file; } void GlobalData::setNameFile(string file) { namefile = file; } void GlobalData::setOrderFile(string file) { orderfile = file; } void GlobalData::setOrderGroupFile(string file) { ordergroup = file; } @@ -83,6 +85,7 @@ void GlobalData::clear() { // fastafile = ""; //do we need this? treefile = ""; sharedfile = ""; + relAbundfile = ""; format = ""; } diff --git a/globaldata.hpp b/globaldata.hpp index 301527a..bfb8a06 100644 --- a/globaldata.hpp +++ b/globaldata.hpp @@ -61,6 +61,7 @@ public: string getOrderGroupFile(); string getTreeFile(); string getSharedFile(); + string getRelAbundFile(); string getFormat(); //do we need this? @@ -73,6 +74,7 @@ public: void setRabundFile(string); void setSabundFile(string); void setSharedFile(string); + void setRelAbundFile(string); void setOrderFile(string file); void setOrderGroupFile(string file); void setFormat(string); //do we need this? @@ -87,7 +89,7 @@ public: private: MothurOut* m; - string phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, orderfile, treefile, sharedfile, format, distfile, ordergroup; + string phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, orderfile, treefile, sharedfile, format, distfile, ordergroup, relAbundfile; static GlobalData* _uniqueInstance; GlobalData( const GlobalData& ); // Disable copy constructor diff --git a/mothur.h b/mothur.h index 765c90a..2622e25 100644 --- a/mothur.h +++ b/mothur.h @@ -1134,6 +1134,72 @@ inline vector setFilePosEachLine(string filename, int& num) { return positions; } /**************************************************************************************************/ + +inline vector divideFile(string filename, int& proc) { + try{ + + vector filePos; + filePos.push_back(0); + + FILE * pFile; + unsigned long int size; + + //get num bytes in file + pFile = fopen (filename.c_str(),"rb"); + if (pFile==NULL) perror ("Error opening file"); + else{ + fseek (pFile, 0, SEEK_END); + size=ftell (pFile); + fclose (pFile); + } + + //estimate file breaks + unsigned long int chunkSize = 0; + chunkSize = size / proc; + + //file to small to divide by processors + if (chunkSize == 0) { proc = 1; filePos.push_back(size); return filePos; } + + //for each process seekg to closest file break and search for next '>' char. make that the filebreak + for (int i = 0; i < proc; i++) { + unsigned long int spot = (i+1) * chunkSize; + + ifstream in; + openInputFile(filename, in); + in.seekg(spot); + + //look for next '>' + unsigned long int newSpot = spot; + while (!in.eof()) { + char c = in.get(); + if (c == '>') { in.putback(c); newSpot = in.tellg(); break; } + } + + //there was not another sequence before the end of the file + if (newSpot == spot) { break; } + else { filePos.push_back(newSpot); } + + in.close(); + } + + //save end pos + filePos.push_back(size); + + //sanity check filePos + for (int i = 0; i < (filePos.size()-1); i++) { + if (filePos[(i+1)] <= filePos[i]) { filePos.erase(filePos.begin()+(i+1)); i--; } + } + + proc = (filePos.size() - 1); + + return filePos; + } + catch(exception& e) { + cout << "Standard Error: " << e.what() << " has occurred in the mothur.h function divideFile. Please contact Pat Schloss at mothur.bugs@gmail.com." << "\n"; + exit(1); + } +} +/**************************************************************************************************/ inline bool checkReleaseVersion(ifstream& file, string version) { try { diff --git a/readotucommand.cpp b/readotucommand.cpp index c4bf523..5b9cde8 100644 --- a/readotucommand.cpp +++ b/readotucommand.cpp @@ -21,7 +21,7 @@ ReadOtuCommand::ReadOtuCommand(string option) { else { //valid paramters for this command - string Array[] = {"list","order","shared", "label","group","sabund", "rabund","groups","ordergroup","outputdir","inputdir"}; + string Array[] = {"list","order","shared","relabund","label","group","sabund", "rabund","groups","ordergroup","outputdir","inputdir"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -97,7 +97,14 @@ ReadOtuCommand::ReadOtuCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["ordergroup"] = inputDir + it->second; } } - + + it = parameters.find("relabund"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["relabund"] = inputDir + it->second; } + } } @@ -130,6 +137,12 @@ ReadOtuCommand::ReadOtuCommand(string option) { else if (sharedfile == "not found") { sharedfile = ""; } else { globaldata->setSharedFile(sharedfile); globaldata->setFormat("sharedfile"); } + relAbundfile = validParameter.validFile(parameters, "relabund", true); + if (relAbundfile == "not open") { abort = true; } + else if (relAbundfile == "not found") { relAbundfile = ""; } + else { globaldata->setRelAbundFile(relAbundfile); globaldata->setFormat("relabund"); } + + groupfile = validParameter.validFile(parameters, "group", true); if (groupfile == "not open") { abort = true; } else if (groupfile == "not found") { groupfile = ""; } @@ -154,8 +167,8 @@ ReadOtuCommand::ReadOtuCommand(string option) { if ((listfile != "") && (groupfile != "")) { globaldata->setFormat("shared"); } //you have not given a file - if ((listfile == "") && (sharedfile == "") && (rabundfile == "") && (sabundfile == "")) { - m->mothurOut("You must enter either a listfile, rabundfile, sabundfile or a sharedfile with the read.otu command. "); m->mothurOutEndLine(); abort = true; + if ((listfile == "") && (sharedfile == "") && (rabundfile == "") && (sabundfile == "") && (relAbundfile == "")) { + m->mothurOut("You must enter either a listfile, rabundfile, sabundfile, relabund or a sharedfile with the read.otu command. "); m->mothurOutEndLine(); abort = true; } //check for optional parameter and set defaults @@ -194,7 +207,7 @@ void ReadOtuCommand::help(){ try { m->mothurOut("The read.otu command must be run before you execute a collect.single, rarefaction.single, summary.single, \n"); m->mothurOut("collect.shared, rarefaction.shared, summary.shared heatmap.bin, heatmap.sim or venn command. Mothur will generate a .list, .rabund and .sabund upon completion of the cluster command \n"); - m->mothurOut("or you may use your own. The read.otu command parameter options are list, rabund, sabund, shared, group, order, ordergroup, label and groups.\n"); + m->mothurOut("or you may use your own. The read.otu command parameter options are list, rabund, sabund, shared, relabund, group, order, ordergroup, label and groups.\n"); m->mothurOut("The read.otu command can be used in two ways. The first is to read a list, rabund or sabund and run the collect.single, rarefaction.single or summary.single.\n"); m->mothurOut("For this use the read.otu command should be in the following format: read.otu(list=yourListFile, order=yourOrderFile, label=yourLabels).\n"); m->mothurOut("The list, rabund or sabund parameter is required, but you may only use one of them.\n"); diff --git a/readotucommand.h b/readotucommand.h index b5e32d8..bf9ccc0 100644 --- a/readotucommand.h +++ b/readotucommand.h @@ -29,7 +29,7 @@ private: InputData* input; Command* shared; GroupMap* groupMap; - string filename, listfile, orderfile, sharedfile, label, groupfile, sabundfile, rabundfile, format, groups, outputDir, ordergroupfile; + string filename, listfile, orderfile, sharedfile, label, groupfile, sabundfile, rabundfile, format, groups, outputDir, ordergroupfile, relAbundfile; vector Groups; bool abort, allLines; diff --git a/screenseqscommand.cpp b/screenseqscommand.cpp index b2fdaff..fb97c0a 100644 --- a/screenseqscommand.cpp +++ b/screenseqscommand.cpp @@ -290,50 +290,22 @@ int ScreenSeqsCommand::execute(){ MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case #else - + vector positions = divideFile(fastafile, processors); + + for (int i = 0; i < (positions.size()-1); i++) { + lines.push_back(new linePair(positions[i], positions[(i+1)])); + } + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) if(processors == 1){ - ifstream inFASTA; - openInputFile(fastafile, inFASTA); - getNumSeqs(inFASTA, numFastaSeqs); - inFASTA.close(); - - lines.push_back(new linePair(0, numFastaSeqs)); - - driver(lines[0], goodSeqFile, badSeqFile, badAccnosFile, fastafile, badSeqNames); + numFastaSeqs = driver(lines[0], goodSeqFile, badSeqFile, badAccnosFile, fastafile, badSeqNames); if (m->control_pressed) { remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; } }else{ - vector positions; processIDS.resize(0); - ifstream inFASTA; - openInputFile(fastafile, inFASTA); - - string input; - while(!inFASTA.eof()){ - input = getline(inFASTA); - if (input.length() != 0) { - if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } - } - } - inFASTA.close(); - - numFastaSeqs = positions.size(); - - int numSeqsPerProcessor = numFastaSeqs / processors; - - for (int i = 0; i < processors; i++) { - unsigned long int startPos = positions[ i * numSeqsPerProcessor ]; - if(i == processors - 1){ - numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; - } - lines.push_back(new linePair(startPos, numSeqsPerProcessor)); - - } - - createProcesses(goodSeqFile, badSeqFile, badAccnosFile, fastafile, badSeqNames); + numFastaSeqs = createProcesses(goodSeqFile, badSeqFile, badAccnosFile, fastafile, badSeqNames); rename((goodSeqFile + toString(processIDS[0]) + ".temp").c_str(), goodSeqFile.c_str()); rename((badSeqFile + toString(processIDS[0]) + ".temp").c_str(), badSeqFile.c_str()); @@ -368,14 +340,7 @@ int ScreenSeqsCommand::execute(){ } } #else - ifstream inFASTA; - openInputFile(fastafile, inFASTA); - getNumSeqs(inFASTA, numFastaSeqs); - inFASTA.close(); - - lines.push_back(new linePair(0, numFastaSeqs)); - - driver(lines[0], goodSeqFile, badSeqFile, badAccnosFile, fastafile, badSeqNames); + numFastaSeqs = driver(lines[0], goodSeqFile, badSeqFile, badAccnosFile, fastafile, badSeqNames); if (m->control_pressed) { remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; } @@ -692,7 +657,7 @@ int ScreenSeqsCommand::screenAlignReport(set badSeqNames){ } //********************************************************************************************************************** -int ScreenSeqsCommand::driver(linePair* line, string goodFName, string badFName, string badAccnosFName, string filename, set& badSeqNames){ +int ScreenSeqsCommand::driver(linePair* filePos, string goodFName, string badFName, string badAccnosFName, string filename, set& badSeqNames){ try { ofstream goodFile; openOutputFile(goodFName, goodFile); @@ -706,13 +671,16 @@ int ScreenSeqsCommand::driver(linePair* line, string goodFName, string badFName, ifstream inFASTA; openInputFile(filename, inFASTA); - inFASTA.seekg(line->start); + inFASTA.seekg(filePos->start); + + bool done = false; + int count = 0; - for(int i=0;inumSeqs;i++){ + while (!done) { if (m->control_pressed) { return 0; } - Sequence currSeq(inFASTA); + Sequence currSeq(inFASTA); gobble(inFASTA); if (currSeq.getName() != "") { bool goodSeq = 1; // innocent until proven guilty if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; } @@ -730,9 +698,17 @@ int ScreenSeqsCommand::driver(linePair* line, string goodFName, string badFName, badAccnosFile << currSeq.getName() << endl; badSeqNames.insert(currSeq.getName()); } + count++; } - gobble(inFASTA); + + unsigned long int pos = inFASTA.tellg(); + if ((pos == -1) || (pos >= filePos->end)) { break; } + + //report progress + if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } } + //report progress + if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } goodFile.close(); @@ -740,7 +716,7 @@ int ScreenSeqsCommand::driver(linePair* line, string goodFName, string badFName, badFile.close(); badAccnosFile.close(); - return 1; + return count; } catch(exception& e) { m->errorOut(e, "ScreenSeqsCommand", "driver"); @@ -838,7 +814,7 @@ int ScreenSeqsCommand::createProcesses(string goodFileName, string badFileName, try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) int process = 0; - int exitCommand = 1; + int num = 0; //loop through and create all the processes you want while (process != processors) { @@ -848,7 +824,15 @@ int ScreenSeqsCommand::createProcesses(string goodFileName, string badFileName, processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - exitCommand = driver(lines[process], goodFileName + toString(getpid()) + ".temp", badFileName + toString(getpid()) + ".temp", badAccnos + toString(getpid()) + ".temp", filename, badSeqNames); + num = driver(lines[process], goodFileName + toString(getpid()) + ".temp", badFileName + toString(getpid()) + ".temp", badAccnos + toString(getpid()) + ".temp", filename, badSeqNames); + + //pass numSeqs to parent + ofstream out; + string tempFile = toString(getpid()) + ".temp"; + openOutputFile(tempFile, out); + out << num << endl; + out.close(); + exit(0); }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); } } @@ -859,7 +843,15 @@ int ScreenSeqsCommand::createProcesses(string goodFileName, string badFileName, wait(&temp); } - return exitCommand; + for (int i = 0; i < processIDS.size(); i++) { + ifstream in; + string tempFile = toString(processIDS[i]) + ".temp"; + openInputFile(tempFile, in); + if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; } + in.close(); remove(tempFile.c_str()); + } + + return num; #endif } catch(exception& e) { diff --git a/screenseqscommand.h b/screenseqscommand.h index 071724f..8f62ae4 100644 --- a/screenseqscommand.h +++ b/screenseqscommand.h @@ -24,9 +24,10 @@ private: struct linePair { unsigned long int start; - int numSeqs; - linePair(unsigned long int i, int j) : start(i), numSeqs(j) {} + unsigned long int end; + linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} }; + vector processIDS; //processid vector lines; diff --git a/seqsummarycommand.cpp b/seqsummarycommand.cpp index 5009ff5..7362c1b 100644 --- a/seqsummarycommand.cpp +++ b/seqsummarycommand.cpp @@ -224,19 +224,17 @@ int SeqSummaryCommand::execute(){ MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case #else + vector positions = divideFile(fastafile, processors); + + for (int i = 0; i < (positions.size()-1); i++) { + lines.push_back(new linePair(positions[i], positions[(i+1)])); + } + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) if(processors == 1){ - ifstream inFASTA; - openInputFile(fastafile, inFASTA); - getNumSeqs(inFASTA, numSeqs); - inFASTA.close(); - - lines.push_back(new linePair(0, numSeqs)); - - driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, summaryFile, lines[0]); + numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, summaryFile, lines[0]); }else{ - numSeqs = setLines(fastafile); - createProcessesCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, summaryFile); + numSeqs = createProcessesCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, summaryFile); rename((summaryFile + toString(processIDS[0]) + ".temp").c_str(), summaryFile.c_str()); //append files @@ -248,14 +246,7 @@ int SeqSummaryCommand::execute(){ if (m->control_pressed) { return 0; } #else - ifstream inFASTA; - openInputFile(fastafile, inFASTA); - getNumSeqs(inFASTA, numSeqs); - inFASTA.close(); - - lines.push_back(new linePair(0, numSeqs)); - - driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, summaryFile, lines[0]); + numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, summaryFile, lines[0]); if (m->control_pressed) { return 0; } #endif #endif @@ -313,27 +304,30 @@ int SeqSummaryCommand::execute(){ } } /**************************************************************************************/ -int SeqSummaryCommand::driverCreateSummary(vector& startPosition, vector& endPosition, vector& seqLength, vector& ambigBases, vector& longHomoPolymer, string filename, string sumFile, linePair* line) { +int SeqSummaryCommand::driverCreateSummary(vector& startPosition, vector& endPosition, vector& seqLength, vector& ambigBases, vector& longHomoPolymer, string filename, string sumFile, linePair* filePos) { try { ofstream outSummary; openOutputFile(sumFile, outSummary); //print header if you are process 0 - if (line->start == 0) { + if (filePos->start == 0) { outSummary << "seqname\tstart\tend\tnbases\tambigs\tpolymer" << endl; } ifstream in; openInputFile(filename, in); - in.seekg(line->start); - - for(int i=0;inum;i++){ + in.seekg(filePos->start); + + bool done = false; + int count = 0; + + while (!done) { if (m->control_pressed) { in.close(); outSummary.close(); return 1; } - Sequence current(in); + Sequence current(in); gobble(in); if (current.getName() != "") { startPosition.push_back(current.getStartPos()); @@ -346,12 +340,21 @@ int SeqSummaryCommand::driverCreateSummary(vector& startPosition, vector= filePos->end)) { break; } + + //report progress + if((count) % 100 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } } + //report progress + if((count) % 100 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } + in.close(); - return 0; + return count; } catch(exception& e) { m->errorOut(e, "SeqSummaryCommand", "driverCreateSummary"); @@ -418,18 +421,33 @@ int SeqSummaryCommand::createProcessesCreateSummary(vector& startPosition, try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) int process = 0; - int exitCommand = 1; + int num = 0; processIDS.clear(); //loop through and create all the processes you want while (process != processors) { - int pid = vfork(); + int pid = fork(); if (pid > 0) { processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, sumFile + toString(getpid()) + ".temp", lines[process]); + num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, sumFile + toString(getpid()) + ".temp", lines[process]); + + //pass numSeqs to parent + ofstream out; + string tempFile = toString(getpid()) + ".temp"; + openOutputFile(tempFile, out); + + out << num << endl; + for (int k = 0; k < startPosition.size(); k++) { out << startPosition[k] << '\t'; } out << endl; + for (int k = 0; k < endPosition.size(); k++) { out << endPosition[k] << '\t'; } out << endl; + for (int k = 0; k < seqLength.size(); k++) { out << seqLength[k] << '\t'; } out << endl; + for (int k = 0; k < ambigBases.size(); k++) { out << ambigBases[k] << '\t'; } out << endl; + for (int k = 0; k < longHomoPolymer.size(); k++) { out << longHomoPolymer[k] << '\t'; } out << endl; + + out.close(); + exit(0); }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); } } @@ -440,65 +458,29 @@ int SeqSummaryCommand::createProcessesCreateSummary(vector& startPosition, wait(&temp); } - return exitCommand; -#endif - } - catch(exception& e) { - m->errorOut(e, "SeqSummaryCommand", "createProcessesCreateSummary"); - exit(1); - } -} -/**************************************************************************************************/ - -int SeqSummaryCommand::setLines(string filename) { - try { - - vector positions; - - ifstream inFASTA; - openInputFile(filename, inFASTA); + //parent reads in and combine Filter info + for (int i = 0; i < processIDS.size(); i++) { + string tempFilename = toString(processIDS[i]) + ".temp"; + ifstream in; + openInputFile(tempFilename, in); - string input; - while(!inFASTA.eof()){ - input = getline(inFASTA); - - if (input.length() != 0) { - if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } - } - } - inFASTA.close(); - - int numFastaSeqs = positions.size(); - - FILE * pFile; - unsigned long int size; - - //get num bytes in file - pFile = fopen (filename.c_str(),"rb"); - if (pFile==NULL) perror ("Error opening file"); - else{ - fseek (pFile, 0, SEEK_END); - size=ftell (pFile); - fclose (pFile); - } - - int numSeqsPerProcessor = numFastaSeqs / processors; - - for (int i = 0; i < processors; i++) { - - unsigned long int startPos = positions[ i * numSeqsPerProcessor ]; - if(i == processors - 1){ - numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; - }else{ - unsigned long int myEnd = positions[ (i+1) * numSeqsPerProcessor ]; - } - lines.push_back(new linePair(startPos, numSeqsPerProcessor)); + int temp, tempNum; + in >> tempNum; gobble(in); num += tempNum; + for (int k = 0; k < tempNum; k++) { in >> temp; startPosition.push_back(temp); } gobble(in); + for (int k = 0; k < tempNum; k++) { in >> temp; endPosition.push_back(temp); } gobble(in); + for (int k = 0; k < tempNum; k++) { in >> temp; seqLength.push_back(temp); } gobble(in); + for (int k = 0; k < tempNum; k++) { in >> temp; ambigBases.push_back(temp); } gobble(in); + for (int k = 0; k < tempNum; k++) { in >> temp; longHomoPolymer.push_back(temp); } gobble(in); + + in.close(); + remove(tempFilename.c_str()); } - return numFastaSeqs; + return num; +#endif } catch(exception& e) { - m->errorOut(e, "SeqSummaryCommand", "setLines"); + m->errorOut(e, "SeqSummaryCommand", "createProcessesCreateSummary"); exit(1); } } diff --git a/seqsummarycommand.h b/seqsummarycommand.h index a625726..6abf06a 100644 --- a/seqsummarycommand.h +++ b/seqsummarycommand.h @@ -27,15 +27,15 @@ private: struct linePair { unsigned long int start; - int num; - linePair(unsigned long int i, long int j) : start(i), num(j) {} + unsigned long int end; + linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} }; + vector lines; vector processIDS; int createProcessesCreateSummary(vector&, vector&, vector&, vector&, vector&, string, string); int driverCreateSummary(vector&, vector&, vector&, vector&, vector&, string, string, linePair*); - int setLines(string); #ifdef USE_MPI int MPICreateSummary(int, int, vector&, vector&, vector&, vector&, vector&, MPI_File&, MPI_File&, vector&);