From: Sarah Westcott Date: Tue, 21 Feb 2012 16:11:35 +0000 (-0500) Subject: paralellized filter.seqs for windows X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;ds=sidebyside;h=26cf6509c41e1f38fe5a178bf9d22c02d9d97580;p=mothur.git paralellized filter.seqs for windows --- diff --git a/filterseqscommand.cpp b/filterseqscommand.cpp index 82c73f3..806ca0d 100644 --- a/filterseqscommand.cpp +++ b/filterseqscommand.cpp @@ -420,9 +420,9 @@ int FilterSeqsCommand::filterSequences() { MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case #else - + vector positions = savedPositions[s]; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - vector positions = m->divideFile(fastafileNames[s], processors); + //vector positions = m->divideFile(fastafileNames[s], processors); for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(new linePair(positions[i], positions[(i+1)])); @@ -432,23 +432,31 @@ int FilterSeqsCommand::filterSequences() { int numFastaSeqs = driverRunFilter(filter, filteredFasta, fastafileNames[s], lines[0]); numSeqs += numFastaSeqs; }else{ - int numFastaSeqs = createProcessesRunFilter(filter, fastafileNames[s]); + int numFastaSeqs = createProcessesRunFilter(filter, fastafileNames[s], filteredFasta); numSeqs += numFastaSeqs; - - rename((fastafileNames[s] + toString(processIDS[0]) + ".temp").c_str(), filteredFasta.c_str()); - - //append fasta files - for(int i=1;iappendFiles((fastafileNames[s] + toString(processIDS[i]) + ".temp"), filteredFasta); - m->mothurRemove((fastafileNames[s] + toString(processIDS[i]) + ".temp")); - } } if (m->control_pressed) { return 1; } #else - lines.push_back(new linePair(0, 1000)); + if(processors == 1){ + lines.push_back(new linePair(0, 1000)); int numFastaSeqs = driverRunFilter(filter, filteredFasta, fastafileNames[s], lines[0]); numSeqs += numFastaSeqs; + }else { + int numFastaSeqs = positions.size()-1; + //positions = m->setFilePosFasta(fastafileNames[s], numFastaSeqs); + + //figure out how many sequences you have to process + int numSeqsPerProcessor = numFastaSeqs / processors; + for (int i = 0; i < processors; i++) { + int startIndex = i * numSeqsPerProcessor; + if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; } + lines.push_back(new linePair(positions[startIndex], numSeqsPerProcessor)); + } + + numFastaSeqs = createProcessesRunFilter(filter, fastafileNames[s], filteredFasta); + numSeqs += numFastaSeqs; + } if (m->control_pressed) { return 1; } #endif @@ -596,12 +604,15 @@ int FilterSeqsCommand::driverRunFilter(string F, string outputFilename, string i } /**************************************************************************************************/ -int FilterSeqsCommand::createProcessesRunFilter(string F, string filename) { +int FilterSeqsCommand::createProcessesRunFilter(string F, string filename, string filteredFastaName) { try { -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - int process = 0; + + int process = 1; int num = 0; processIDS.clear(); + +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + //loop through and create all the processes you want while (process != processors) { @@ -629,8 +640,10 @@ int FilterSeqsCommand::createProcessesRunFilter(string F, string filename) { } } + num = driverRunFilter(F, filteredFastaName, filename, lines[0]); + //force parent to wait until all the processes are done - for (int i=0;iopenInputFile(tempFile, in); if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; } in.close(); m->mothurRemove(tempFile); + + m->appendFiles((filename + toString(processIDS[i]) + ".temp"), filteredFastaName); + m->mothurRemove((filename + toString(processIDS[i]) + ".temp")); } - - - return num; -#endif + +#else + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + //Windows version shared memory, so be careful when passing variables through the filterData struct. + //Above fork() will clone, so memory is separate, but that's not the case with windows, + //Taking advantage of shared memory to allow both threads to add info to F. + ////////////////////////////////////////////////////////////////////////////////////////////////////// + + vector pDataArray; + DWORD dwThreadIdArray[processors-1]; + HANDLE hThreadArray[processors-1]; + + //Create processor worker threads. + for( int i=0; istart, lines[i]->end, alignmentLength, i); + pDataArray.push_back(tempFilter); + processIDS.push_back(i); + + hThreadArray[i] = CreateThread(NULL, 0, MyRunFilterThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]); + } + + num = driverRunFilter(F, (filteredFastaName + toString(processors-1) + ".temp"), filename, lines[processors-1]); + + //Wait until all threads have terminated. + WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE); + + //Close all thread handles and free memory allocations. + for(int i=0; i < pDataArray.size(); i++){ + num += pDataArray[i]->count; + CloseHandle(hThreadArray[i]); + delete pDataArray[i]; + } + + for (int i = 1; i < processors; i++) { + m->appendFiles((filteredFastaName + toString(i) + ".temp"), filteredFastaName); + m->mothurRemove((filteredFastaName + toString(i) + ".temp")); + } +#endif + + return num; + } catch(exception& e) { m->errorOut(e, "FilterSeqsCommand", "createProcessesRunFilter"); @@ -740,9 +798,9 @@ string FilterSeqsCommand::createFilter() { #else - + vector positions; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - vector positions = m->divideFile(fastafileNames[s], processors); + positions = m->divideFile(fastafileNames[s], processors); for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(new linePair(positions[i], positions[(i+1)])); } @@ -754,14 +812,31 @@ string FilterSeqsCommand::createFilter() { int numFastaSeqs = createProcessesCreateFilter(F, fastafileNames[s]); numSeqs += numFastaSeqs; } - - if (m->control_pressed) { return filterString; } #else - lines.push_back(new linePair(0, 1000)); - int numFastaSeqs = driverCreateFilter(F, fastafileNames[s], lines[0]); - numSeqs += numFastaSeqs; - if (m->control_pressed) { return filterString; } + if(processors == 1){ + lines.push_back(new linePair(0, 1000)); + int numFastaSeqs = driverCreateFilter(F, fastafileNames[s], lines[0]); + numSeqs += numFastaSeqs; + }else { + int numFastaSeqs = 0; + positions = m->setFilePosFasta(fastafileNames[s], numFastaSeqs); + + //figure out how many sequences you have to process + int numSeqsPerProcessor = numFastaSeqs / processors; + for (int i = 0; i < processors; i++) { + int startIndex = i * numSeqsPerProcessor; + if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; } + lines.push_back(new linePair(positions[startIndex], numSeqsPerProcessor)); + } + + numFastaSeqs = createProcessesCreateFilter(F, fastafileNames[s]); + numSeqs += numFastaSeqs; + } #endif + //save the file positions so we can reuse them in the runFilter function + savedPositions[s] = positions; + + if (m->control_pressed) { return filterString; } #endif } @@ -848,7 +923,7 @@ string FilterSeqsCommand::createFilter() { MPI_Barrier(MPI_COMM_WORLD); #endif - + return filterString; } catch(exception& e) { @@ -954,11 +1029,12 @@ int FilterSeqsCommand::MPICreateFilter(int start, int num, Filters& F, MPI_File& int FilterSeqsCommand::createProcessesCreateFilter(Filters& F, string filename) { try { -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - int process = 1; + int process = 1; int num = 0; processIDS.clear(); - + +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + //loop through and create all the processes you want while (process != processors) { int pid = fork(); @@ -1033,8 +1109,50 @@ int FilterSeqsCommand::createProcessesCreateFilter(Filters& F, string filename) m->mothurRemove(tempFilename); } - return num; -#endif + +#else + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + //Windows version shared memory, so be careful when passing variables through the filterData struct. + //Above fork() will clone, so memory is separate, but that's not the case with windows, + //Taking advantage of shared memory to allow both threads to add info to F. + ////////////////////////////////////////////////////////////////////////////////////////////////////// + + vector pDataArray; + DWORD dwThreadIdArray[processors]; + HANDLE hThreadArray[processors]; + + //Create processor worker threads. + for( int i=0; istart, lines[i]->end, alignmentLength, trump, vertical, soft, hard, i); + pDataArray.push_back(tempFilter); + processIDS.push_back(i); + + hThreadArray[i] = CreateThread(NULL, 0, MyCreateFilterThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]); + } + + //Wait until all threads have terminated. + WaitForMultipleObjects(processors, hThreadArray, TRUE, INFINITE); + + //Close all thread handles and free memory allocations. + for(int i=0; i < pDataArray.size(); i++){ + num += pDataArray[i]->count; + F.mergeFilter(pDataArray[i]->F.getFilter()); + + for (int k = 0; k < alignmentLength; k++) { F.a[k] += pDataArray[i]->F.a[k]; } + for (int k = 0; k < alignmentLength; k++) { F.t[k] += pDataArray[i]->F.t[k]; } + for (int k = 0; k < alignmentLength; k++) { F.g[k] += pDataArray[i]->F.g[k]; } + for (int k = 0; k < alignmentLength; k++) { F.c[k] += pDataArray[i]->F.c[k]; } + for (int k = 0; k < alignmentLength; k++) { F.gap[k] += pDataArray[i]->F.gap[k]; } + + CloseHandle(hThreadArray[i]); + delete pDataArray[i]; + } + +#endif + return num; + } catch(exception& e) { m->errorOut(e, "FilterSeqsCommand", "createProcessesCreateFilter"); diff --git a/filterseqscommand.h b/filterseqscommand.h index 3bf36c0..16062f3 100644 --- a/filterseqscommand.h +++ b/filterseqscommand.h @@ -40,6 +40,7 @@ private: vector lines; vector processIDS; + map > savedPositions; string vertical, filter, fasta, hard, outputDir, filterFileName; vector fastafileNames; @@ -55,7 +56,7 @@ private: string createFilter(); int filterSequences(); int createProcessesCreateFilter(Filters&, string); - int createProcessesRunFilter(string, string); + int createProcessesRunFilter(string, string, string); int driverRunFilter(string, string, string, linePair*); int driverCreateFilter(Filters& F, string filename, linePair* line); #ifdef USE_MPI @@ -65,4 +66,179 @@ private: }; + +/**************************************************************************************************/ +//custom data structure for threads to use. +// This is passed by void pointer so it can be any data type +// that can be passed using a single void pointer (LPVOID). +struct filterData { + Filters F; + int count, tid, alignmentLength; + unsigned long long start, end; + MothurOut* m; + string filename, vertical, hard; + char trump; + float soft; + + filterData(){} + filterData(string fn, MothurOut* mout, unsigned long long st, unsigned long long en, int aLength, char tr, string vert, float so, string ha, int t) { + filename = fn; + m = mout; + start = st; + end = en; + tid = t; + trump = tr; + alignmentLength = aLength; + vertical = vert; + soft = so; + hard = ha; + count = 0; + } +}; +/**************************************************************************************************/ +//custom data structure for threads to use. +// This is passed by void pointer so it can be any data type +// that can be passed using a single void pointer (LPVOID). +struct filterRunData { + int count, tid, alignmentLength; + unsigned long long start, end; + MothurOut* m; + string filename; + string filter, outputFilename; + + filterRunData(){} + filterRunData(string f, string fn, string ofn, MothurOut* mout, unsigned long long st, unsigned long long en, int aLength, int t) { + filter = f; + outputFilename = ofn; + filename = fn; + m = mout; + start = st; + end = en; + tid = t; + alignmentLength = aLength; + count = 0; + } +}; + +/**************************************************************************************************/ +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#else +static DWORD WINAPI MyCreateFilterThreadFunction(LPVOID lpParam){ + filterData* pDataArray; + pDataArray = (filterData*)lpParam; + + try { + + if (pDataArray->soft != 0) { pDataArray->F.setSoft(pDataArray->soft); } + if (pDataArray->trump != '*') { pDataArray->F.setTrump(pDataArray->trump); } + + pDataArray->F.setLength(pDataArray->alignmentLength); + + if(pDataArray->trump != '*' || pDataArray->m->isTrue(pDataArray->vertical) || pDataArray->soft != 0){ + pDataArray->F.initialize(); + } + + if(pDataArray->hard.compare("") != 0) { pDataArray->F.doHard(pDataArray->hard); } + else { pDataArray->F.setFilter(string(pDataArray->alignmentLength, '1')); } + + ifstream in; + pDataArray->m->openInputFile(pDataArray->filename, in); + + //print header if you are process 0 + if ((pDataArray->start == 0) || (pDataArray->start == 1)) { + in.seekg(0); + }else { //this accounts for the difference in line endings. + in.seekg(pDataArray->start-1); pDataArray->m->gobble(in); + } + + pDataArray->count = pDataArray->end; + for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process + + if (pDataArray->m->control_pressed) { in.close(); pDataArray->count = 1; return 1; } + + Sequence current(in); pDataArray->m->gobble(in); + + if (current.getName() != "") { + if (current.getAligned().length() != pDataArray->alignmentLength) { pDataArray->m->mothurOut("Sequences are not all the same length, please correct."); pDataArray->m->mothurOutEndLine(); pDataArray->m->control_pressed = true; } + + if(pDataArray->trump != '*') { pDataArray->F.doTrump(current); } + if(pDataArray->m->isTrue(pDataArray->vertical) || pDataArray->soft != 0) { pDataArray->F.getFreqs(current); } + } + + //report progress + if((i) % 100 == 0){ pDataArray->m->mothurOut(toString(i)); pDataArray->m->mothurOutEndLine(); } + } + + if((pDataArray->count) % 100 != 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); } + + in.close(); + + return 0; + + } + catch(exception& e) { + pDataArray->m->errorOut(e, "FilterSeqsCommand", "MyCreateFilterThreadFunction"); + exit(1); + } +} +/**************************************************************************************************/ +static DWORD WINAPI MyRunFilterThreadFunction(LPVOID lpParam){ + filterRunData* pDataArray; + pDataArray = (filterRunData*)lpParam; + + try { + + ofstream out; + pDataArray->m->openOutputFile(pDataArray->outputFilename, out); + + ifstream in; + pDataArray->m->openInputFile(pDataArray->filename, in); + + //print header if you are process 0 + if ((pDataArray->start == 0) || (pDataArray->start == 1)) { + in.seekg(0); + }else { //this accounts for the difference in line endings. + in.seekg(pDataArray->start-1); pDataArray->m->gobble(in); + } + + pDataArray->count = pDataArray->end; + for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process + + if (pDataArray->m->control_pressed) { in.close(); out.close(); pDataArray->count = 1; return 1; } + + Sequence seq(in); pDataArray->m->gobble(in); + if (seq.getName() != "") { + string align = seq.getAligned(); + string filterSeq = ""; + + for(int j=0;jalignmentLength;j++){ + if(pDataArray->filter[j] == '1'){ + filterSeq += align[j]; + } + } + + out << '>' << seq.getName() << endl << filterSeq << endl; + } + + //report progress + if((i) % 100 == 0){ pDataArray->m->mothurOut(toString(i)); pDataArray->m->mothurOutEndLine(); } + } + + if((pDataArray->count) % 100 != 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); } + + in.close(); + out.close(); + + return 0; + + } + catch(exception& e) { + pDataArray->m->errorOut(e, "FilterSeqsCommand", "MyRunFilterThreadFunction"); + exit(1); + } +} +/**************************************************************************************************/ +#endif + + #endif