X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=screenseqscommand.h;h=aeaddae1300207aff7ef713858e9ee760be52e12;hp=b165bba2f2f343a8a20a4b07cb2290a43fd13c32;hb=c48d91112209b841444923670dca5454da0e2a4d;hpb=e150b0b0664caec517485ee6d69dcdade6dcae77 diff --git a/screenseqscommand.h b/screenseqscommand.h index b165bba..aeaddae 100644 --- a/screenseqscommand.h +++ b/screenseqscommand.h @@ -11,6 +11,7 @@ */ #include "mothur.h" #include "command.hpp" +#include "sequence.hpp" class ScreenSeqsCommand : public Command { @@ -22,9 +23,12 @@ public: vector setParameters(); string getCommandName() { return "screen.seqs"; } string getCommandCategory() { return "Sequence Processing"; } + string getHelpString(); + string getOutputPattern(string); string getCitation() { return "http://www.mothur.org/wiki/Screen.seqs"; } - + string getDescription() { return "enables you to keep sequences that fulfill certain user defined criteria"; } + int execute(); void help() { m->mothurOut(getHelpString()); } @@ -32,37 +36,443 @@ public: private: struct linePair { - unsigned long int start; - unsigned long int end; - linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} + unsigned long long start; + unsigned long long end; + linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {} }; - vector processIDS; //processid - vector lines; + vector lines; - int screenNameGroupFile(set); - int screenGroupFile(set); - int screenAlignReport(set); - int screenQual(set); - - int driver(linePair*, string, string, string, set&); - int createProcesses(string, string, string, set&); + int screenNameGroupFile(map); + int screenGroupFile(map); + int screenCountFile(map); + int screenAlignReport(map&); + int screenQual(map); + int screenTaxonomy(map); + int optimizeContigs(); + int optimizeAlign(); + int driver(linePair, string, string, string, map&); + int createProcesses(string, string, string, map&); + int screenSummary(map&); + int screenContigs(map&); + int runFastaScreening(map&); + int screenFasta(map&); + int screenReports(map&); + int getSummary(vector&); + int createProcessesCreateSummary(vector&, vector&, vector&, vector&, vector&, vector&, string); + int driverCreateSummary(vector&, vector&, vector&, vector&, vector&, vector&, string, linePair); + int getSummaryReport(); + int driverContigsSummary(vector&, vector&, vector&, vector&, vector&, linePair); + int createProcessesContigsSummary(vector&, vector&, vector&, vector&, vector&, vector); + int driverAlignSummary(vector&, vector&, vector&, linePair); + int createProcessesAlignSummary(vector&, vector&, vector&, vector); + #ifdef USE_MPI - int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector&, set&); + int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector&, map&); #endif bool abort; - string fastafile, namefile, groupfile, alignreport, outputDir, qualfile; - int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, processors, criteria; + string fastafile, namefile, groupfile, alignreport, outputDir, qualfile, taxonomy, countfile, contigsreport, summaryfile; + int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, processors, criteria, minOverlap, oStart, oEnd, mismatches, maxN, maxInsert; + float minSim, minScore; vector outputNames; vector optimize; map nameMap; - int readNames(); - int getSummary(vector&); - int createProcessesCreateSummary(vector&, vector&, vector&, vector&, vector&, string); - int driverCreateSummary(vector&, vector&, vector&, vector&, vector&, string, linePair*); + +}; + +/**************************************************************************************************/ +//custom data structure for threads to use. +// This is passed by void pointer so it can be any data type +// that can be passed using a single void pointer (LPVOID). +struct sumData { + vector startPosition; + vector endPosition; + vector seqLength; + vector ambigBases; + vector longHomoPolymer; + vector numNs; + string filename, namefile, countfile; + unsigned long long start; + unsigned long long end; + int count; + MothurOut* m; + map nameMap; + + + sumData(){} + sumData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, string nf, string cf, map nam) { + filename = f; + namefile = nf; + countfile = cf; + m = mout; + start = st; + end = en; + nameMap = nam; + count = 0; + } +}; +/**************************************************************************************************/ +//custom data structure for threads to use. +// This is passed by void pointer so it can be any data type +// that can be passed using a single void pointer (LPVOID). +struct contigsSumData { + vector ostartPosition; + vector oendPosition; + vector oLength; + vector omismatches; + vector numNs; + string filename, namefile, countfile; + unsigned long long start; + unsigned long long end; + int count; + MothurOut* m; + map nameMap; + + + contigsSumData(){} + contigsSumData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, string nf, string cf, map nam) { + filename = f; + namefile = nf; + countfile = cf; + m = mout; + start = st; + end = en; + nameMap = nam; + count = 0; + } +}; +/**************************************************************************************************/ +struct alignsData { + vector sims; + vector scores; + vector inserts; + string filename, namefile, countfile; + unsigned long long start; + unsigned long long end; + int count; + MothurOut* m; + map nameMap; + + + alignsData(){} + alignsData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, string nf, string cf, map nam) { + filename = f; + namefile = nf; + countfile = cf; + m = mout; + start = st; + end = en; + nameMap = nam; + count = 0; + } +}; + +/**************************************************************************************************/ +//custom data structure for threads to use. +// This is passed by void pointer so it can be any data type +// that can be passed using a single void pointer (LPVOID). +struct sumScreenData { + int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, maxN; + unsigned long long start; + unsigned long long end; + int count; + MothurOut* m; + string goodFName, badAccnosFName, filename; + map badSeqNames; + string summaryfile, contigsreport; + + + sumScreenData(){} + sumScreenData(int s, int e, int a, int h, int minl, int maxl, int mn, map bs, string f, string sum, string cont, MothurOut* mout, unsigned long long st, unsigned long long en, string gf, string bf) { + startPos = s; + endPos = e; + minLength = minl; + maxLength = maxl; + maxAmbig = a; + maxHomoP = h; + maxN = mn; + filename = f; + goodFName = gf; + badAccnosFName = bf; + m = mout; + start = st; + end = en; + summaryfile = sum; + contigsreport = cont; + badSeqNames = bs; + count = 0; + } }; + +/**************************************************************************************************/ +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) +#else +static DWORD WINAPI MySumThreadFunction(LPVOID lpParam){ + sumData* pDataArray; + pDataArray = (sumData*)lpParam; + + try { + ifstream in; + pDataArray->m->openInputFile(pDataArray->filename, in); + + //print header if you are process 0 + if ((pDataArray->start == 0) || (pDataArray->start == 1)) { + in.seekg(0); + }else { //this accounts for the difference in line endings. + in.seekg(pDataArray->start-1); pDataArray->m->gobble(in); + } + + + for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process + + pDataArray->count++; + + if (pDataArray->m->control_pressed) { in.close(); pDataArray->count = 1; return 1; } + + Sequence current(in); pDataArray->m->gobble(in); + + if (current.getName() != "") { + + int num = 1; + if ((pDataArray->namefile != "") || (pDataArray->countfile !="")){ + //make sure this sequence is in the namefile, else error + map::iterator it = pDataArray->nameMap.find(current.getName()); + + if (it == pDataArray->nameMap.end()) { pDataArray->m->mothurOut("[ERROR]: " + current.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); pDataArray->m->control_pressed = true; } + else { num = it->second; } + } + + //for each sequence this sequence represents + int numns = current.getNumNs(); + for (int i = 0; i < num; i++) { + pDataArray->startPosition.push_back(current.getStartPos()); + pDataArray->endPosition.push_back(current.getEndPos()); + pDataArray->seqLength.push_back(current.getNumBases()); + pDataArray->ambigBases.push_back(current.getAmbigBases()); + pDataArray->longHomoPolymer.push_back(current.getLongHomoPolymer()); + pDataArray->numNs.push_back(numns); + } + } + } + + in.close(); + + return 0; + + } + catch(exception& e) { + pDataArray->m->errorOut(e, "ScreenSeqsCommand", "MySumThreadFunction"); + exit(1); + } +} + +/**************************************************************************************************/ +static DWORD WINAPI MyContigsSumThreadFunction(LPVOID lpParam){ + contigsSumData* pDataArray; + pDataArray = (contigsSumData*)lpParam; + + try { + string name; + //Name Length Overlap_Length Overlap_Start Overlap_End MisMatches Num_Ns + int length, OLength, thisOStart, thisOEnd, numMisMatches, numns; + + ifstream in; + pDataArray->m->openInputFile(pDataArray->filename, in); + + //print header if you are process 0 + if ((pDataArray->start == 0) || (pDataArray->start == 1)) { + in.seekg(0); pDataArray->m->getline(in); pDataArray->m->gobble(in); + }else { //this accounts for the difference in line endings. + in.seekg(pDataArray->start-1); pDataArray->m->gobble(in); + } + + + for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process + + pDataArray->count++; + + if (pDataArray->m->control_pressed) { in.close(); pDataArray->count = 1; return 1; } + + //seqname start end nbases ambigs polymer numSeqs + in >> name >> length >> OLength >> thisOStart >> thisOEnd >> numMisMatches >> numns; pDataArray->m->gobble(in); + + int num = 1; + if ((pDataArray->namefile != "") || (pDataArray->countfile !="")){ + //make sure this sequence is in the namefile, else error + map::iterator it = pDataArray->nameMap.find(name); + + if (it == pDataArray->nameMap.end()) { pDataArray->m->mothurOut("[ERROR]: " + name + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); pDataArray->m->control_pressed = true; } + else { num = it->second; } + } + + //for each sequence this sequence represents + for (int i = 0; i < num; i++) { + pDataArray->ostartPosition.push_back(thisOStart); + pDataArray->oendPosition.push_back(thisOEnd); + pDataArray->oLength.push_back(OLength); + pDataArray->omismatches.push_back(numMisMatches); + pDataArray->numNs.push_back(numns); + } + } + + in.close(); + + return 0; + + } + catch(exception& e) { + pDataArray->m->errorOut(e, "ScreenSeqsCommand", "MyContigsThreadFunction"); + exit(1); + } +} +/**************************************************************************************************/ +static DWORD WINAPI MyAlignsThreadFunction(LPVOID lpParam){ + alignsData* pDataArray; + pDataArray = (alignsData*)lpParam; + + try { + + string name, TemplateName, SearchMethod, AlignmentMethod; + //QueryName QueryLength TemplateName TemplateLength SearchMethod SearchScore AlignmentMethod QueryStart QueryEnd TemplateStart TemplateEnd PairwiseAlignmentLength GapsInQuery GapsInTemplate LongestInsert SimBtwnQuery&Template + //checking for minScore, maxInsert, minSim + int length, TemplateLength, QueryStart, QueryEnd, TemplateStart, TemplateEnd, PairwiseAlignmentLength, GapsInQuery, GapsInTemplate, LongestInsert; + float SearchScore, SimBtwnQueryTemplate; + + ifstream in; + pDataArray->m->openInputFile(pDataArray->filename, in); + + //print header if you are process 0 + if ((pDataArray->start == 0) || (pDataArray->start == 1)) { + in.seekg(0); pDataArray->m->getline(in); pDataArray->m->gobble(in); + }else { //this accounts for the difference in line endings. + in.seekg(pDataArray->start-1); pDataArray->m->gobble(in); + } + + for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process + + pDataArray->count++; + + if (pDataArray->m->control_pressed) { in.close(); pDataArray->count = 1; return 1; } + + in >> name >> length >> TemplateName >> TemplateLength >> SearchMethod >> SearchScore >> AlignmentMethod >> QueryStart >> QueryEnd >> TemplateStart >> TemplateEnd >> PairwiseAlignmentLength >> GapsInQuery >> GapsInTemplate >> LongestInsert >> SimBtwnQueryTemplate; pDataArray->m->gobble(in); + cout << i << '\t' << name << endl; + int num = 1; + if ((pDataArray->namefile != "") || (pDataArray->countfile !="")){ + //make sure this sequence is in the namefile, else error + map::iterator it = pDataArray->nameMap.find(name); + + if (it == pDataArray->nameMap.end()) { pDataArray->m->mothurOut("[ERROR]: " + name + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); pDataArray->m->control_pressed = true; } + else { num = it->second; } + } + + //for each sequence this sequence represents + for (int i = 0; i < num; i++) { + pDataArray->sims.push_back(SimBtwnQueryTemplate); + pDataArray->scores.push_back(SearchScore); + pDataArray->inserts.push_back(LongestInsert); + } + } + + in.close(); + + return 0; + + } + catch(exception& e) { + pDataArray->m->errorOut(e, "ScreenSeqsCommand", "MyAlignsThreadFunction"); + exit(1); + } +} + +/**************************************************************************************************/ +static DWORD WINAPI MySumScreenThreadFunction(LPVOID lpParam){ + sumScreenData* pDataArray; + pDataArray = (sumScreenData*)lpParam; + + try { + + ofstream goodFile; + pDataArray->m->openOutputFile(pDataArray->goodFName, goodFile); + + ofstream badAccnosFile; + pDataArray->m->openOutputFile(pDataArray->badAccnosFName, badAccnosFile); + + ifstream in; + pDataArray->m->openInputFile(pDataArray->filename, in); + + //print header if you are process 0 + if ((pDataArray->start == 0) || (pDataArray->start == 1)) { + in.seekg(0); + }else { //this accounts for the difference in line endings. + in.seekg(pDataArray->start-1); pDataArray->m->gobble(in); + } + + for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process + + pDataArray->count++; + + if (pDataArray->m->control_pressed) { in.close(); badAccnosFile.close(); goodFile.close(); pDataArray->count = 1; return 1; } + + Sequence currSeq(in); pDataArray->m->gobble(in); + + if (currSeq.getName() != "") { + bool goodSeq = 1; // innocent until proven guilty + string trashCode = ""; + //have the report files found you bad + map::iterator it = pDataArray->badSeqNames.find(currSeq.getName()); + if (it != pDataArray->badSeqNames.end()) { goodSeq = 0; trashCode = it->second; } //found it + + if (pDataArray->summaryfile == "") { + if(pDataArray->startPos != -1 && pDataArray->startPos < currSeq.getStartPos()) { goodSeq = 0; trashCode += "start|"; } + if(pDataArray->endPos != -1 && pDataArray->endPos > currSeq.getEndPos()) { goodSeq = 0; trashCode += "end|"; } + if(pDataArray->maxAmbig != -1 && pDataArray->maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; trashCode += "ambig|"; } + if(pDataArray->maxHomoP != -1 && pDataArray->maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; trashCode += "homop|"; } + if(pDataArray->minLength != -1 && pDataArray->minLength > currSeq.getNumBases()) { goodSeq = 0; trashCode += "maxLength != -1 && pDataArray->maxLength < currSeq.getNumBases()) { goodSeq = 0; trashCode += ">length|"; } + } + if (pDataArray->contigsreport == "") { //contigs report includes this so no need to check again + if(pDataArray->maxN != -1 && pDataArray->maxN < currSeq.getNumNs()) { goodSeq = 0; trashCode += "n|"; } + } + + + if(goodSeq == 1){ + currSeq.printSequence(goodFile); + } + else{ + badAccnosFile << currSeq.getName() << '\t' << trashCode.substr(0, trashCode.length()-1) << endl; + pDataArray->badSeqNames[currSeq.getName()] = trashCode; + } + + } + //report progress + if((i+1) % 100 == 0){ pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(i+1)+"\n"); } + } + //report progress + if((pDataArray->count) % 100 != 0){ pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(pDataArray->count)+"\n"); } + + + + in.close(); + goodFile.close(); + badAccnosFile.close(); + + return 0; + + } + catch(exception& e) { + pDataArray->m->errorOut(e, "ScreenSeqsCommand", "MySumScreenThreadFunction"); + exit(1); + } +} + +#endif + +/**************************************************************************************************/ + + + #endif