X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=screenseqscommand.h;h=aeaddae1300207aff7ef713858e9ee760be52e12;hp=284c9a5ac1a0bd5c90dca6b4854b725d6b428023;hb=c48d91112209b841444923670dca5454da0e2a4d;hpb=deba0af0ccdcb6005ed5b2b82649b137c63fbdf7 diff --git a/screenseqscommand.h b/screenseqscommand.h index 284c9a5..aeaddae 100644 --- a/screenseqscommand.h +++ b/screenseqscommand.h @@ -43,30 +43,44 @@ private: vector lines; - int screenNameGroupFile(set); - int screenGroupFile(set); - int screenCountFile(set); - int screenAlignReport(set); - int screenQual(set); - int screenTaxonomy(set); - - int driver(linePair, string, string, string, set&); - int createProcesses(string, string, string, set&); + int screenNameGroupFile(map); + int screenGroupFile(map); + int screenCountFile(map); + int screenAlignReport(map&); + int screenQual(map); + int screenTaxonomy(map); + int optimizeContigs(); + int optimizeAlign(); + int driver(linePair, string, string, string, map&); + int createProcesses(string, string, string, map&); + int screenSummary(map&); + int screenContigs(map&); + int runFastaScreening(map&); + int screenFasta(map&); + int screenReports(map&); + int getSummary(vector&); + int createProcessesCreateSummary(vector&, vector&, vector&, vector&, vector&, vector&, string); + int driverCreateSummary(vector&, vector&, vector&, vector&, vector&, vector&, string, linePair); + int getSummaryReport(); + int driverContigsSummary(vector&, vector&, vector&, vector&, vector&, linePair); + int createProcessesContigsSummary(vector&, vector&, vector&, vector&, vector&, vector); + int driverAlignSummary(vector&, vector&, vector&, linePair); + int createProcessesAlignSummary(vector&, vector&, vector&, vector); + #ifdef USE_MPI - int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector&, set&); + int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector&, map&); #endif bool abort; - string fastafile, namefile, groupfile, alignreport, outputDir, qualfile, taxonomy, countfile; - int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, processors, criteria; + string fastafile, namefile, groupfile, alignreport, outputDir, qualfile, taxonomy, countfile, contigsreport, summaryfile; + int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, processors, criteria, minOverlap, oStart, oEnd, mismatches, maxN, maxInsert; + float minSim, minScore; vector outputNames; vector optimize; map nameMap; - int getSummary(vector&); - int createProcessesCreateSummary(vector&, vector&, vector&, vector&, vector&, string); - int driverCreateSummary(vector&, vector&, vector&, vector&, vector&, string, linePair); + }; /**************************************************************************************************/ @@ -79,7 +93,8 @@ struct sumData { vector seqLength; vector ambigBases; vector longHomoPolymer; - string filename, namefile; + vector numNs; + string filename, namefile, countfile; unsigned long long start; unsigned long long end; int count; @@ -88,9 +103,65 @@ struct sumData { sumData(){} - sumData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, string nf, map nam) { + sumData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, string nf, string cf, map nam) { + filename = f; + namefile = nf; + countfile = cf; + m = mout; + start = st; + end = en; + nameMap = nam; + count = 0; + } +}; +/**************************************************************************************************/ +//custom data structure for threads to use. +// This is passed by void pointer so it can be any data type +// that can be passed using a single void pointer (LPVOID). +struct contigsSumData { + vector ostartPosition; + vector oendPosition; + vector oLength; + vector omismatches; + vector numNs; + string filename, namefile, countfile; + unsigned long long start; + unsigned long long end; + int count; + MothurOut* m; + map nameMap; + + + contigsSumData(){} + contigsSumData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, string nf, string cf, map nam) { + filename = f; + namefile = nf; + countfile = cf; + m = mout; + start = st; + end = en; + nameMap = nam; + count = 0; + } +}; +/**************************************************************************************************/ +struct alignsData { + vector sims; + vector scores; + vector inserts; + string filename, namefile, countfile; + unsigned long long start; + unsigned long long end; + int count; + MothurOut* m; + map nameMap; + + + alignsData(){} + alignsData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, string nf, string cf, map nam) { filename = f; namefile = nf; + countfile = cf; m = mout; start = st; end = en; @@ -98,34 +169,40 @@ struct sumData { count = 0; } }; + /**************************************************************************************************/ //custom data structure for threads to use. // This is passed by void pointer so it can be any data type // that can be passed using a single void pointer (LPVOID). struct sumScreenData { - int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength; + int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, maxN; unsigned long long start; unsigned long long end; int count; MothurOut* m; string goodFName, badAccnosFName, filename; - set badSeqNames; + map badSeqNames; + string summaryfile, contigsreport; sumScreenData(){} - sumScreenData(int s, int e, int a, int h, int minl, int maxl, string f, MothurOut* mout, unsigned long long st, unsigned long long en, string gf, string bf) { + sumScreenData(int s, int e, int a, int h, int minl, int maxl, int mn, map bs, string f, string sum, string cont, MothurOut* mout, unsigned long long st, unsigned long long en, string gf, string bf) { startPos = s; endPos = e; minLength = minl; maxLength = maxl; maxAmbig = a; maxHomoP = h; + maxN = mn; filename = f; goodFName = gf; badAccnosFName = bf; m = mout; start = st; end = en; + summaryfile = sum; + contigsreport = cont; + badSeqNames = bs; count = 0; } }; @@ -149,9 +226,11 @@ static DWORD WINAPI MySumThreadFunction(LPVOID lpParam){ in.seekg(pDataArray->start-1); pDataArray->m->gobble(in); } - pDataArray->count = pDataArray->end; + for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process + pDataArray->count++; + if (pDataArray->m->control_pressed) { in.close(); pDataArray->count = 1; return 1; } Sequence current(in); pDataArray->m->gobble(in); @@ -159,7 +238,7 @@ static DWORD WINAPI MySumThreadFunction(LPVOID lpParam){ if (current.getName() != "") { int num = 1; - if (pDataArray->namefile != "") { + if ((pDataArray->namefile != "") || (pDataArray->countfile !="")){ //make sure this sequence is in the namefile, else error map::iterator it = pDataArray->nameMap.find(current.getName()); @@ -168,12 +247,14 @@ static DWORD WINAPI MySumThreadFunction(LPVOID lpParam){ } //for each sequence this sequence represents + int numns = current.getNumNs(); for (int i = 0; i < num; i++) { pDataArray->startPosition.push_back(current.getStartPos()); pDataArray->endPosition.push_back(current.getEndPos()); pDataArray->seqLength.push_back(current.getNumBases()); pDataArray->ambigBases.push_back(current.getAmbigBases()); pDataArray->longHomoPolymer.push_back(current.getLongHomoPolymer()); + pDataArray->numNs.push_back(numns); } } } @@ -190,7 +271,124 @@ static DWORD WINAPI MySumThreadFunction(LPVOID lpParam){ } /**************************************************************************************************/ +static DWORD WINAPI MyContigsSumThreadFunction(LPVOID lpParam){ + contigsSumData* pDataArray; + pDataArray = (contigsSumData*)lpParam; + + try { + string name; + //Name Length Overlap_Length Overlap_Start Overlap_End MisMatches Num_Ns + int length, OLength, thisOStart, thisOEnd, numMisMatches, numns; + + ifstream in; + pDataArray->m->openInputFile(pDataArray->filename, in); + + //print header if you are process 0 + if ((pDataArray->start == 0) || (pDataArray->start == 1)) { + in.seekg(0); pDataArray->m->getline(in); pDataArray->m->gobble(in); + }else { //this accounts for the difference in line endings. + in.seekg(pDataArray->start-1); pDataArray->m->gobble(in); + } + + + for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process + + pDataArray->count++; + + if (pDataArray->m->control_pressed) { in.close(); pDataArray->count = 1; return 1; } + + //seqname start end nbases ambigs polymer numSeqs + in >> name >> length >> OLength >> thisOStart >> thisOEnd >> numMisMatches >> numns; pDataArray->m->gobble(in); + + int num = 1; + if ((pDataArray->namefile != "") || (pDataArray->countfile !="")){ + //make sure this sequence is in the namefile, else error + map::iterator it = pDataArray->nameMap.find(name); + + if (it == pDataArray->nameMap.end()) { pDataArray->m->mothurOut("[ERROR]: " + name + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); pDataArray->m->control_pressed = true; } + else { num = it->second; } + } + + //for each sequence this sequence represents + for (int i = 0; i < num; i++) { + pDataArray->ostartPosition.push_back(thisOStart); + pDataArray->oendPosition.push_back(thisOEnd); + pDataArray->oLength.push_back(OLength); + pDataArray->omismatches.push_back(numMisMatches); + pDataArray->numNs.push_back(numns); + } + } + + in.close(); + + return 0; + + } + catch(exception& e) { + pDataArray->m->errorOut(e, "ScreenSeqsCommand", "MyContigsThreadFunction"); + exit(1); + } +} +/**************************************************************************************************/ +static DWORD WINAPI MyAlignsThreadFunction(LPVOID lpParam){ + alignsData* pDataArray; + pDataArray = (alignsData*)lpParam; + + try { + + string name, TemplateName, SearchMethod, AlignmentMethod; + //QueryName QueryLength TemplateName TemplateLength SearchMethod SearchScore AlignmentMethod QueryStart QueryEnd TemplateStart TemplateEnd PairwiseAlignmentLength GapsInQuery GapsInTemplate LongestInsert SimBtwnQuery&Template + //checking for minScore, maxInsert, minSim + int length, TemplateLength, QueryStart, QueryEnd, TemplateStart, TemplateEnd, PairwiseAlignmentLength, GapsInQuery, GapsInTemplate, LongestInsert; + float SearchScore, SimBtwnQueryTemplate; + + ifstream in; + pDataArray->m->openInputFile(pDataArray->filename, in); + + //print header if you are process 0 + if ((pDataArray->start == 0) || (pDataArray->start == 1)) { + in.seekg(0); pDataArray->m->getline(in); pDataArray->m->gobble(in); + }else { //this accounts for the difference in line endings. + in.seekg(pDataArray->start-1); pDataArray->m->gobble(in); + } + + for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process + + pDataArray->count++; + + if (pDataArray->m->control_pressed) { in.close(); pDataArray->count = 1; return 1; } + + in >> name >> length >> TemplateName >> TemplateLength >> SearchMethod >> SearchScore >> AlignmentMethod >> QueryStart >> QueryEnd >> TemplateStart >> TemplateEnd >> PairwiseAlignmentLength >> GapsInQuery >> GapsInTemplate >> LongestInsert >> SimBtwnQueryTemplate; pDataArray->m->gobble(in); + cout << i << '\t' << name << endl; + int num = 1; + if ((pDataArray->namefile != "") || (pDataArray->countfile !="")){ + //make sure this sequence is in the namefile, else error + map::iterator it = pDataArray->nameMap.find(name); + + if (it == pDataArray->nameMap.end()) { pDataArray->m->mothurOut("[ERROR]: " + name + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); pDataArray->m->control_pressed = true; } + else { num = it->second; } + } + + //for each sequence this sequence represents + for (int i = 0; i < num; i++) { + pDataArray->sims.push_back(SimBtwnQueryTemplate); + pDataArray->scores.push_back(SearchScore); + pDataArray->inserts.push_back(LongestInsert); + } + } + + in.close(); + + return 0; + + } + catch(exception& e) { + pDataArray->m->errorOut(e, "ScreenSeqsCommand", "MyAlignsThreadFunction"); + exit(1); + } +} +/**************************************************************************************************/ static DWORD WINAPI MySumScreenThreadFunction(LPVOID lpParam){ sumScreenData* pDataArray; pDataArray = (sumScreenData*)lpParam; @@ -212,37 +410,49 @@ static DWORD WINAPI MySumScreenThreadFunction(LPVOID lpParam){ }else { //this accounts for the difference in line endings. in.seekg(pDataArray->start-1); pDataArray->m->gobble(in); } - - pDataArray->count = pDataArray->end; + for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process + pDataArray->count++; + if (pDataArray->m->control_pressed) { in.close(); badAccnosFile.close(); goodFile.close(); pDataArray->count = 1; return 1; } Sequence currSeq(in); pDataArray->m->gobble(in); if (currSeq.getName() != "") { bool goodSeq = 1; // innocent until proven guilty - if(goodSeq == 1 && pDataArray->startPos != -1 && pDataArray->startPos < currSeq.getStartPos()) { goodSeq = 0; } - if(goodSeq == 1 && pDataArray->endPos != -1 && pDataArray->endPos > currSeq.getEndPos()) { goodSeq = 0; } - if(goodSeq == 1 && pDataArray->maxAmbig != -1 && pDataArray->maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; } - if(goodSeq == 1 && pDataArray->maxHomoP != -1 && pDataArray->maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; } - if(goodSeq == 1 && pDataArray->minLength != -1 && pDataArray->minLength > currSeq.getNumBases()) { goodSeq = 0; } - if(goodSeq == 1 && pDataArray->maxLength != -1 && pDataArray->maxLength < currSeq.getNumBases()) { goodSeq = 0; } + string trashCode = ""; + //have the report files found you bad + map::iterator it = pDataArray->badSeqNames.find(currSeq.getName()); + if (it != pDataArray->badSeqNames.end()) { goodSeq = 0; trashCode = it->second; } //found it + + if (pDataArray->summaryfile == "") { + if(pDataArray->startPos != -1 && pDataArray->startPos < currSeq.getStartPos()) { goodSeq = 0; trashCode += "start|"; } + if(pDataArray->endPos != -1 && pDataArray->endPos > currSeq.getEndPos()) { goodSeq = 0; trashCode += "end|"; } + if(pDataArray->maxAmbig != -1 && pDataArray->maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; trashCode += "ambig|"; } + if(pDataArray->maxHomoP != -1 && pDataArray->maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; trashCode += "homop|"; } + if(pDataArray->minLength != -1 && pDataArray->minLength > currSeq.getNumBases()) { goodSeq = 0; trashCode += "maxLength != -1 && pDataArray->maxLength < currSeq.getNumBases()) { goodSeq = 0; trashCode += ">length|"; } + } + if (pDataArray->contigsreport == "") { //contigs report includes this so no need to check again + if(pDataArray->maxN != -1 && pDataArray->maxN < currSeq.getNumNs()) { goodSeq = 0; trashCode += "n|"; } + } + if(goodSeq == 1){ currSeq.printSequence(goodFile); } else{ - badAccnosFile << currSeq.getName() << endl; - pDataArray->badSeqNames.insert(currSeq.getName()); + badAccnosFile << currSeq.getName() << '\t' << trashCode.substr(0, trashCode.length()-1) << endl; + pDataArray->badSeqNames[currSeq.getName()] = trashCode; } } //report progress - if((i+1) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(i+1)); pDataArray->m->mothurOutEndLine(); } + if((i+1) % 100 == 0){ pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(i+1)+"\n"); } } //report progress - if((pDataArray->count) % 100 != 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); } + if((pDataArray->count) % 100 != 0){ pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(pDataArray->count)+"\n"); }