1 #ifndef SCREENSEQSCOMMAND_H
2 #define SCREENSEQSCOMMAND_H
8 * Created by Pat Schloss on 6/3/09.
9 * Copyright 2009 Patrick D. Schloss. All rights reserved.
13 #include "command.hpp"
14 #include "sequence.hpp"
16 class ScreenSeqsCommand : public Command {
19 ScreenSeqsCommand(string);
21 ~ScreenSeqsCommand() {}
23 vector<string> setParameters();
24 string getCommandName() { return "screen.seqs"; }
25 string getCommandCategory() { return "Sequence Processing"; }
27 string getHelpString();
28 string getOutputPattern(string);
29 string getCitation() { return "http://www.mothur.org/wiki/Screen.seqs"; }
30 string getDescription() { return "enables you to keep sequences that fulfill certain user defined criteria"; }
33 void help() { m->mothurOut(getHelpString()); }
39 unsigned long long start;
40 unsigned long long end;
41 linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
44 vector<linePair> lines;
46 int screenNameGroupFile(map<string, string>);
47 int screenGroupFile(map<string, string>);
48 int screenCountFile(map<string, string>);
49 int screenAlignReport(map<string, string>&);
50 int screenQual(map<string, string>);
51 int screenTaxonomy(map<string, string>);
53 int optimizeContigs();
55 int driver(linePair, string, string, string, map<string, string>&);
56 int createProcesses(string, string, string, map<string, string>&);
57 int screenSummary(map<string, string>&);
58 int screenContigs(map<string, string>&);
59 int runFastaScreening(map<string, string>&);
60 int screenFasta(map<string, string>&);
61 int screenReports(map<string, string>&);
62 int getSummary(vector<unsigned long long>&);
63 int createProcessesCreateSummary(vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, string);
64 int driverCreateSummary(vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, string, linePair);
65 int getSummaryReport();
66 int driverContigsSummary(vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, linePair);
67 int createProcessesContigsSummary(vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<linePair>);
68 int driverAlignSummary(vector<float>&, vector<float>&, vector<int>&, linePair);
69 int createProcessesAlignSummary(vector<float>&, vector<float>&, vector<int>&, vector<linePair>);
72 int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&, map<string, string>&);
76 string fastafile, namefile, groupfile, alignreport, outputDir, qualfile, taxonomy, countfile, contigsreport, summaryfile;
77 int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, processors, criteria, minOverlap, oStart, oEnd, mismatches, maxN, maxInsert;
78 float minSim, minScore;
79 vector<string> outputNames;
80 vector<string> optimize;
81 map<string, int> nameMap;
86 /**************************************************************************************************/
87 //custom data structure for threads to use.
88 // This is passed by void pointer so it can be any data type
89 // that can be passed using a single void pointer (LPVOID).
91 vector<int> startPosition;
92 vector<int> endPosition;
93 vector<int> seqLength;
94 vector<int> ambigBases;
95 vector<int> longHomoPolymer;
97 string filename, namefile, countfile;
98 unsigned long long start;
99 unsigned long long end;
102 map<string, int> nameMap;
106 sumData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, string nf, string cf, map<string, int> nam) {
117 /**************************************************************************************************/
118 //custom data structure for threads to use.
119 // This is passed by void pointer so it can be any data type
120 // that can be passed using a single void pointer (LPVOID).
121 struct contigsSumData {
122 vector<int> ostartPosition;
123 vector<int> oendPosition;
125 vector<int> omismatches;
127 string filename, namefile, countfile;
128 unsigned long long start;
129 unsigned long long end;
132 map<string, int> nameMap;
136 contigsSumData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, string nf, string cf, map<string, int> nam) {
147 /**************************************************************************************************/
150 vector<float> scores;
152 string filename, namefile, countfile;
153 unsigned long long start;
154 unsigned long long end;
157 map<string, int> nameMap;
161 alignsData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, string nf, string cf, map<string, int> nam) {
173 /**************************************************************************************************/
174 //custom data structure for threads to use.
175 // This is passed by void pointer so it can be any data type
176 // that can be passed using a single void pointer (LPVOID).
177 struct sumScreenData {
178 int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, maxN;
179 unsigned long long start;
180 unsigned long long end;
183 string goodFName, badAccnosFName, filename;
184 map<string, string> badSeqNames;
185 string summaryfile, contigsreport;
189 sumScreenData(int s, int e, int a, int h, int minl, int maxl, int mn, map<string, string> bs, string f, string sum, string cont, MothurOut* mout, unsigned long long st, unsigned long long en, string gf, string bf) {
204 contigsreport = cont;
211 /**************************************************************************************************/
212 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
214 static DWORD WINAPI MySumThreadFunction(LPVOID lpParam){
216 pDataArray = (sumData*)lpParam;
220 pDataArray->m->openInputFile(pDataArray->filename, in);
222 //print header if you are process 0
223 if ((pDataArray->start == 0) || (pDataArray->start == 1)) {
225 }else { //this accounts for the difference in line endings.
226 in.seekg(pDataArray->start-1); pDataArray->m->gobble(in);
230 for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
234 if (pDataArray->m->control_pressed) { in.close(); pDataArray->count = 1; return 1; }
236 Sequence current(in); pDataArray->m->gobble(in);
238 if (current.getName() != "") {
241 if ((pDataArray->namefile != "") || (pDataArray->countfile !="")){
242 //make sure this sequence is in the namefile, else error
243 map<string, int>::iterator it = pDataArray->nameMap.find(current.getName());
245 if (it == pDataArray->nameMap.end()) { pDataArray->m->mothurOut("[ERROR]: " + current.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); pDataArray->m->control_pressed = true; }
246 else { num = it->second; }
249 //for each sequence this sequence represents
250 int numns = current.getNumNs();
251 for (int i = 0; i < num; i++) {
252 pDataArray->startPosition.push_back(current.getStartPos());
253 pDataArray->endPosition.push_back(current.getEndPos());
254 pDataArray->seqLength.push_back(current.getNumBases());
255 pDataArray->ambigBases.push_back(current.getAmbigBases());
256 pDataArray->longHomoPolymer.push_back(current.getLongHomoPolymer());
257 pDataArray->numNs.push_back(numns);
267 catch(exception& e) {
268 pDataArray->m->errorOut(e, "ScreenSeqsCommand", "MySumThreadFunction");
273 /**************************************************************************************************/
274 static DWORD WINAPI MyContigsSumThreadFunction(LPVOID lpParam){
275 contigsSumData* pDataArray;
276 pDataArray = (contigsSumData*)lpParam;
280 //Name Length Overlap_Length Overlap_Start Overlap_End MisMatches Num_Ns
281 int length, OLength, thisOStart, thisOEnd, numMisMatches, numns;
284 pDataArray->m->openInputFile(pDataArray->filename, in);
286 //print header if you are process 0
287 if ((pDataArray->start == 0) || (pDataArray->start == 1)) {
288 in.seekg(0); pDataArray->m->getline(in); pDataArray->m->gobble(in);
289 }else { //this accounts for the difference in line endings.
290 in.seekg(pDataArray->start-1); pDataArray->m->gobble(in);
294 for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
298 if (pDataArray->m->control_pressed) { in.close(); pDataArray->count = 1; return 1; }
300 //seqname start end nbases ambigs polymer numSeqs
301 in >> name >> length >> OLength >> thisOStart >> thisOEnd >> numMisMatches >> numns; pDataArray->m->gobble(in);
304 if ((pDataArray->namefile != "") || (pDataArray->countfile !="")){
305 //make sure this sequence is in the namefile, else error
306 map<string, int>::iterator it = pDataArray->nameMap.find(name);
308 if (it == pDataArray->nameMap.end()) { pDataArray->m->mothurOut("[ERROR]: " + name + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); pDataArray->m->control_pressed = true; }
309 else { num = it->second; }
312 //for each sequence this sequence represents
313 for (int i = 0; i < num; i++) {
314 pDataArray->ostartPosition.push_back(thisOStart);
315 pDataArray->oendPosition.push_back(thisOEnd);
316 pDataArray->oLength.push_back(OLength);
317 pDataArray->omismatches.push_back(numMisMatches);
318 pDataArray->numNs.push_back(numns);
327 catch(exception& e) {
328 pDataArray->m->errorOut(e, "ScreenSeqsCommand", "MyContigsThreadFunction");
332 /**************************************************************************************************/
333 static DWORD WINAPI MyAlignsThreadFunction(LPVOID lpParam){
334 alignsData* pDataArray;
335 pDataArray = (alignsData*)lpParam;
339 string name, TemplateName, SearchMethod, AlignmentMethod;
340 //QueryName QueryLength TemplateName TemplateLength SearchMethod SearchScore AlignmentMethod QueryStart QueryEnd TemplateStart TemplateEnd PairwiseAlignmentLength GapsInQuery GapsInTemplate LongestInsert SimBtwnQuery&Template
341 //checking for minScore, maxInsert, minSim
342 int length, TemplateLength, QueryStart, QueryEnd, TemplateStart, TemplateEnd, PairwiseAlignmentLength, GapsInQuery, GapsInTemplate, LongestInsert;
343 float SearchScore, SimBtwnQueryTemplate;
346 pDataArray->m->openInputFile(pDataArray->filename, in);
348 //print header if you are process 0
349 if ((pDataArray->start == 0) || (pDataArray->start == 1)) {
350 in.seekg(0); pDataArray->m->getline(in); pDataArray->m->gobble(in);
351 }else { //this accounts for the difference in line endings.
352 in.seekg(pDataArray->start-1); pDataArray->m->gobble(in);
355 for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
359 if (pDataArray->m->control_pressed) { in.close(); pDataArray->count = 1; return 1; }
361 in >> name >> length >> TemplateName >> TemplateLength >> SearchMethod >> SearchScore >> AlignmentMethod >> QueryStart >> QueryEnd >> TemplateStart >> TemplateEnd >> PairwiseAlignmentLength >> GapsInQuery >> GapsInTemplate >> LongestInsert >> SimBtwnQueryTemplate; pDataArray->m->gobble(in);
362 cout << i << '\t' << name << endl;
364 if ((pDataArray->namefile != "") || (pDataArray->countfile !="")){
365 //make sure this sequence is in the namefile, else error
366 map<string, int>::iterator it = pDataArray->nameMap.find(name);
368 if (it == pDataArray->nameMap.end()) { pDataArray->m->mothurOut("[ERROR]: " + name + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); pDataArray->m->control_pressed = true; }
369 else { num = it->second; }
372 //for each sequence this sequence represents
373 for (int i = 0; i < num; i++) {
374 pDataArray->sims.push_back(SimBtwnQueryTemplate);
375 pDataArray->scores.push_back(SearchScore);
376 pDataArray->inserts.push_back(LongestInsert);
385 catch(exception& e) {
386 pDataArray->m->errorOut(e, "ScreenSeqsCommand", "MyAlignsThreadFunction");
391 /**************************************************************************************************/
392 static DWORD WINAPI MySumScreenThreadFunction(LPVOID lpParam){
393 sumScreenData* pDataArray;
394 pDataArray = (sumScreenData*)lpParam;
399 pDataArray->m->openOutputFile(pDataArray->goodFName, goodFile);
401 ofstream badAccnosFile;
402 pDataArray->m->openOutputFile(pDataArray->badAccnosFName, badAccnosFile);
405 pDataArray->m->openInputFile(pDataArray->filename, in);
407 //print header if you are process 0
408 if ((pDataArray->start == 0) || (pDataArray->start == 1)) {
410 }else { //this accounts for the difference in line endings.
411 in.seekg(pDataArray->start-1); pDataArray->m->gobble(in);
414 for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
418 if (pDataArray->m->control_pressed) { in.close(); badAccnosFile.close(); goodFile.close(); pDataArray->count = 1; return 1; }
420 Sequence currSeq(in); pDataArray->m->gobble(in);
422 if (currSeq.getName() != "") {
423 bool goodSeq = 1; // innocent until proven guilty
424 string trashCode = "";
425 //have the report files found you bad
426 map<string, string>::iterator it = pDataArray->badSeqNames.find(currSeq.getName());
427 if (it != pDataArray->badSeqNames.end()) { goodSeq = 0; trashCode = it->second; } //found it
429 if (pDataArray->summaryfile == "") {
430 if(pDataArray->startPos != -1 && pDataArray->startPos < currSeq.getStartPos()) { goodSeq = 0; trashCode += "start|"; }
431 if(pDataArray->endPos != -1 && pDataArray->endPos > currSeq.getEndPos()) { goodSeq = 0; trashCode += "end|"; }
432 if(pDataArray->maxAmbig != -1 && pDataArray->maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; trashCode += "ambig|"; }
433 if(pDataArray->maxHomoP != -1 && pDataArray->maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; trashCode += "homop|"; }
434 if(pDataArray->minLength != -1 && pDataArray->minLength > currSeq.getNumBases()) { goodSeq = 0; trashCode += "<length|"; }
435 if(pDataArray->maxLength != -1 && pDataArray->maxLength < currSeq.getNumBases()) { goodSeq = 0; trashCode += ">length|"; }
437 if (pDataArray->contigsreport == "") { //contigs report includes this so no need to check again
438 if(pDataArray->maxN != -1 && pDataArray->maxN < currSeq.getNumNs()) { goodSeq = 0; trashCode += "n|"; }
443 currSeq.printSequence(goodFile);
446 badAccnosFile << currSeq.getName() << '\t' << trashCode.substr(0, trashCode.length()-1) << endl;
447 pDataArray->badSeqNames[currSeq.getName()] = trashCode;
452 if((i+1) % 100 == 0){ pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(i+1)+"\n"); }
455 if((pDataArray->count) % 100 != 0){ pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(pDataArray->count)+"\n"); }
461 badAccnosFile.close();
466 catch(exception& e) {
467 pDataArray->m->errorOut(e, "ScreenSeqsCommand", "MySumScreenThreadFunction");
474 /**************************************************************************************************/