]> git.donarmstrong.com Git - mothur.git/blob - screenseqscommand.h
changed random forest output filename
[mothur.git] / screenseqscommand.h
1 #ifndef SCREENSEQSCOMMAND_H
2 #define SCREENSEQSCOMMAND_H
3
4 /*
5  *  screenseqscommand.h
6  *  Mothur
7  *
8  *  Created by Pat Schloss on 6/3/09.
9  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
10  *
11  */
12 #include "mothur.h"
13 #include "command.hpp"
14 #include "sequence.hpp"
15
16 class ScreenSeqsCommand : public Command {
17         
18 public:
19         ScreenSeqsCommand(string);
20         ScreenSeqsCommand();
21         ~ScreenSeqsCommand() {}
22         
23         vector<string> setParameters();
24         string getCommandName()                 { return "screen.seqs";                         }
25         string getCommandCategory()             { return "Sequence Processing";         }
26         
27         string getHelpString(); 
28     string getOutputPattern(string);    
29         string getCitation() { return "http://www.mothur.org/wiki/Screen.seqs"; }
30         string getDescription()         { return "enables you to keep sequences that fulfill certain user defined criteria"; }
31
32         int execute(); 
33         void help() { m->mothurOut(getHelpString()); }  
34         
35         
36 private:
37
38         struct linePair {
39                 unsigned long long start;
40                 unsigned long long end;
41                 linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
42         };
43
44         vector<linePair> lines;
45
46         int screenNameGroupFile(map<string, string>);
47         int screenGroupFile(map<string, string>);
48     int screenCountFile(map<string, string>);
49         int screenAlignReport(map<string, string>&);
50         int screenQual(map<string, string>);
51         int screenTaxonomy(map<string, string>);
52         
53     int optimizeContigs();
54     int optimizeAlign();
55         int driver(linePair, string, string, string, map<string, string>&);
56         int createProcesses(string, string, string, map<string, string>&);
57     int screenSummary(map<string, string>&);
58     int screenContigs(map<string, string>&);
59     int runFastaScreening(map<string, string>&);
60     int screenFasta(map<string, string>&);
61     int screenReports(map<string, string>&);
62         int getSummary(vector<unsigned long long>&);
63         int createProcessesCreateSummary(vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, string);
64         int driverCreateSummary(vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, string, linePair);  
65         int getSummaryReport();
66     int driverContigsSummary(vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, linePair);
67     int createProcessesContigsSummary(vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<linePair>);
68     int driverAlignSummary(vector<float>&, vector<float>&, vector<int>&, linePair);
69     int createProcessesAlignSummary(vector<float>&, vector<float>&, vector<int>&, vector<linePair>);
70     
71         #ifdef USE_MPI
72         int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&, map<string, string>&);
73         #endif
74
75         bool abort;
76         string fastafile, namefile, groupfile, alignreport, outputDir, qualfile, taxonomy, countfile, contigsreport, summaryfile;
77         int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, processors, minOverlap, oStart, oEnd, mismatches, maxN, maxInsert;
78     float minSim, minScore, criteria;
79         vector<string> outputNames;
80         vector<string> optimize;
81         map<string, int> nameMap;
82         
83     
84 };
85
86 /**************************************************************************************************/
87 //custom data structure for threads to use.
88 // This is passed by void pointer so it can be any data type
89 // that can be passed using a single void pointer (LPVOID).
90 struct sumData {
91         vector<int> startPosition;
92         vector<int> endPosition;
93         vector<int> seqLength; 
94         vector<int> ambigBases; 
95         vector<int> longHomoPolymer; 
96     vector<int> numNs;
97         string filename, namefile, countfile; 
98         unsigned long long start;
99         unsigned long long end;
100         int count;
101         MothurOut* m;
102         map<string, int> nameMap;
103         
104         
105         sumData(){}
106         sumData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, string nf, string cf, map<string, int> nam) {
107                 filename = f;
108         namefile = nf;
109         countfile = cf;
110                 m = mout;
111                 start = st;
112                 end = en;
113                 nameMap = nam;
114                 count = 0;
115         }
116 };
117 /**************************************************************************************************/
118 //custom data structure for threads to use.
119 // This is passed by void pointer so it can be any data type
120 // that can be passed using a single void pointer (LPVOID).
121 struct contigsSumData {
122         vector<int> ostartPosition;
123         vector<int> oendPosition;
124         vector<int> oLength; 
125         vector<int> omismatches; 
126     vector<int> numNs;
127         string filename, namefile, countfile; 
128         unsigned long long start;
129         unsigned long long end;
130         int count;
131         MothurOut* m;
132         map<string, int> nameMap;
133         
134         
135         contigsSumData(){}
136         contigsSumData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, string nf, string cf, map<string, int> nam) {
137                 filename = f;
138         namefile = nf;
139         countfile = cf;
140                 m = mout;
141                 start = st;
142                 end = en;
143                 nameMap = nam;
144                 count = 0;
145         }
146 };
147 /**************************************************************************************************/
148 struct alignsData {
149         vector<float> sims;
150         vector<float> scores;
151         vector<int> inserts;
152         string filename, namefile, countfile; 
153         unsigned long long start;
154         unsigned long long end;
155         int count;
156         MothurOut* m;
157         map<string, int> nameMap;
158         
159         
160         alignsData(){}
161         alignsData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, string nf, string cf, map<string, int> nam) {
162                 filename = f;
163         namefile = nf;
164         countfile = cf;
165                 m = mout;
166                 start = st;
167                 end = en;
168                 nameMap = nam;
169                 count = 0;
170         }
171 };
172
173 /**************************************************************************************************/
174 //custom data structure for threads to use.
175 // This is passed by void pointer so it can be any data type
176 // that can be passed using a single void pointer (LPVOID).
177 struct sumScreenData {
178     int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, maxN;
179         unsigned long long start;
180         unsigned long long end;
181         int count;
182         MothurOut* m;
183         string goodFName, badAccnosFName, filename;
184     map<string, string> badSeqNames;
185     string summaryfile, contigsreport;
186         
187         
188         sumScreenData(){}
189         sumScreenData(int s, int e, int a, int h, int minl, int maxl, int mn, map<string, string> bs, string f, string sum, string cont, MothurOut* mout, unsigned long long st, unsigned long long en, string gf, string bf) {
190                 startPos = s;
191                 endPos = e;
192                 minLength = minl;
193         maxLength = maxl;
194                 maxAmbig = a;
195                 maxHomoP = h;
196         maxN = mn;
197                 filename = f;
198         goodFName = gf;
199         badAccnosFName = bf;
200                 m = mout;
201                 start = st;
202                 end = en;
203         summaryfile = sum;
204         contigsreport = cont;
205         badSeqNames = bs;
206                 count = 0;
207         }
208 };
209
210
211 /**************************************************************************************************/
212 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
213 #else
214 static DWORD WINAPI MySumThreadFunction(LPVOID lpParam){ 
215         sumData* pDataArray;
216         pDataArray = (sumData*)lpParam;
217         
218         try {
219                 ifstream in;
220                 pDataArray->m->openInputFile(pDataArray->filename, in);
221         
222                 //print header if you are process 0
223                 if ((pDataArray->start == 0) || (pDataArray->start == 1)) {
224                         in.seekg(0);
225                 }else { //this accounts for the difference in line endings. 
226                         in.seekg(pDataArray->start-1); pDataArray->m->gobble(in); 
227                 }
228                 
229                 
230                 for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
231                         
232             pDataArray->count++;
233             
234                         if (pDataArray->m->control_pressed) { in.close();  pDataArray->count = 1; return 1; }
235                         
236                         Sequence current(in); pDataArray->m->gobble(in); 
237                         
238                         if (current.getName() != "") {
239                                 
240                                 int num = 1;
241                                 if ((pDataArray->namefile != "") || (pDataArray->countfile !="")){
242                                         //make sure this sequence is in the namefile, else error 
243                                         map<string, int>::iterator it = pDataArray->nameMap.find(current.getName());
244                                         
245                                         if (it == pDataArray->nameMap.end()) { pDataArray->m->mothurOut("[ERROR]: " + current.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); pDataArray->m->control_pressed = true; }
246                                         else { num = it->second; }
247                                 }
248                                 
249                                 //for each sequence this sequence represents
250                 int numns = current.getNumNs();
251                                 for (int i = 0; i < num; i++) {
252                                         pDataArray->startPosition.push_back(current.getStartPos());
253                                         pDataArray->endPosition.push_back(current.getEndPos());
254                                         pDataArray->seqLength.push_back(current.getNumBases());
255                                         pDataArray->ambigBases.push_back(current.getAmbigBases());
256                                         pDataArray->longHomoPolymer.push_back(current.getLongHomoPolymer());
257                     pDataArray->numNs.push_back(numns);
258                                 }
259             }
260                 }
261                 
262                 in.close();
263                 
264                 return 0;
265                 
266         }
267         catch(exception& e) {
268                 pDataArray->m->errorOut(e, "ScreenSeqsCommand", "MySumThreadFunction");
269                 exit(1);
270         }
271
272
273 /**************************************************************************************************/
274 static DWORD WINAPI MyContigsSumThreadFunction(LPVOID lpParam){ 
275         contigsSumData* pDataArray;
276         pDataArray = (contigsSumData*)lpParam;
277         
278         try {
279         string name;
280         //Name  Length  Overlap_Length  Overlap_Start   Overlap_End     MisMatches      Num_Ns
281         int length, OLength, thisOStart, thisOEnd, numMisMatches, numns;
282         
283                 ifstream in;
284                 pDataArray->m->openInputFile(pDataArray->filename, in);
285         
286                 //print header if you are process 0
287                 if ((pDataArray->start == 0) || (pDataArray->start == 1)) {
288                         in.seekg(0);  pDataArray->m->getline(in); pDataArray->m->gobble(in);
289                 }else { //this accounts for the difference in line endings. 
290                         in.seekg(pDataArray->start-1); pDataArray->m->gobble(in); 
291                 }
292                 
293                 
294                 for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
295             
296             pDataArray->count++;
297             
298                         if (pDataArray->m->control_pressed) { in.close();  pDataArray->count = 1; return 1; }
299                         
300             //seqname   start   end     nbases  ambigs  polymer numSeqs
301             in >> name >> length >> OLength >> thisOStart >> thisOEnd >> numMisMatches >> numns; pDataArray->m->gobble(in);
302             
303             int num = 1;
304             if ((pDataArray->namefile != "") || (pDataArray->countfile !="")){
305                 //make sure this sequence is in the namefile, else error 
306                 map<string, int>::iterator it = pDataArray->nameMap.find(name);
307                 
308                 if (it == pDataArray->nameMap.end()) { pDataArray->m->mothurOut("[ERROR]: " + name + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); pDataArray->m->control_pressed = true; }
309                 else { num = it->second; }
310             }
311             
312             //for each sequence this sequence represents
313             for (int i = 0; i < num; i++) {
314                 pDataArray->ostartPosition.push_back(thisOStart);
315                 pDataArray->oendPosition.push_back(thisOEnd);
316                 pDataArray->oLength.push_back(OLength);
317                 pDataArray->omismatches.push_back(numMisMatches);
318                 pDataArray->numNs.push_back(numns);
319             }
320                 }
321                 
322                 in.close();
323                 
324                 return 0;
325                 
326         }
327         catch(exception& e) {
328                 pDataArray->m->errorOut(e, "ScreenSeqsCommand", "MyContigsThreadFunction");
329                 exit(1);
330         }
331
332 /**************************************************************************************************/
333 static DWORD WINAPI MyAlignsThreadFunction(LPVOID lpParam){ 
334         alignsData* pDataArray;
335         pDataArray = (alignsData*)lpParam;
336         
337         try {
338         
339         string name, TemplateName, SearchMethod, AlignmentMethod;
340         //QueryName     QueryLength     TemplateName    TemplateLength  SearchMethod    SearchScore     AlignmentMethod QueryStart      QueryEnd        TemplateStart   TemplateEnd     PairwiseAlignmentLength GapsInQuery     GapsInTemplate  LongestInsert   SimBtwnQuery&Template
341         //checking for minScore, maxInsert, minSim
342         int length, TemplateLength,      QueryStart,    QueryEnd,       TemplateStart,  TemplateEnd,    PairwiseAlignmentLength,        GapsInQuery,    GapsInTemplate, LongestInsert;
343         float SearchScore, SimBtwnQueryTemplate;
344         
345         ifstream in;
346                 pDataArray->m->openInputFile(pDataArray->filename, in);
347         
348                 //print header if you are process 0
349                 if ((pDataArray->start == 0) || (pDataArray->start == 1)) {
350                         in.seekg(0);  pDataArray->m->getline(in); pDataArray->m->gobble(in);
351                 }else { //this accounts for the difference in line endings. 
352                         in.seekg(pDataArray->start-1); pDataArray->m->gobble(in); 
353                 }
354                 
355                 for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
356             
357             pDataArray->count++;
358             
359                         if (pDataArray->m->control_pressed) { in.close();  pDataArray->count = 1; return 1; }
360
361             in >> name >> length >> TemplateName >> TemplateLength >> SearchMethod >> SearchScore >> AlignmentMethod >> QueryStart >> QueryEnd >> TemplateStart >> TemplateEnd >> PairwiseAlignmentLength >> GapsInQuery >> GapsInTemplate >> LongestInsert >> SimBtwnQueryTemplate; pDataArray->m->gobble(in);
362             cout << i << '\t' << name << endl;
363             int num = 1;
364             if ((pDataArray->namefile != "") || (pDataArray->countfile !="")){
365                 //make sure this sequence is in the namefile, else error 
366                 map<string, int>::iterator it = pDataArray->nameMap.find(name);
367                 
368                 if (it == pDataArray->nameMap.end()) { pDataArray->m->mothurOut("[ERROR]: " + name + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); pDataArray->m->control_pressed = true; }
369                 else { num = it->second; }
370             }
371             
372             //for each sequence this sequence represents
373             for (int i = 0; i < num; i++) {
374                 pDataArray->sims.push_back(SimBtwnQueryTemplate);
375                 pDataArray->scores.push_back(SearchScore);
376                 pDataArray->inserts.push_back(LongestInsert);
377             }
378                 }
379                 
380                 in.close();
381                 
382                 return 0;
383         
384     }
385         catch(exception& e) {
386                 pDataArray->m->errorOut(e, "ScreenSeqsCommand", "MyAlignsThreadFunction");
387                 exit(1);
388         }
389
390
391 /**************************************************************************************************/
392 static DWORD WINAPI MySumScreenThreadFunction(LPVOID lpParam){ 
393         sumScreenData* pDataArray;
394         pDataArray = (sumScreenData*)lpParam;
395         
396         try {
397         
398         ofstream goodFile;
399                 pDataArray->m->openOutputFile(pDataArray->goodFName, goodFile);
400                 
401                 ofstream badAccnosFile;
402                 pDataArray->m->openOutputFile(pDataArray->badAccnosFName, badAccnosFile);
403                 
404                 ifstream in;
405                 pDataArray->m->openInputFile(pDataArray->filename, in);
406         
407                 //print header if you are process 0
408                 if ((pDataArray->start == 0) || (pDataArray->start == 1)) {
409                         in.seekg(0);
410                 }else { //this accounts for the difference in line endings. 
411                         in.seekg(pDataArray->start-1); pDataArray->m->gobble(in); 
412                 }
413         
414                 for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
415                         
416             pDataArray->count++;
417             
418                         if (pDataArray->m->control_pressed) { in.close(); badAccnosFile.close(); goodFile.close(); pDataArray->count = 1; return 1; }
419                         
420                         Sequence currSeq(in); pDataArray->m->gobble(in); 
421                         
422                         if (currSeq.getName() != "") {
423                                 bool goodSeq = 1;               //      innocent until proven guilty
424                 string trashCode = "";
425                 //have the report files found you bad
426                 map<string, string>::iterator it = pDataArray->badSeqNames.find(currSeq.getName());
427                 if (it != pDataArray->badSeqNames.end()) { goodSeq = 0;  trashCode = it->second; } //found it 
428                 
429                 if (pDataArray->summaryfile == "") {
430                     if(pDataArray->startPos != -1 && pDataArray->startPos < currSeq.getStartPos())                      {       goodSeq = 0;    trashCode += "start|"; }
431                     if(pDataArray->endPos != -1 && pDataArray->endPos > currSeq.getEndPos())                            {       goodSeq = 0;    trashCode += "end|"; }
432                     if(pDataArray->maxAmbig != -1 && pDataArray->maxAmbig <     currSeq.getAmbigBases())                {       goodSeq = 0;    trashCode += "ambig|"; }
433                     if(pDataArray->maxHomoP != -1 && pDataArray->maxHomoP < currSeq.getLongHomoPolymer())       {       goodSeq = 0;    trashCode += "homop|"; }
434                     if(pDataArray->minLength != -1 && pDataArray->minLength > currSeq.getNumBases())            {       goodSeq = 0;    trashCode += "<length|"; }
435                     if(pDataArray->maxLength != -1 && pDataArray->maxLength < currSeq.getNumBases())            {       goodSeq = 0;    trashCode += ">length|"; }
436                 }
437                 if (pDataArray->contigsreport == "") { //contigs report includes this so no need to check again
438                     if(pDataArray->maxN != -1 && pDataArray->maxN < currSeq.getNumNs())                     {   goodSeq = 0;    trashCode += "n|"; }
439                 }
440                                 
441                 
442                                 if(goodSeq == 1){
443                                         currSeq.printSequence(goodFile);        
444                                 }
445                                 else{
446                                         badAccnosFile << currSeq.getName() << '\t' << trashCode.substr(0, trashCode.length()-1) << endl;
447                                         pDataArray->badSeqNames[currSeq.getName()] = trashCode;
448                                 }
449     
450                         }               
451             //report progress
452                         if((i+1) % 100 == 0){   pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(i+1)+"\n");             }
453                 }
454                 //report progress
455                 if((pDataArray->count) % 100 != 0){     pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(pDataArray->count)+"\n");               }
456                 
457
458                 
459                 in.close();
460         goodFile.close();
461         badAccnosFile.close();
462                 
463                 return 0;
464                 
465         }
466         catch(exception& e) {
467                 pDataArray->m->errorOut(e, "ScreenSeqsCommand", "MySumScreenThreadFunction");
468                 exit(1);
469         }
470
471
472 #endif
473
474 /**************************************************************************************************/
475
476
477
478 #endif