]> git.donarmstrong.com Git - mothur.git/blob - pcrseqscommand.h
added classify.shared command and random forest files. added count file to pcr.seqs...
[mothur.git] / pcrseqscommand.h
1 #ifndef Mothur_pcrseqscommand_h
2 #define Mothur_pcrseqscommand_h
3
4 //
5 //  pcrseqscommand.h
6 //  Mothur
7 //
8 //  Created by Sarah Westcott on 3/14/12.
9 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
10 //
11
12
13 #include "command.hpp"
14 #include "sequence.hpp"
15 #include "trimoligos.h"
16 #include "alignment.hpp"
17 #include "needlemanoverlap.hpp"
18 #include "counttable.h"
19
20 class PcrSeqsCommand : public Command {
21 public:
22         PcrSeqsCommand(string);
23         PcrSeqsCommand();
24         ~PcrSeqsCommand(){}
25         
26         vector<string> setParameters();
27         string getCommandName()                 { return "pcr.seqs";    }
28         string getCommandCategory()             { return "Sequence Processing";         }
29         string getOutputFileNameTag(string, string);
30         string getHelpString(); 
31         string getCitation() { return "http://www.mothur.org/wiki/Pcr.seqs"; }
32         string getDescription()         { return "pcr.seqs"; }
33     
34         int execute(); 
35         void help() { m->mothurOut(getHelpString()); }  
36         
37 private:
38     
39     struct linePair {
40         unsigned long long start;
41         unsigned long long end;
42         linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
43         linePair() {}
44     };
45     
46     vector<linePair> lines;
47         bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
48     bool abort, keepprimer, keepdots;
49         string fastafile, oligosfile, taxfile, groupfile, namefile, countfile, ecolifile, outputDir, nomatch;
50         int start, end, processors, length;
51         
52     vector<string> revPrimer, outputNames;
53         vector<string> primers;
54     
55     int writeAccnos(set<string>);
56     int readName(set<string>&);
57     int readGroup(set<string>);
58     int readTax(set<string>);
59     int readCount(set<string>);
60     bool readOligos();
61     bool readEcoli();
62         int driverPcr(string, string, string, set<string>&, linePair);  
63         int createProcesses(string, string, string, set<string>&);
64     bool findForward(Sequence&, int&, int&);
65     bool findReverse(Sequence&, int&, int&);
66     bool isAligned(string, map<int, int>&);
67     bool compareDNASeq(string, string);
68     string reverseOligo(string);
69 };
70
71 /**************************************************************************************************/
72 //custom data structure for threads to use.
73 // This is passed by void pointer so it can be any data type
74 // that can be passed using a single void pointer (LPVOID).
75 struct pcrData {
76         string filename; 
77     string goodFasta, badFasta, oligosfile, ecolifile, nomatch;
78         unsigned long long fstart;
79         unsigned long long fend;
80         int count, start, end, length;
81         MothurOut* m;
82         vector<string> primers;
83     vector<string> revPrimer;
84     set<string> badSeqNames;
85     bool keepprimer, keepdots;
86         
87         
88         pcrData(){}
89         pcrData(string f, string gf, string bfn, MothurOut* mout, string ol, string ec, vector<string> pr, vector<string> rpr, string nm, bool kp, bool kd, int st, int en, int l, unsigned long long fst, unsigned long long fen) {
90                 filename = f;
91         goodFasta = gf;
92         badFasta = bfn;
93                 m = mout;
94         oligosfile = ol;
95         ecolifile = ec;
96         primers = pr;
97         revPrimer = rpr;
98         nomatch = nm;
99         keepprimer = kp;
100         keepdots = kd;
101                 start = st;
102                 end = en;
103         length = l;
104                 fstart = fst;
105         fend = fen;
106                 count = 0;
107         }
108 };
109 /**************************************************************************************************/
110 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
111 #else
112 static DWORD WINAPI MyPcrThreadFunction(LPVOID lpParam){ 
113         pcrData* pDataArray;
114         pDataArray = (pcrData*)lpParam;
115         
116         try {
117         ofstream goodFile;
118                 pDataArray->m->openOutputFile(pDataArray->goodFasta, goodFile);
119         
120         ofstream badFile;
121                 pDataArray->m->openOutputFile(pDataArray->badFasta, badFile);
122                 
123                 ifstream inFASTA;
124                 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
125         
126                 //print header if you are process 0
127                 if ((pDataArray->fstart == 0) || (pDataArray->fstart == 1)) {
128                         inFASTA.seekg(0);
129                 }else { //this accounts for the difference in line endings. 
130                         inFASTA.seekg(pDataArray->fstart-1); pDataArray->m->gobble(inFASTA); 
131                 }
132         
133         set<int> lengths;
134                 pDataArray->count = pDataArray->fend;
135                 for(int i = 0; i < pDataArray->fend; i++){ //end is the number of sequences to process
136             
137                         if (pDataArray->m->control_pressed) {  break; }
138                         
139                         Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
140           
141             string trashCode = "";
142                         if (currSeq.getName() != "") {
143                 
144                 bool goodSeq = true;
145                 if (pDataArray->oligosfile != "") {
146                     map<int, int> mapAligned;
147                     //bool aligned = isAligned(currSeq.getAligned(), mapAligned);
148                     ///////////////////////////////////////////////////////////////
149                     bool aligned = false;
150                     string seq = currSeq.getAligned(); 
151                     int countBases = 0;
152                     for (int k = 0; k < seq.length(); k++) {
153                         if (!isalpha(seq[k])) { aligned = true; }
154                         else { mapAligned[countBases] = k; countBases++; } //maps location in unaligned -> location in aligned.
155                     }                                                   //ie. the 3rd base may be at spot 10 in the alignment
156                                                                         //later when we trim we want to trim from spot 10.
157                     ///////////////////////////////////////////////////////////////
158                     
159                     //process primers
160                     if (pDataArray->primers.size() != 0) {
161                         int primerStart = 0; int primerEnd = 0;
162                         //bool good = findForward(currSeq, primerStart, primerEnd);
163                         ///////////////////////////////////////////////////////////////
164                         bool good = false;
165                         string rawSequence = currSeq.getUnaligned();
166                         
167                         for(int j=0;j<pDataArray->primers.size();j++){
168                             string oligo = pDataArray->primers[j];
169                             
170                             if (pDataArray->m->control_pressed) {  primerStart = 0; primerEnd = 0; good = false; break; }
171                             
172                             if(rawSequence.length() < oligo.length()) {  break;  }
173                             
174                             //search for primer
175                             int olength = oligo.length();
176                             for (int l = 0; l < rawSequence.length()-olength; l++){
177                                 if (pDataArray->m->control_pressed) {  primerStart = 0; primerEnd = 0; good = false; break; }
178                                 string rawChunk = rawSequence.substr(l, olength);
179                                 //compareDNASeq(oligo, rawChunk)
180                                 ////////////////////////////////////////////////////////
181                                 bool success = 1;
182                                 for(int k=0;k<olength;k++){
183                                     
184                                     if(oligo[k] != rawChunk[k]){
185                                         if(oligo[k] == 'A' || oligo[k] == 'T' || oligo[k] == 'G' || oligo[k] == 'C')    {       success = 0;    }
186                                         else if((oligo[k] == 'N' || oligo[k] == 'I') && (rawChunk[k] == 'N'))                           {       success = 0;    }
187                                         else if(oligo[k] == 'R' && (rawChunk[k] != 'A' && rawChunk[k] != 'G'))                                  {       success = 0;    }
188                                         else if(oligo[k] == 'Y' && (rawChunk[k] != 'C' && rawChunk[k] != 'T'))                                  {       success = 0;    }
189                                         else if(oligo[k] == 'M' && (rawChunk[k] != 'C' && rawChunk[k] != 'A'))                                  {       success = 0;    }
190                                         else if(oligo[k] == 'K' && (rawChunk[k] != 'T' && rawChunk[k] != 'G'))                                  {       success = 0;    }
191                                         else if(oligo[k] == 'W' && (rawChunk[k] != 'T' && rawChunk[k] != 'A'))                                  {       success = 0;    }
192                                         else if(oligo[k] == 'S' && (rawChunk[k] != 'C' && rawChunk[k] != 'G'))                                  {       success = 0;    }
193                                         else if(oligo[k] == 'B' && (rawChunk[k] != 'C' && rawChunk[k] != 'T' && rawChunk[k] != 'G'))    {       success = 0;    }
194                                         else if(oligo[k] == 'D' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'G'))    {       success = 0;    }
195                                         else if(oligo[k] == 'H' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'C'))    {       success = 0;    }
196                                         else if(oligo[k] == 'V' && (rawChunk[k] != 'A' && rawChunk[k] != 'C' && rawChunk[k] != 'G'))    {       success = 0;    }                       
197                                         
198                                         if(success == 0)        {       break;   }
199                                     }
200                                     else{
201                                         success = 1;
202                                     }
203                                 }
204                                 
205                                 ////////////////////////////////////////////////////////////////////
206                                 if(success) {
207                                     primerStart = j;
208                                     primerEnd = primerStart + olength;
209                                     good = true; break;
210                                 }
211                             }
212                             if (good) { break; }
213                         }       
214                         
215                         if (!good) { primerStart = 0; primerEnd = 0; }
216                         ///////////////////////////////////////////////////////////////
217                         
218                         
219                         if(!good){      if (pDataArray->nomatch == "reject") { goodSeq = false; } trashCode += "f";     }
220                         else{
221                             //are you aligned
222                             if (aligned) { 
223                                 if (!pDataArray->keepprimer)    {  
224                                     if (pDataArray->keepdots)   { currSeq.filterToPos(mapAligned[primerEnd]);   }
225                                     else            { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerEnd]));                                              }
226                                 } 
227                                 else                {  
228                                     if (pDataArray->keepdots)   { currSeq.filterToPos(mapAligned[primerStart]);  }
229                                     else            { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart]));                                              }
230                                 }
231                             }else { 
232                                 if (!pDataArray->keepprimer)    { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); } 
233                                 else                { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); } 
234                             }
235                         }
236                     }
237                     
238                     //process reverse primers
239                     if (pDataArray->revPrimer.size() != 0) {
240                         int primerStart = 0; int primerEnd = 0;
241                         bool good = false;
242                         //findReverse(currSeq, primerStart, primerEnd);
243                          ///////////////////////////////////////////////////////////////
244                         string rawSequence = currSeq.getUnaligned();
245                         
246                         for(int j=0;j<pDataArray->revPrimer.size();j++){
247                             string oligo = pDataArray->revPrimer[j];
248                             if (pDataArray->m->control_pressed) {  primerStart = 0; primerEnd = 0; good = false; break; }
249                             if(rawSequence.length() < oligo.length()) {  break;  }
250                             
251                             //search for primer
252                             int olength = oligo.length();
253                             for (int l = rawSequence.length()-olength; l >= 0; l--){
254                                 
255                                 string rawChunk = rawSequence.substr(l, olength);
256                                 //compareDNASeq(oligo, rawChunk)
257                                 ////////////////////////////////////////////////////////
258                                 bool success = 1;
259                                 for(int k=0;k<olength;k++){
260                                     
261                                     if(oligo[k] != rawChunk[k]){
262                                         if(oligo[k] == 'A' || oligo[k] == 'T' || oligo[k] == 'G' || oligo[k] == 'C')    {       success = 0;    }
263                                         else if((oligo[k] == 'N' || oligo[k] == 'I') && (rawChunk[k] == 'N'))                           {       success = 0;    }
264                                         else if(oligo[k] == 'R' && (rawChunk[k] != 'A' && rawChunk[k] != 'G'))                                  {       success = 0;    }
265                                         else if(oligo[k] == 'Y' && (rawChunk[k] != 'C' && rawChunk[k] != 'T'))                                  {       success = 0;    }
266                                         else if(oligo[k] == 'M' && (rawChunk[k] != 'C' && rawChunk[k] != 'A'))                                  {       success = 0;    }
267                                         else if(oligo[k] == 'K' && (rawChunk[k] != 'T' && rawChunk[k] != 'G'))                                  {       success = 0;    }
268                                         else if(oligo[k] == 'W' && (rawChunk[k] != 'T' && rawChunk[k] != 'A'))                                  {       success = 0;    }
269                                         else if(oligo[k] == 'S' && (rawChunk[k] != 'C' && rawChunk[k] != 'G'))                                  {       success = 0;    }
270                                         else if(oligo[k] == 'B' && (rawChunk[k] != 'C' && rawChunk[k] != 'T' && rawChunk[k] != 'G'))    {       success = 0;    }
271                                         else if(oligo[k] == 'D' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'G'))    {       success = 0;    }
272                                         else if(oligo[k] == 'H' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'C'))    {       success = 0;    }
273                                         else if(oligo[k] == 'V' && (rawChunk[k] != 'A' && rawChunk[k] != 'C' && rawChunk[k] != 'G'))    {       success = 0;    }                       
274                                         
275                                         if(success == 0)        {       break;   }
276                                     }
277                                     else{
278                                         success = 1;
279                                     }
280                                 }
281                                 
282                                 ////////////////////////////////////////////////////////////////////
283                                 if(success) {
284                                     primerStart = j;
285                                     primerEnd = primerStart + olength;
286                                     good = true; break;
287                                 }
288                             }
289                             if (good) { break; }
290                         }       
291                         
292                         if (!good) { primerStart = 0; primerEnd = 0; }
293
294                          ///////////////////////////////////////////////////////////////
295                         if(!good){      if (pDataArray->nomatch == "reject") { goodSeq = false; } trashCode += "r";     }
296                         else{ 
297                             //are you aligned
298                             if (aligned) { 
299                                 if (!pDataArray->keepprimer)    {  
300                                     if (pDataArray->keepdots)   { currSeq.filterFromPos(mapAligned[primerStart]); }
301                                     else            { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerStart]));   }
302                                 } 
303                                 else                {  
304                                     if (pDataArray->keepdots)   { currSeq.filterFromPos(mapAligned[primerEnd]); }
305                                     else            { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerEnd]));   }
306                                 }                             }
307                             else { 
308                                 if (!pDataArray->keepprimer)    { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerStart));   } 
309                                 else                { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerEnd));     }
310                             }
311                         }
312                     }
313                 }else if (pDataArray->ecolifile != "") {
314                     //make sure the seqs are aligned
315                     lengths.insert(currSeq.getAligned().length());
316                     if (lengths.size() > 1) { pDataArray->m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); pDataArray->m->control_pressed = true; break; }
317                     else if (currSeq.getAligned().length() != pDataArray->length) {
318                         pDataArray->m->mothurOut("[ERROR]: seqs are not the same length as ecoli seq. When using ecoli option your sequences must be aligned and the same length as the ecoli sequence.\n"); pDataArray->m->control_pressed = true; break; 
319                     }else {
320                         if (pDataArray->keepdots)   { 
321                             currSeq.filterToPos(pDataArray->start); 
322                             currSeq.filterFromPos(pDataArray->end);
323                         }else {
324                             string seqString = currSeq.getAligned().substr(0, pDataArray->end);
325                             seqString = seqString.substr(pDataArray->start);
326                             currSeq.setAligned(seqString); 
327                         }
328                     }
329                 }else{ //using start and end to trim
330                     //make sure the seqs are aligned
331                     lengths.insert(currSeq.getAligned().length());
332                     if (lengths.size() > 1) { pDataArray->m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); pDataArray->m->control_pressed = true; break; }
333                     else {
334                         if (pDataArray->end != -1) {
335                             if (pDataArray->end > currSeq.getAligned().length()) {  pDataArray->m->mothurOut("[ERROR]: end is longer than your sequence length, aborting.\n"); pDataArray->m->control_pressed = true; break; }
336                             else {
337                                 if (pDataArray->keepdots)   { currSeq.filterFromPos(pDataArray->end); }
338                                 else {
339                                     string seqString = currSeq.getAligned().substr(0, pDataArray->end);
340                                     currSeq.setAligned(seqString); 
341                                 }
342                             }
343                         }
344                         if (pDataArray->start != -1) { 
345                             if (pDataArray->keepdots)   {  currSeq.filterToPos(pDataArray->start);  }
346                             else {
347                                 string seqString = currSeq.getAligned().substr(pDataArray->start);
348                                 currSeq.setAligned(seqString); 
349                             }
350                         }
351                         
352                     }
353                 }
354                 
355                                 if(goodSeq == 1)    {   currSeq.printSequence(goodFile);        }
356                                 else {  
357                     pDataArray->badSeqNames.insert(currSeq.getName()); 
358                     currSeq.setName(currSeq.getName() + '|' + trashCode);
359                     currSeq.printSequence(badFile); 
360                 }
361                         }
362                                                 
363                         //report progress
364                         if((i+1) % 100 == 0){   pDataArray->m->mothurOut("Processing sequence: " + toString(i+1)); pDataArray->m->mothurOutEndLine();           }
365                 }
366                 //report progress
367                 if((pDataArray->count) % 100 != 0){     pDataArray->m->mothurOut("Thread Processing sequence: " + toString(pDataArray->count)); pDataArray->m->mothurOutEndLine();              }
368                 
369                 goodFile.close();
370                 inFASTA.close();
371         badFile.close();
372         
373         return 0;
374                 
375         }
376         catch(exception& e) {
377                 pDataArray->m->errorOut(e, "PcrSeqsCommand", "MyPcrThreadFunction");
378                 exit(1);
379         }
380
381
382 #endif
383
384 /**************************************************************************************************/
385
386
387
388 #endif