]> git.donarmstrong.com Git - mothur.git/blob - pcrseqscommand.h
fixed bug in phylo.diversity rooting. added filename patterns and create filename...
[mothur.git] / pcrseqscommand.h
1 #ifndef Mothur_pcrseqscommand_h
2 #define Mothur_pcrseqscommand_h
3
4 //
5 //  pcrseqscommand.h
6 //  Mothur
7 //
8 //  Created by Sarah Westcott on 3/14/12.
9 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
10 //
11
12
13 #include "command.hpp"
14 #include "sequence.hpp"
15 #include "trimoligos.h"
16 #include "alignment.hpp"
17 #include "needlemanoverlap.hpp"
18 #include "counttable.h"
19
20 class PcrSeqsCommand : public Command {
21 public:
22         PcrSeqsCommand(string);
23         PcrSeqsCommand();
24         ~PcrSeqsCommand(){}
25         
26         vector<string> setParameters();
27         string getCommandName()                 { return "pcr.seqs";    }
28         string getCommandCategory()             { return "Sequence Processing";         }
29         
30         string getHelpString(); 
31     string getOutputPattern(string);    
32         string getCitation() { return "http://www.mothur.org/wiki/Pcr.seqs"; }
33         string getDescription()         { return "pcr.seqs"; }
34     
35         int execute(); 
36         void help() { m->mothurOut(getHelpString()); }  
37         
38 private:
39     
40     struct linePair {
41         unsigned long long start;
42         unsigned long long end;
43         linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
44         linePair() {}
45     };
46     
47     vector<linePair> lines;
48         bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
49     bool abort, keepprimer, keepdots;
50         string fastafile, oligosfile, taxfile, groupfile, namefile, countfile, ecolifile, outputDir, nomatch;
51         int start, end, processors, length;
52         
53     vector<string> revPrimer, outputNames;
54         vector<string> primers;
55     
56     int writeAccnos(set<string>);
57     int readName(set<string>&);
58     int readGroup(set<string>);
59     int readTax(set<string>);
60     int readCount(set<string>);
61     bool readOligos();
62     bool readEcoli();
63         int driverPcr(string, string, string, set<string>&, linePair);  
64         int createProcesses(string, string, string, set<string>&);
65     bool findForward(Sequence&, int&, int&);
66     bool findReverse(Sequence&, int&, int&);
67     bool isAligned(string, map<int, int>&);
68     bool compareDNASeq(string, string);
69     string reverseOligo(string);
70 };
71
72 /**************************************************************************************************/
73 //custom data structure for threads to use.
74 // This is passed by void pointer so it can be any data type
75 // that can be passed using a single void pointer (LPVOID).
76 struct pcrData {
77         string filename; 
78     string goodFasta, badFasta, oligosfile, ecolifile, nomatch;
79         unsigned long long fstart;
80         unsigned long long fend;
81         int count, start, end, length;
82         MothurOut* m;
83         vector<string> primers;
84     vector<string> revPrimer;
85     set<string> badSeqNames;
86     bool keepprimer, keepdots;
87         
88         
89         pcrData(){}
90         pcrData(string f, string gf, string bfn, MothurOut* mout, string ol, string ec, vector<string> pr, vector<string> rpr, string nm, bool kp, bool kd, int st, int en, int l, unsigned long long fst, unsigned long long fen) {
91                 filename = f;
92         goodFasta = gf;
93         badFasta = bfn;
94                 m = mout;
95         oligosfile = ol;
96         ecolifile = ec;
97         primers = pr;
98         revPrimer = rpr;
99         nomatch = nm;
100         keepprimer = kp;
101         keepdots = kd;
102                 start = st;
103                 end = en;
104         length = l;
105                 fstart = fst;
106         fend = fen;
107                 count = 0;
108         }
109 };
110 /**************************************************************************************************/
111 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
112 #else
113 static DWORD WINAPI MyPcrThreadFunction(LPVOID lpParam){ 
114         pcrData* pDataArray;
115         pDataArray = (pcrData*)lpParam;
116         
117         try {
118         ofstream goodFile;
119                 pDataArray->m->openOutputFile(pDataArray->goodFasta, goodFile);
120         
121         ofstream badFile;
122                 pDataArray->m->openOutputFile(pDataArray->badFasta, badFile);
123                 
124                 ifstream inFASTA;
125                 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
126         
127                 //print header if you are process 0
128                 if ((pDataArray->fstart == 0) || (pDataArray->fstart == 1)) {
129                         inFASTA.seekg(0);
130                 }else { //this accounts for the difference in line endings. 
131                         inFASTA.seekg(pDataArray->fstart-1); pDataArray->m->gobble(inFASTA); 
132                 }
133         
134         set<int> lengths;
135                 pDataArray->count = pDataArray->fend;
136                 for(int i = 0; i < pDataArray->fend; i++){ //end is the number of sequences to process
137             
138                         if (pDataArray->m->control_pressed) {  break; }
139                         
140                         Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
141           
142             string trashCode = "";
143                         if (currSeq.getName() != "") {
144                 
145                 bool goodSeq = true;
146                 if (pDataArray->oligosfile != "") {
147                     map<int, int> mapAligned;
148                     //bool aligned = isAligned(currSeq.getAligned(), mapAligned);
149                     ///////////////////////////////////////////////////////////////
150                     bool aligned = false;
151                     string seq = currSeq.getAligned(); 
152                     int countBases = 0;
153                     for (int k = 0; k < seq.length(); k++) {
154                         if (!isalpha(seq[k])) { aligned = true; }
155                         else { mapAligned[countBases] = k; countBases++; } //maps location in unaligned -> location in aligned.
156                     }                                                   //ie. the 3rd base may be at spot 10 in the alignment
157                                                                         //later when we trim we want to trim from spot 10.
158                     ///////////////////////////////////////////////////////////////
159                     
160                     //process primers
161                     if (pDataArray->primers.size() != 0) {
162                         int primerStart = 0; int primerEnd = 0;
163                         //bool good = findForward(currSeq, primerStart, primerEnd);
164                         ///////////////////////////////////////////////////////////////
165                         bool good = false;
166                         string rawSequence = currSeq.getUnaligned();
167                         
168                         for(int j=0;j<pDataArray->primers.size();j++){
169                             string oligo = pDataArray->primers[j];
170                             
171                             if (pDataArray->m->control_pressed) {  primerStart = 0; primerEnd = 0; good = false; break; }
172                             
173                             if(rawSequence.length() < oligo.length()) {  break;  }
174                             
175                             //search for primer
176                             int olength = oligo.length();
177                             for (int l = 0; l < rawSequence.length()-olength; l++){
178                                 if (pDataArray->m->control_pressed) {  primerStart = 0; primerEnd = 0; good = false; break; }
179                                 string rawChunk = rawSequence.substr(l, olength);
180                                 //compareDNASeq(oligo, rawChunk)
181                                 ////////////////////////////////////////////////////////
182                                 bool success = 1;
183                                 for(int k=0;k<olength;k++){
184                                     
185                                     if(oligo[k] != rawChunk[k]){
186                                         if(oligo[k] == 'A' || oligo[k] == 'T' || oligo[k] == 'G' || oligo[k] == 'C')    {       success = 0;    }
187                                         else if((oligo[k] == 'N' || oligo[k] == 'I') && (rawChunk[k] == 'N'))                           {       success = 0;    }
188                                         else if(oligo[k] == 'R' && (rawChunk[k] != 'A' && rawChunk[k] != 'G'))                                  {       success = 0;    }
189                                         else if(oligo[k] == 'Y' && (rawChunk[k] != 'C' && rawChunk[k] != 'T'))                                  {       success = 0;    }
190                                         else if(oligo[k] == 'M' && (rawChunk[k] != 'C' && rawChunk[k] != 'A'))                                  {       success = 0;    }
191                                         else if(oligo[k] == 'K' && (rawChunk[k] != 'T' && rawChunk[k] != 'G'))                                  {       success = 0;    }
192                                         else if(oligo[k] == 'W' && (rawChunk[k] != 'T' && rawChunk[k] != 'A'))                                  {       success = 0;    }
193                                         else if(oligo[k] == 'S' && (rawChunk[k] != 'C' && rawChunk[k] != 'G'))                                  {       success = 0;    }
194                                         else if(oligo[k] == 'B' && (rawChunk[k] != 'C' && rawChunk[k] != 'T' && rawChunk[k] != 'G'))    {       success = 0;    }
195                                         else if(oligo[k] == 'D' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'G'))    {       success = 0;    }
196                                         else if(oligo[k] == 'H' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'C'))    {       success = 0;    }
197                                         else if(oligo[k] == 'V' && (rawChunk[k] != 'A' && rawChunk[k] != 'C' && rawChunk[k] != 'G'))    {       success = 0;    }                       
198                                         
199                                         if(success == 0)        {       break;   }
200                                     }
201                                     else{
202                                         success = 1;
203                                     }
204                                 }
205                                 
206                                 ////////////////////////////////////////////////////////////////////
207                                 if(success) {
208                                     primerStart = j;
209                                     primerEnd = primerStart + olength;
210                                     good = true; break;
211                                 }
212                             }
213                             if (good) { break; }
214                         }       
215                         
216                         if (!good) { primerStart = 0; primerEnd = 0; }
217                         ///////////////////////////////////////////////////////////////
218                         
219                         
220                         if(!good){      if (pDataArray->nomatch == "reject") { goodSeq = false; } trashCode += "f";     }
221                         else{
222                             //are you aligned
223                             if (aligned) { 
224                                 if (!pDataArray->keepprimer)    {  
225                                     if (pDataArray->keepdots)   { currSeq.filterToPos(mapAligned[primerEnd]);   }
226                                     else            { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerEnd]));                                              }
227                                 } 
228                                 else                {  
229                                     if (pDataArray->keepdots)   { currSeq.filterToPos(mapAligned[primerStart]);  }
230                                     else            { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart]));                                              }
231                                 }
232                             }else { 
233                                 if (!pDataArray->keepprimer)    { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); } 
234                                 else                { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); } 
235                             }
236                         }
237                     }
238                     
239                     //process reverse primers
240                     if (pDataArray->revPrimer.size() != 0) {
241                         int primerStart = 0; int primerEnd = 0;
242                         bool good = false;
243                         //findReverse(currSeq, primerStart, primerEnd);
244                          ///////////////////////////////////////////////////////////////
245                         string rawSequence = currSeq.getUnaligned();
246                         
247                         for(int j=0;j<pDataArray->revPrimer.size();j++){
248                             string oligo = pDataArray->revPrimer[j];
249                             if (pDataArray->m->control_pressed) {  primerStart = 0; primerEnd = 0; good = false; break; }
250                             if(rawSequence.length() < oligo.length()) {  break;  }
251                             
252                             //search for primer
253                             int olength = oligo.length();
254                             for (int l = rawSequence.length()-olength; l >= 0; l--){
255                                 
256                                 string rawChunk = rawSequence.substr(l, olength);
257                                 //compareDNASeq(oligo, rawChunk)
258                                 ////////////////////////////////////////////////////////
259                                 bool success = 1;
260                                 for(int k=0;k<olength;k++){
261                                     
262                                     if(oligo[k] != rawChunk[k]){
263                                         if(oligo[k] == 'A' || oligo[k] == 'T' || oligo[k] == 'G' || oligo[k] == 'C')    {       success = 0;    }
264                                         else if((oligo[k] == 'N' || oligo[k] == 'I') && (rawChunk[k] == 'N'))                           {       success = 0;    }
265                                         else if(oligo[k] == 'R' && (rawChunk[k] != 'A' && rawChunk[k] != 'G'))                                  {       success = 0;    }
266                                         else if(oligo[k] == 'Y' && (rawChunk[k] != 'C' && rawChunk[k] != 'T'))                                  {       success = 0;    }
267                                         else if(oligo[k] == 'M' && (rawChunk[k] != 'C' && rawChunk[k] != 'A'))                                  {       success = 0;    }
268                                         else if(oligo[k] == 'K' && (rawChunk[k] != 'T' && rawChunk[k] != 'G'))                                  {       success = 0;    }
269                                         else if(oligo[k] == 'W' && (rawChunk[k] != 'T' && rawChunk[k] != 'A'))                                  {       success = 0;    }
270                                         else if(oligo[k] == 'S' && (rawChunk[k] != 'C' && rawChunk[k] != 'G'))                                  {       success = 0;    }
271                                         else if(oligo[k] == 'B' && (rawChunk[k] != 'C' && rawChunk[k] != 'T' && rawChunk[k] != 'G'))    {       success = 0;    }
272                                         else if(oligo[k] == 'D' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'G'))    {       success = 0;    }
273                                         else if(oligo[k] == 'H' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'C'))    {       success = 0;    }
274                                         else if(oligo[k] == 'V' && (rawChunk[k] != 'A' && rawChunk[k] != 'C' && rawChunk[k] != 'G'))    {       success = 0;    }                       
275                                         
276                                         if(success == 0)        {       break;   }
277                                     }
278                                     else{
279                                         success = 1;
280                                     }
281                                 }
282                                 
283                                 ////////////////////////////////////////////////////////////////////
284                                 if(success) {
285                                     primerStart = j;
286                                     primerEnd = primerStart + olength;
287                                     good = true; break;
288                                 }
289                             }
290                             if (good) { break; }
291                         }       
292                         
293                         if (!good) { primerStart = 0; primerEnd = 0; }
294
295                          ///////////////////////////////////////////////////////////////
296                         if(!good){      if (pDataArray->nomatch == "reject") { goodSeq = false; } trashCode += "r";     }
297                         else{ 
298                             //are you aligned
299                             if (aligned) { 
300                                 if (!pDataArray->keepprimer)    {  
301                                     if (pDataArray->keepdots)   { currSeq.filterFromPos(mapAligned[primerStart]); }
302                                     else            { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerStart]));   }
303                                 } 
304                                 else                {  
305                                     if (pDataArray->keepdots)   { currSeq.filterFromPos(mapAligned[primerEnd]); }
306                                     else            { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerEnd]));   }
307                                 }                             }
308                             else { 
309                                 if (!pDataArray->keepprimer)    { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerStart));   } 
310                                 else                { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerEnd));     }
311                             }
312                         }
313                     }
314                 }else if (pDataArray->ecolifile != "") {
315                     //make sure the seqs are aligned
316                     lengths.insert(currSeq.getAligned().length());
317                     if (lengths.size() > 1) { pDataArray->m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); pDataArray->m->control_pressed = true; break; }
318                     else if (currSeq.getAligned().length() != pDataArray->length) {
319                         pDataArray->m->mothurOut("[ERROR]: seqs are not the same length as ecoli seq. When using ecoli option your sequences must be aligned and the same length as the ecoli sequence.\n"); pDataArray->m->control_pressed = true; break; 
320                     }else {
321                         if (pDataArray->keepdots)   { 
322                             currSeq.filterToPos(pDataArray->start); 
323                             currSeq.filterFromPos(pDataArray->end);
324                         }else {
325                             string seqString = currSeq.getAligned().substr(0, pDataArray->end);
326                             seqString = seqString.substr(pDataArray->start);
327                             currSeq.setAligned(seqString); 
328                         }
329                     }
330                 }else{ //using start and end to trim
331                     //make sure the seqs are aligned
332                     lengths.insert(currSeq.getAligned().length());
333                     if (lengths.size() > 1) { pDataArray->m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); pDataArray->m->control_pressed = true; break; }
334                     else {
335                         if (pDataArray->end != -1) {
336                             if (pDataArray->end > currSeq.getAligned().length()) {  pDataArray->m->mothurOut("[ERROR]: end is longer than your sequence length, aborting.\n"); pDataArray->m->control_pressed = true; break; }
337                             else {
338                                 if (pDataArray->keepdots)   { currSeq.filterFromPos(pDataArray->end); }
339                                 else {
340                                     string seqString = currSeq.getAligned().substr(0, pDataArray->end);
341                                     currSeq.setAligned(seqString); 
342                                 }
343                             }
344                         }
345                         if (pDataArray->start != -1) { 
346                             if (pDataArray->keepdots)   {  currSeq.filterToPos(pDataArray->start);  }
347                             else {
348                                 string seqString = currSeq.getAligned().substr(pDataArray->start);
349                                 currSeq.setAligned(seqString); 
350                             }
351                         }
352                         
353                     }
354                 }
355                 
356                                 if(goodSeq == 1)    {   currSeq.printSequence(goodFile);        }
357                                 else {  
358                     pDataArray->badSeqNames.insert(currSeq.getName()); 
359                     currSeq.setName(currSeq.getName() + '|' + trashCode);
360                     currSeq.printSequence(badFile); 
361                 }
362                         }
363                                                 
364                         //report progress
365                         if((i+1) % 100 == 0){   pDataArray->m->mothurOut("Processing sequence: " + toString(i+1)); pDataArray->m->mothurOutEndLine();           }
366                 }
367                 //report progress
368                 if((pDataArray->count) % 100 != 0){     pDataArray->m->mothurOut("Thread Processing sequence: " + toString(pDataArray->count)); pDataArray->m->mothurOutEndLine();              }
369                 
370                 goodFile.close();
371                 inFASTA.close();
372         badFile.close();
373         
374         return 0;
375                 
376         }
377         catch(exception& e) {
378                 pDataArray->m->errorOut(e, "PcrSeqsCommand", "MyPcrThreadFunction");
379                 exit(1);
380         }
381
382
383 #endif
384
385 /**************************************************************************************************/
386
387
388
389 #endif