]> git.donarmstrong.com Git - mothur.git/blob - pcrseqscommand.h
bugs found while testing
[mothur.git] / pcrseqscommand.h
1 #ifndef Mothur_pcrseqscommand_h
2 #define Mothur_pcrseqscommand_h
3
4 //
5 //  pcrseqscommand.h
6 //  Mothur
7 //
8 //  Created by Sarah Westcott on 3/14/12.
9 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
10 //
11
12
13 #include "command.hpp"
14 #include "sequence.hpp"
15 #include "trimoligos.h"
16 #include "alignment.hpp"
17 #include "needlemanoverlap.hpp"
18
19 class PcrSeqsCommand : public Command {
20 public:
21         PcrSeqsCommand(string);
22         PcrSeqsCommand();
23         ~PcrSeqsCommand(){}
24         
25         vector<string> setParameters();
26         string getCommandName()                 { return "pcr.seqs";    }
27         string getCommandCategory()             { return "Sequence Processing";         }
28         string getHelpString(); 
29         string getCitation() { return "http://www.mothur.org/wiki/Pcr.seqs"; }
30         string getDescription()         { return "pcr.seqs"; }
31     
32         int execute(); 
33         void help() { m->mothurOut(getHelpString()); }  
34         
35 private:
36     
37     struct linePair {
38         unsigned long long start;
39         unsigned long long end;
40         linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
41         linePair() {}
42     };
43     
44     vector<linePair> lines;
45         bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
46     bool abort, keepprimer, keepdots;
47         string fastafile, oligosfile, taxfile, groupfile, namefile, ecolifile, outputDir, nomatch;
48         int start, end, processors, length;
49         
50     vector<string> revPrimer, outputNames;
51         vector<string> primers;
52     
53     int writeAccnos(set<string>);
54     int readName(set<string>&);
55     int readGroup(set<string>);
56     int readTax(set<string>);
57     bool readOligos();
58     bool readEcoli();
59         int driverPcr(string, string, string, set<string>&, linePair);  
60         int createProcesses(string, string, string, set<string>&);
61     bool findForward(Sequence&, int&, int&);
62     bool findReverse(Sequence&, int&, int&);
63     bool isAligned(string, map<int, int>&);
64     bool compareDNASeq(string, string);
65     string reverseOligo(string);
66 };
67
68 /**************************************************************************************************/
69 //custom data structure for threads to use.
70 // This is passed by void pointer so it can be any data type
71 // that can be passed using a single void pointer (LPVOID).
72 struct pcrData {
73         string filename; 
74     string goodFasta, badFasta, oligosfile, ecolifile, nomatch;
75         unsigned long long fstart;
76         unsigned long long fend;
77         int count, start, end, length;
78         MothurOut* m;
79         vector<string> primers;
80     vector<string> revPrimer;
81     set<string> badSeqNames;
82     bool keepprimer, keepdots;
83         
84         
85         pcrData(){}
86         pcrData(string f, string gf, string bfn, MothurOut* mout, string ol, string ec, vector<string> pr, vector<string> rpr, string nm, bool kp, bool kd, int st, int en, int l, unsigned long long fst, unsigned long long fen) {
87                 filename = f;
88         goodFasta = gf;
89         badFasta = bfn;
90                 m = mout;
91         oligosfile = ol;
92         ecolifile = ec;
93         primers = pr;
94         revPrimer = rpr;
95         nomatch = nm;
96         keepprimer = kp;
97         keepdots = kd;
98                 start = st;
99                 end = en;
100         length = l;
101                 fstart = fst;
102         fend = fen;
103                 count = 0;
104         }
105 };
106 /**************************************************************************************************/
107 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
108 #else
109 static DWORD WINAPI MyPcrThreadFunction(LPVOID lpParam){ 
110         pcrData* pDataArray;
111         pDataArray = (pcrData*)lpParam;
112         
113         try {
114         ofstream goodFile;
115                 pDataArray->m->openOutputFile(pDataArray->goodFasta, goodFile);
116         
117         ofstream badFile;
118                 pDataArray->m->openOutputFile(pDataArray->badFasta, badFile);
119                 
120                 ifstream inFASTA;
121                 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
122         
123                 //print header if you are process 0
124                 if ((pDataArray->fstart == 0) || (pDataArray->fstart == 1)) {
125                         inFASTA.seekg(0);
126                 }else { //this accounts for the difference in line endings. 
127                         inFASTA.seekg(pDataArray->fstart-1); pDataArray->m->gobble(inFASTA); 
128                 }
129         
130         set<int> lengths;
131                 pDataArray->count = pDataArray->fend;
132                 for(int i = 0; i < pDataArray->fend; i++){ //end is the number of sequences to process
133             
134                         if (pDataArray->m->control_pressed) {  break; }
135                         
136                         Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
137           
138             string trashCode = "";
139                         if (currSeq.getName() != "") {
140                 
141                 bool goodSeq = true;
142                 if (pDataArray->oligosfile != "") {
143                     map<int, int> mapAligned;
144                     //bool aligned = isAligned(currSeq.getAligned(), mapAligned);
145                     ///////////////////////////////////////////////////////////////
146                     bool aligned = false;
147                     string seq = currSeq.getAligned(); 
148                     int countBases = 0;
149                     for (int k = 0; k < seq.length(); k++) {
150                         if (!isalpha(seq[k])) { aligned = true; }
151                         else { mapAligned[countBases] = k; countBases++; } //maps location in unaligned -> location in aligned.
152                     }                                                   //ie. the 3rd base may be at spot 10 in the alignment
153                                                                         //later when we trim we want to trim from spot 10.
154                     ///////////////////////////////////////////////////////////////
155                     
156                     //process primers
157                     if (pDataArray->primers.size() != 0) {
158                         int primerStart = 0; int primerEnd = 0;
159                         //bool good = findForward(currSeq, primerStart, primerEnd);
160                         ///////////////////////////////////////////////////////////////
161                         bool good = false;
162                         string rawSequence = currSeq.getUnaligned();
163                         
164                         for(int j=0;j<pDataArray->primers.size();j++){
165                             string oligo = pDataArray->primers[j];
166                             
167                             if (pDataArray->m->control_pressed) {  primerStart = 0; primerEnd = 0; good = false; break; }
168                             
169                             if(rawSequence.length() < oligo.length()) {  break;  }
170                             
171                             //search for primer
172                             int olength = oligo.length();
173                             for (int l = 0; l < rawSequence.length()-olength; l++){
174                                 if (pDataArray->m->control_pressed) {  primerStart = 0; primerEnd = 0; good = false; break; }
175                                 string rawChunk = rawSequence.substr(l, olength);
176                                 //compareDNASeq(oligo, rawChunk)
177                                 ////////////////////////////////////////////////////////
178                                 bool success = 1;
179                                 for(int k=0;k<olength;k++){
180                                     
181                                     if(oligo[k] != rawChunk[k]){
182                                         if(oligo[k] == 'A' || oligo[k] == 'T' || oligo[k] == 'G' || oligo[k] == 'C')    {       success = 0;    }
183                                         else if((oligo[k] == 'N' || oligo[k] == 'I') && (rawChunk[k] == 'N'))                           {       success = 0;    }
184                                         else if(oligo[k] == 'R' && (rawChunk[k] != 'A' && rawChunk[k] != 'G'))                                  {       success = 0;    }
185                                         else if(oligo[k] == 'Y' && (rawChunk[k] != 'C' && rawChunk[k] != 'T'))                                  {       success = 0;    }
186                                         else if(oligo[k] == 'M' && (rawChunk[k] != 'C' && rawChunk[k] != 'A'))                                  {       success = 0;    }
187                                         else if(oligo[k] == 'K' && (rawChunk[k] != 'T' && rawChunk[k] != 'G'))                                  {       success = 0;    }
188                                         else if(oligo[k] == 'W' && (rawChunk[k] != 'T' && rawChunk[k] != 'A'))                                  {       success = 0;    }
189                                         else if(oligo[k] == 'S' && (rawChunk[k] != 'C' && rawChunk[k] != 'G'))                                  {       success = 0;    }
190                                         else if(oligo[k] == 'B' && (rawChunk[k] != 'C' && rawChunk[k] != 'T' && rawChunk[k] != 'G'))    {       success = 0;    }
191                                         else if(oligo[k] == 'D' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'G'))    {       success = 0;    }
192                                         else if(oligo[k] == 'H' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'C'))    {       success = 0;    }
193                                         else if(oligo[k] == 'V' && (rawChunk[k] != 'A' && rawChunk[k] != 'C' && rawChunk[k] != 'G'))    {       success = 0;    }                       
194                                         
195                                         if(success == 0)        {       break;   }
196                                     }
197                                     else{
198                                         success = 1;
199                                     }
200                                 }
201                                 
202                                 ////////////////////////////////////////////////////////////////////
203                                 if(success) {
204                                     primerStart = j;
205                                     primerEnd = primerStart + olength;
206                                     good = true; break;
207                                 }
208                             }
209                             if (good) { break; }
210                         }       
211                         
212                         if (!good) { primerStart = 0; primerEnd = 0; }
213                         ///////////////////////////////////////////////////////////////
214                         
215                         
216                         if(!good){      if (pDataArray->nomatch == "reject") { goodSeq = false; } trashCode += "f";     }
217                         else{
218                             //are you aligned
219                             if (aligned) { 
220                                 if (!pDataArray->keepprimer)    {  
221                                     if (pDataArray->keepdots)   { currSeq.filterToPos(mapAligned[primerEnd]);   }
222                                     else            { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerEnd]));                                              }
223                                 } 
224                                 else                {  
225                                     if (pDataArray->keepdots)   { currSeq.filterToPos(mapAligned[primerStart]);  }
226                                     else            { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart]));                                              }
227                                 }
228                             }else { 
229                                 if (!pDataArray->keepprimer)    { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); } 
230                                 else                { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); } 
231                             }
232                         }
233                     }
234                     
235                     //process reverse primers
236                     if (pDataArray->revPrimer.size() != 0) {
237                         int primerStart = 0; int primerEnd = 0;
238                         bool good = false;
239                         //findReverse(currSeq, primerStart, primerEnd);
240                          ///////////////////////////////////////////////////////////////
241                         string rawSequence = currSeq.getUnaligned();
242                         
243                         for(int j=0;j<pDataArray->revPrimer.size();j++){
244                             string oligo = pDataArray->revPrimer[j];
245                             if (pDataArray->m->control_pressed) {  primerStart = 0; primerEnd = 0; good = false; break; }
246                             if(rawSequence.length() < oligo.length()) {  break;  }
247                             
248                             //search for primer
249                             int olength = oligo.length();
250                             for (int l = rawSequence.length()-olength; l >= 0; l--){
251                                 
252                                 string rawChunk = rawSequence.substr(l, olength);
253                                 //compareDNASeq(oligo, rawChunk)
254                                 ////////////////////////////////////////////////////////
255                                 bool success = 1;
256                                 for(int k=0;k<olength;k++){
257                                     
258                                     if(oligo[k] != rawChunk[k]){
259                                         if(oligo[k] == 'A' || oligo[k] == 'T' || oligo[k] == 'G' || oligo[k] == 'C')    {       success = 0;    }
260                                         else if((oligo[k] == 'N' || oligo[k] == 'I') && (rawChunk[k] == 'N'))                           {       success = 0;    }
261                                         else if(oligo[k] == 'R' && (rawChunk[k] != 'A' && rawChunk[k] != 'G'))                                  {       success = 0;    }
262                                         else if(oligo[k] == 'Y' && (rawChunk[k] != 'C' && rawChunk[k] != 'T'))                                  {       success = 0;    }
263                                         else if(oligo[k] == 'M' && (rawChunk[k] != 'C' && rawChunk[k] != 'A'))                                  {       success = 0;    }
264                                         else if(oligo[k] == 'K' && (rawChunk[k] != 'T' && rawChunk[k] != 'G'))                                  {       success = 0;    }
265                                         else if(oligo[k] == 'W' && (rawChunk[k] != 'T' && rawChunk[k] != 'A'))                                  {       success = 0;    }
266                                         else if(oligo[k] == 'S' && (rawChunk[k] != 'C' && rawChunk[k] != 'G'))                                  {       success = 0;    }
267                                         else if(oligo[k] == 'B' && (rawChunk[k] != 'C' && rawChunk[k] != 'T' && rawChunk[k] != 'G'))    {       success = 0;    }
268                                         else if(oligo[k] == 'D' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'G'))    {       success = 0;    }
269                                         else if(oligo[k] == 'H' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'C'))    {       success = 0;    }
270                                         else if(oligo[k] == 'V' && (rawChunk[k] != 'A' && rawChunk[k] != 'C' && rawChunk[k] != 'G'))    {       success = 0;    }                       
271                                         
272                                         if(success == 0)        {       break;   }
273                                     }
274                                     else{
275                                         success = 1;
276                                     }
277                                 }
278                                 
279                                 ////////////////////////////////////////////////////////////////////
280                                 if(success) {
281                                     primerStart = j;
282                                     primerEnd = primerStart + olength;
283                                     good = true; break;
284                                 }
285                             }
286                             if (good) { break; }
287                         }       
288                         
289                         if (!good) { primerStart = 0; primerEnd = 0; }
290
291                          ///////////////////////////////////////////////////////////////
292                         if(!good){      if (pDataArray->nomatch == "reject") { goodSeq = false; } trashCode += "r";     }
293                         else{ 
294                             //are you aligned
295                             if (aligned) { 
296                                 if (!pDataArray->keepprimer)    {  
297                                     if (pDataArray->keepdots)   { currSeq.filterFromPos(mapAligned[primerStart]); }
298                                     else            { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerStart]));   }
299                                 } 
300                                 else                {  
301                                     if (pDataArray->keepdots)   { currSeq.filterFromPos(mapAligned[primerEnd]); }
302                                     else            { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerEnd]));   }
303                                 }                             }
304                             else { 
305                                 if (!pDataArray->keepprimer)    { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerStart));   } 
306                                 else                { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerEnd));     }
307                             }
308                         }
309                     }
310                 }else if (pDataArray->ecolifile != "") {
311                     //make sure the seqs are aligned
312                     lengths.insert(currSeq.getAligned().length());
313                     if (lengths.size() > 1) { pDataArray->m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); pDataArray->m->control_pressed = true; break; }
314                     else if (currSeq.getAligned().length() != pDataArray->length) {
315                         pDataArray->m->mothurOut("[ERROR]: seqs are not the same length as ecoli seq. When using ecoli option your sequences must be aligned and the same length as the ecoli sequence.\n"); pDataArray->m->control_pressed = true; break; 
316                     }else {
317                         if (pDataArray->keepdots)   { 
318                             currSeq.filterToPos(pDataArray->start); 
319                             currSeq.filterFromPos(pDataArray->end);
320                         }else {
321                             string seqString = currSeq.getAligned().substr(0, pDataArray->end);
322                             seqString = seqString.substr(pDataArray->start);
323                             currSeq.setAligned(seqString); 
324                         }
325                     }
326                 }else{ //using start and end to trim
327                     //make sure the seqs are aligned
328                     lengths.insert(currSeq.getAligned().length());
329                     if (lengths.size() > 1) { pDataArray->m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); pDataArray->m->control_pressed = true; break; }
330                     else {
331                         if (pDataArray->end != -1) {
332                             if (pDataArray->end > currSeq.getAligned().length()) {  pDataArray->m->mothurOut("[ERROR]: end is longer than your sequence length, aborting.\n"); pDataArray->m->control_pressed = true; break; }
333                             else {
334                                 if (pDataArray->keepdots)   { currSeq.filterFromPos(pDataArray->end); }
335                                 else {
336                                     string seqString = currSeq.getAligned().substr(0, pDataArray->end);
337                                     currSeq.setAligned(seqString); 
338                                 }
339                             }
340                         }
341                         if (pDataArray->start != -1) { 
342                             if (pDataArray->keepdots)   {  currSeq.filterToPos(pDataArray->start);  }
343                             else {
344                                 string seqString = currSeq.getAligned().substr(pDataArray->start);
345                                 currSeq.setAligned(seqString); 
346                             }
347                         }
348                         
349                     }
350                 }
351                 
352                                 if(goodSeq == 1)    {   currSeq.printSequence(goodFile);        }
353                                 else {  
354                     pDataArray->badSeqNames.insert(currSeq.getName()); 
355                     currSeq.setName(currSeq.getName() + '|' + trashCode);
356                     currSeq.printSequence(badFile); 
357                 }
358                         }
359                                                 
360                         //report progress
361                         if((i+1) % 100 == 0){   pDataArray->m->mothurOut("Processing sequence: " + toString(i+1)); pDataArray->m->mothurOutEndLine();           }
362                 }
363                 //report progress
364                 if((pDataArray->count) % 100 != 0){     pDataArray->m->mothurOut("Thread Processing sequence: " + toString(pDataArray->count)); pDataArray->m->mothurOutEndLine();              }
365                 
366                 goodFile.close();
367                 inFASTA.close();
368         badFile.close();
369         
370         return 0;
371                 
372         }
373         catch(exception& e) {
374                 pDataArray->m->errorOut(e, "PcrSeqsCommand", "MyPcrThreadFunction");
375                 exit(1);
376         }
377
378
379 #endif
380
381 /**************************************************************************************************/
382
383
384
385 #endif