]> git.donarmstrong.com Git - mothur.git/blob - pcrseqscommand.h
added pcr.seqs command. fixed bug in rarefacftion.single that caused parsing error...
[mothur.git] / pcrseqscommand.h
1 #ifndef Mothur_pcrseqscommand_h
2 #define Mothur_pcrseqscommand_h
3
4 //
5 //  pcrseqscommand.h
6 //  Mothur
7 //
8 //  Created by Sarah Westcott on 3/14/12.
9 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
10 //
11
12
13 #include "command.hpp"
14 #include "sequence.hpp"
15 #include "trimoligos.h"
16 #include "alignment.hpp"
17 #include "needlemanoverlap.hpp"
18
19 class PcrSeqsCommand : public Command {
20 public:
21         PcrSeqsCommand(string);
22         PcrSeqsCommand();
23         ~PcrSeqsCommand(){}
24         
25         vector<string> setParameters();
26         string getCommandName()                 { return "pcr.seqs";    }
27         string getCommandCategory()             { return "Sequence Processing";         }
28         string getHelpString(); 
29         string getCitation() { return "http://www.mothur.org/wiki/Pcr.seqs"; }
30         string getDescription()         { return "pcr.seqs"; }
31     
32         int execute(); 
33         void help() { m->mothurOut(getHelpString()); }  
34         
35 private:
36     
37     struct linePair {
38         unsigned long long start;
39         unsigned long long end;
40         linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
41         linePair() {}
42     };
43     
44     vector<linePair> lines;
45         bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
46     bool abort, keepprimer;
47         string fastafile, oligosfile, taxfile, groupfile, namefile, ecolifile, outputDir, nomatch;
48         int start, end, pdiffs, processors, length;
49         
50     vector<string> revPrimer, outputNames;
51         vector<string> primers;
52     
53     int writeAccnos(set<string>);
54     int readName(set<string>&);
55     int readGroup(set<string>);
56     int readTax(set<string>);
57     bool readOligos();
58     bool readEcoli();
59         int driverPcr(string, string, string, set<string>&, linePair);  
60         int createProcesses(string, string, string, set<string>&);
61     bool findForward(Sequence&, int&, int&);
62     bool findReverse(Sequence&, int&, int&);
63     bool isAligned(string, map<int, int>&);
64     bool compareDNASeq(string, string);
65     string reverseOligo(string);
66 };
67
68 /**************************************************************************************************/
69 //custom data structure for threads to use.
70 // This is passed by void pointer so it can be any data type
71 // that can be passed using a single void pointer (LPVOID).
72 struct pcrData {
73         string filename; 
74     string goodFasta, badFasta, oligosfile, ecolifile, nomatch;
75         unsigned long long fstart;
76         unsigned long long fend;
77         int count, start, end, length;
78         MothurOut* m;
79         vector<string> primers;
80     vector<string> revPrimer;
81     set<string> badSeqNames;
82     bool keepprimer;
83         
84         
85         pcrData(){}
86         pcrData(string f, string gf, string bfn, MothurOut* mout, string ol, string ec, vector<string> pr, vector<string> rpr, string nm, bool kp, int st, int en, int l, unsigned long long fst, unsigned long long fen) {
87                 filename = f;
88         goodFasta = gf;
89         badFasta = bfn;
90                 m = mout;
91         oligosfile = ol;
92         ecolifile = ec;
93         primers = pr;
94         revPrimer = rpr;
95         nomatch = nm;
96         keepprimer = kp;
97                 start = st;
98                 end = en;
99         length = l;
100                 fstart = fst;
101         fend = fen;
102                 count = 0;
103         }
104 };
105 /**************************************************************************************************/
106 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
107 #else
108 static DWORD WINAPI MyPcrThreadFunction(LPVOID lpParam){ 
109         pcrData* pDataArray;
110         pDataArray = (pcrData*)lpParam;
111         
112         try {
113         ofstream goodFile;
114                 pDataArray->m->openOutputFile(pDataArray->goodFasta, goodFile);
115         
116         ofstream badFile;
117                 pDataArray->m->openOutputFile(pDataArray->badFasta, badFile);
118                 
119                 ifstream inFASTA;
120                 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
121         
122                 //print header if you are process 0
123                 if ((pDataArray->fstart == 0) || (pDataArray->fstart == 1)) {
124                         inFASTA.seekg(0);
125                 }else { //this accounts for the difference in line endings. 
126                         inFASTA.seekg(pDataArray->fstart-1); pDataArray->m->gobble(inFASTA); 
127                 }
128         
129         set<int> lengths;
130                 pDataArray->count = pDataArray->fend;
131                 for(int i = 0; i < pDataArray->fend; i++){ //end is the number of sequences to process
132             
133                         if (pDataArray->m->control_pressed) {  break; }
134                         
135                         Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
136           
137             string trashCode = "";
138                         if (currSeq.getName() != "") {
139                 
140                 bool goodSeq = true;
141                 if (pDataArray->oligosfile != "") {
142                     map<int, int> mapAligned;
143                     //bool aligned = isAligned(currSeq.getAligned(), mapAligned);
144                     ///////////////////////////////////////////////////////////////
145                     bool aligned = false;
146                     string seq = currSeq.getAligned(); 
147                     int countBases = 0;
148                     for (int k = 0; k < seq.length(); k++) {
149                         if (!isalpha(seq[k])) { aligned = true; }
150                         else { mapAligned[countBases] = k; countBases++; } //maps location in unaligned -> location in aligned.
151                     }                                                   //ie. the 3rd base may be at spot 10 in the alignment
152                                                                         //later when we trim we want to trim from spot 10.
153                     ///////////////////////////////////////////////////////////////
154                     
155                     //process primers
156                     if (pDataArray->primers.size() != 0) {
157                         int primerStart = 0; int primerEnd = 0;
158                         //bool good = findForward(currSeq, primerStart, primerEnd);
159                         ///////////////////////////////////////////////////////////////
160                         bool good = false;
161                         string rawSequence = currSeq.getUnaligned();
162                         
163                         for(int j=0;j<pDataArray->primers.size();j++){
164                             string oligo = pDataArray->primers[j];
165                             
166                             if (pDataArray->m->control_pressed) {  primerStart = 0; primerEnd = 0; good = false; break; }
167                             
168                             if(rawSequence.length() < oligo.length()) {  break;  }
169                             
170                             //search for primer
171                             int olength = oligo.length();
172                             for (int l = 0; l < rawSequence.length()-olength; l++){
173                                 if (pDataArray->m->control_pressed) {  primerStart = 0; primerEnd = 0; good = false; break; }
174                                 string rawChunk = rawSequence.substr(l, olength);
175                                 //compareDNASeq(oligo, rawChunk)
176                                 ////////////////////////////////////////////////////////
177                                 bool success = 1;
178                                 for(int k=0;k<olength;k++){
179                                     
180                                     if(oligo[k] != rawChunk[k]){
181                                         if(oligo[k] == 'A' || oligo[k] == 'T' || oligo[k] == 'G' || oligo[k] == 'C')    {       success = 0;    }
182                                         else if((oligo[k] == 'N' || oligo[k] == 'I') && (rawChunk[k] == 'N'))                           {       success = 0;    }
183                                         else if(oligo[k] == 'R' && (rawChunk[k] != 'A' && rawChunk[k] != 'G'))                                  {       success = 0;    }
184                                         else if(oligo[k] == 'Y' && (rawChunk[k] != 'C' && rawChunk[k] != 'T'))                                  {       success = 0;    }
185                                         else if(oligo[k] == 'M' && (rawChunk[k] != 'C' && rawChunk[k] != 'A'))                                  {       success = 0;    }
186                                         else if(oligo[k] == 'K' && (rawChunk[k] != 'T' && rawChunk[k] != 'G'))                                  {       success = 0;    }
187                                         else if(oligo[k] == 'W' && (rawChunk[k] != 'T' && rawChunk[k] != 'A'))                                  {       success = 0;    }
188                                         else if(oligo[k] == 'S' && (rawChunk[k] != 'C' && rawChunk[k] != 'G'))                                  {       success = 0;    }
189                                         else if(oligo[k] == 'B' && (rawChunk[k] != 'C' && rawChunk[k] != 'T' && rawChunk[k] != 'G'))    {       success = 0;    }
190                                         else if(oligo[k] == 'D' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'G'))    {       success = 0;    }
191                                         else if(oligo[k] == 'H' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'C'))    {       success = 0;    }
192                                         else if(oligo[k] == 'V' && (rawChunk[k] != 'A' && rawChunk[k] != 'C' && rawChunk[k] != 'G'))    {       success = 0;    }                       
193                                         
194                                         if(success == 0)        {       break;   }
195                                     }
196                                     else{
197                                         success = 1;
198                                     }
199                                 }
200                                 
201                                 ////////////////////////////////////////////////////////////////////
202                                 if(success) {
203                                     primerStart = j;
204                                     primerEnd = primerStart + olength;
205                                     good = true; break;
206                                 }
207                             }
208                             if (good) { break; }
209                         }       
210                         
211                         if (!good) { primerStart = 0; primerEnd = 0; }
212                         ///////////////////////////////////////////////////////////////
213                         
214                         
215                         if(!good){      if (pDataArray->nomatch == "reject") { goodSeq = false; } trashCode += "f";     }
216                         else{
217                             //are you aligned
218                             if (aligned) { 
219                                 if (!pDataArray->keepprimer)    {  currSeq.padToPos(mapAligned[primerEnd]); } 
220                                 else                {  currSeq.padToPos(mapAligned[primerStart]); }
221                             }else { 
222                                 if (!pDataArray->keepprimer)    { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); } 
223                                 else                { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); } 
224                             }
225                         }
226                     }
227                     
228                     //process reverse primers
229                     if (pDataArray->revPrimer.size() != 0) {
230                         int primerStart = 0; int primerEnd = 0;
231                         bool good = false;
232                         //findReverse(currSeq, primerStart, primerEnd);
233                          ///////////////////////////////////////////////////////////////
234                         string rawSequence = currSeq.getUnaligned();
235                         
236                         for(int j=0;j<pDataArray->revPrimer.size();j++){
237                             string oligo = pDataArray->revPrimer[j];
238                             if (pDataArray->m->control_pressed) {  primerStart = 0; primerEnd = 0; good = false; break; }
239                             if(rawSequence.length() < oligo.length()) {  break;  }
240                             
241                             //search for primer
242                             int olength = oligo.length();
243                             for (int l = rawSequence.length()-olength; l >= 0; l--){
244                                 
245                                 string rawChunk = rawSequence.substr(l, olength);
246                                 //compareDNASeq(oligo, rawChunk)
247                                 ////////////////////////////////////////////////////////
248                                 bool success = 1;
249                                 for(int k=0;k<olength;k++){
250                                     
251                                     if(oligo[k] != rawChunk[k]){
252                                         if(oligo[k] == 'A' || oligo[k] == 'T' || oligo[k] == 'G' || oligo[k] == 'C')    {       success = 0;    }
253                                         else if((oligo[k] == 'N' || oligo[k] == 'I') && (rawChunk[k] == 'N'))                           {       success = 0;    }
254                                         else if(oligo[k] == 'R' && (rawChunk[k] != 'A' && rawChunk[k] != 'G'))                                  {       success = 0;    }
255                                         else if(oligo[k] == 'Y' && (rawChunk[k] != 'C' && rawChunk[k] != 'T'))                                  {       success = 0;    }
256                                         else if(oligo[k] == 'M' && (rawChunk[k] != 'C' && rawChunk[k] != 'A'))                                  {       success = 0;    }
257                                         else if(oligo[k] == 'K' && (rawChunk[k] != 'T' && rawChunk[k] != 'G'))                                  {       success = 0;    }
258                                         else if(oligo[k] == 'W' && (rawChunk[k] != 'T' && rawChunk[k] != 'A'))                                  {       success = 0;    }
259                                         else if(oligo[k] == 'S' && (rawChunk[k] != 'C' && rawChunk[k] != 'G'))                                  {       success = 0;    }
260                                         else if(oligo[k] == 'B' && (rawChunk[k] != 'C' && rawChunk[k] != 'T' && rawChunk[k] != 'G'))    {       success = 0;    }
261                                         else if(oligo[k] == 'D' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'G'))    {       success = 0;    }
262                                         else if(oligo[k] == 'H' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'C'))    {       success = 0;    }
263                                         else if(oligo[k] == 'V' && (rawChunk[k] != 'A' && rawChunk[k] != 'C' && rawChunk[k] != 'G'))    {       success = 0;    }                       
264                                         
265                                         if(success == 0)        {       break;   }
266                                     }
267                                     else{
268                                         success = 1;
269                                     }
270                                 }
271                                 
272                                 ////////////////////////////////////////////////////////////////////
273                                 if(success) {
274                                     primerStart = j;
275                                     primerEnd = primerStart + olength;
276                                     good = true; break;
277                                 }
278                             }
279                             if (good) { break; }
280                         }       
281                         
282                         if (!good) { primerStart = 0; primerEnd = 0; }
283
284                          ///////////////////////////////////////////////////////////////
285                         if(!good){      if (pDataArray->nomatch == "reject") { goodSeq = false; } trashCode += "r";     }
286                         else{ 
287                             //are you aligned
288                             if (aligned) { 
289                                 if (!pDataArray->keepprimer)    {  currSeq.padFromPos(mapAligned[primerStart]); } 
290                                 else                {  currSeq.padFromPos(mapAligned[primerEnd]); } 
291                             }
292                             else { 
293                                 if (!pDataArray->keepprimer)    { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerStart));   } 
294                                 else                { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerEnd));     }
295                             }
296                         }
297                     }
298                 }else if (pDataArray->ecolifile != "") {
299                     //make sure the seqs are aligned
300                     lengths.insert(currSeq.getAligned().length());
301                     if (lengths.size() > 1) { pDataArray->m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); pDataArray->m->control_pressed = true; break; }
302                     else if (currSeq.getAligned().length() != pDataArray->length) {
303                         pDataArray->m->mothurOut("[ERROR]: seqs are not the same length as ecoli seq. When using ecoli option your sequences must be aligned and the same length as the ecoli sequence.\n"); pDataArray->m->control_pressed = true; break; 
304                     }else {
305                         currSeq.padToPos(pDataArray->start); 
306                         currSeq.padFromPos(pDataArray->end);
307                     }
308                 }else{ //using start and end to trim
309                     //make sure the seqs are aligned
310                     lengths.insert(currSeq.getAligned().length());
311                     if (lengths.size() > 1) { pDataArray->m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); pDataArray->m->control_pressed = true; break; }
312                     else {
313                         if (pDataArray->start != -1) { currSeq.padToPos(pDataArray->start); }
314                         if (pDataArray->end != -1) {
315                             if (pDataArray->end > currSeq.getAligned().length()) {  pDataArray->m->mothurOut("[ERROR]: end is longer than your sequence length, aborting.\n"); pDataArray->m->control_pressed = true; break; }
316                             else {
317                                 currSeq.padFromPos(pDataArray->end);
318                             }
319                         }
320                     }
321                 }
322                 
323                                 if(goodSeq == 1)    {   currSeq.printSequence(goodFile);        }
324                                 else {  
325                     pDataArray->badSeqNames.insert(currSeq.getName()); 
326                     currSeq.setName(currSeq.getName() + '|' + trashCode);
327                     currSeq.printSequence(badFile); 
328                 }
329                         }
330                                                 
331                         //report progress
332                         if((i+1) % 100 == 0){   pDataArray->m->mothurOut("Processing sequence: " + toString(i+1)); pDataArray->m->mothurOutEndLine();           }
333                 }
334                 //report progress
335                 if((pDataArray->count) % 100 != 0){     pDataArray->m->mothurOut("Thread Processing sequence: " + toString(pDataArray->count)); pDataArray->m->mothurOutEndLine();              }
336                 
337                 goodFile.close();
338                 inFASTA.close();
339         badFile.close();
340         
341         return 0;
342                 
343         }
344         catch(exception& e) {
345                 pDataArray->m->errorOut(e, "PcrSeqsCommand", "MyPcrThreadFunction");
346                 exit(1);
347         }
348
349
350 #endif
351
352 /**************************************************************************************************/
353
354
355
356 #endif