1 #ifndef Mothur_pcrseqscommand_h
2 #define Mothur_pcrseqscommand_h
8 // Created by Sarah Westcott on 3/14/12.
9 // Copyright (c) 2012 Schloss Lab. All rights reserved.
13 #include "command.hpp"
14 #include "sequence.hpp"
15 #include "trimoligos.h"
16 #include "alignment.hpp"
17 #include "needlemanoverlap.hpp"
18 #include "counttable.h"
20 class PcrSeqsCommand : public Command {
22 PcrSeqsCommand(string);
26 vector<string> setParameters();
27 string getCommandName() { return "pcr.seqs"; }
28 string getCommandCategory() { return "Sequence Processing"; }
30 string getHelpString();
31 string getOutputPattern(string);
32 string getCitation() { return "http://www.mothur.org/wiki/Pcr.seqs"; }
33 string getDescription() { return "pcr.seqs"; }
36 void help() { m->mothurOut(getHelpString()); }
41 unsigned long long start;
42 unsigned long long end;
43 linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
47 vector<linePair> lines;
48 bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
49 bool abort, keepprimer, keepdots;
50 string fastafile, oligosfile, taxfile, groupfile, namefile, countfile, ecolifile, outputDir, nomatch;
51 int start, end, processors, length;
53 vector<string> revPrimer, outputNames;
54 vector<string> primers;
56 int writeAccnos(set<string>);
57 int readName(set<string>&);
58 int readGroup(set<string>);
59 int readTax(set<string>);
60 int readCount(set<string>);
63 int driverPcr(string, string, string, set<string>&, linePair);
64 int createProcesses(string, string, string, set<string>&);
65 bool findForward(Sequence&, int&, int&);
66 bool findReverse(Sequence&, int&, int&);
67 bool isAligned(string, map<int, int>&);
68 bool compareDNASeq(string, string);
69 string reverseOligo(string);
72 /**************************************************************************************************/
73 //custom data structure for threads to use.
74 // This is passed by void pointer so it can be any data type
75 // that can be passed using a single void pointer (LPVOID).
78 string goodFasta, badFasta, oligosfile, ecolifile, nomatch;
79 unsigned long long fstart;
80 unsigned long long fend;
81 int count, start, end, length;
83 vector<string> primers;
84 vector<string> revPrimer;
85 set<string> badSeqNames;
86 bool keepprimer, keepdots;
90 pcrData(string f, string gf, string bfn, MothurOut* mout, string ol, string ec, vector<string> pr, vector<string> rpr, string nm, bool kp, bool kd, int st, int en, int l, unsigned long long fst, unsigned long long fen) {
110 /**************************************************************************************************/
111 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
113 static DWORD WINAPI MyPcrThreadFunction(LPVOID lpParam){
115 pDataArray = (pcrData*)lpParam;
119 pDataArray->m->openOutputFile(pDataArray->goodFasta, goodFile);
122 pDataArray->m->openOutputFile(pDataArray->badFasta, badFile);
125 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
127 //print header if you are process 0
128 if ((pDataArray->fstart == 0) || (pDataArray->fstart == 1)) {
130 }else { //this accounts for the difference in line endings.
131 inFASTA.seekg(pDataArray->fstart-1); pDataArray->m->gobble(inFASTA);
135 pDataArray->count = pDataArray->fend;
136 for(int i = 0; i < pDataArray->fend; i++){ //end is the number of sequences to process
138 if (pDataArray->m->control_pressed) { break; }
140 Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
142 string trashCode = "";
143 if (currSeq.getName() != "") {
146 if (pDataArray->oligosfile != "") {
147 map<int, int> mapAligned;
148 //bool aligned = isAligned(currSeq.getAligned(), mapAligned);
149 ///////////////////////////////////////////////////////////////
150 bool aligned = false;
151 string seq = currSeq.getAligned();
153 for (int k = 0; k < seq.length(); k++) {
154 if (!isalpha(seq[k])) { aligned = true; }
155 else { mapAligned[countBases] = k; countBases++; } //maps location in unaligned -> location in aligned.
156 } //ie. the 3rd base may be at spot 10 in the alignment
157 //later when we trim we want to trim from spot 10.
158 ///////////////////////////////////////////////////////////////
161 if (pDataArray->primers.size() != 0) {
162 int primerStart = 0; int primerEnd = 0;
163 //bool good = findForward(currSeq, primerStart, primerEnd);
164 ///////////////////////////////////////////////////////////////
166 string rawSequence = currSeq.getUnaligned();
168 for(int j=0;j<pDataArray->primers.size();j++){
169 string oligo = pDataArray->primers[j];
171 if (pDataArray->m->control_pressed) { primerStart = 0; primerEnd = 0; good = false; break; }
173 if(rawSequence.length() < oligo.length()) { break; }
176 int olength = oligo.length();
177 for (int l = 0; l < rawSequence.length()-olength; l++){
178 if (pDataArray->m->control_pressed) { primerStart = 0; primerEnd = 0; good = false; break; }
179 string rawChunk = rawSequence.substr(l, olength);
180 //compareDNASeq(oligo, rawChunk)
181 ////////////////////////////////////////////////////////
183 for(int k=0;k<olength;k++){
185 if(oligo[k] != rawChunk[k]){
186 if(oligo[k] == 'A' || oligo[k] == 'T' || oligo[k] == 'G' || oligo[k] == 'C') { success = 0; }
187 else if((oligo[k] == 'N' || oligo[k] == 'I') && (rawChunk[k] == 'N')) { success = 0; }
188 else if(oligo[k] == 'R' && (rawChunk[k] != 'A' && rawChunk[k] != 'G')) { success = 0; }
189 else if(oligo[k] == 'Y' && (rawChunk[k] != 'C' && rawChunk[k] != 'T')) { success = 0; }
190 else if(oligo[k] == 'M' && (rawChunk[k] != 'C' && rawChunk[k] != 'A')) { success = 0; }
191 else if(oligo[k] == 'K' && (rawChunk[k] != 'T' && rawChunk[k] != 'G')) { success = 0; }
192 else if(oligo[k] == 'W' && (rawChunk[k] != 'T' && rawChunk[k] != 'A')) { success = 0; }
193 else if(oligo[k] == 'S' && (rawChunk[k] != 'C' && rawChunk[k] != 'G')) { success = 0; }
194 else if(oligo[k] == 'B' && (rawChunk[k] != 'C' && rawChunk[k] != 'T' && rawChunk[k] != 'G')) { success = 0; }
195 else if(oligo[k] == 'D' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'G')) { success = 0; }
196 else if(oligo[k] == 'H' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'C')) { success = 0; }
197 else if(oligo[k] == 'V' && (rawChunk[k] != 'A' && rawChunk[k] != 'C' && rawChunk[k] != 'G')) { success = 0; }
199 if(success == 0) { break; }
206 ////////////////////////////////////////////////////////////////////
209 primerEnd = primerStart + olength;
216 if (!good) { primerStart = 0; primerEnd = 0; }
217 ///////////////////////////////////////////////////////////////
220 if(!good){ if (pDataArray->nomatch == "reject") { goodSeq = false; } trashCode += "f"; }
224 if (!pDataArray->keepprimer) {
225 if (pDataArray->keepdots) { currSeq.filterToPos(mapAligned[primerEnd]); }
226 else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerEnd])); }
229 if (pDataArray->keepdots) { currSeq.filterToPos(mapAligned[primerStart]); }
230 else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart])); }
233 if (!pDataArray->keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); }
234 else { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); }
239 //process reverse primers
240 if (pDataArray->revPrimer.size() != 0) {
241 int primerStart = 0; int primerEnd = 0;
243 //findReverse(currSeq, primerStart, primerEnd);
244 ///////////////////////////////////////////////////////////////
245 string rawSequence = currSeq.getUnaligned();
247 for(int j=0;j<pDataArray->revPrimer.size();j++){
248 string oligo = pDataArray->revPrimer[j];
249 if (pDataArray->m->control_pressed) { primerStart = 0; primerEnd = 0; good = false; break; }
250 if(rawSequence.length() < oligo.length()) { break; }
253 int olength = oligo.length();
254 for (int l = rawSequence.length()-olength; l >= 0; l--){
256 string rawChunk = rawSequence.substr(l, olength);
257 //compareDNASeq(oligo, rawChunk)
258 ////////////////////////////////////////////////////////
260 for(int k=0;k<olength;k++){
262 if(oligo[k] != rawChunk[k]){
263 if(oligo[k] == 'A' || oligo[k] == 'T' || oligo[k] == 'G' || oligo[k] == 'C') { success = 0; }
264 else if((oligo[k] == 'N' || oligo[k] == 'I') && (rawChunk[k] == 'N')) { success = 0; }
265 else if(oligo[k] == 'R' && (rawChunk[k] != 'A' && rawChunk[k] != 'G')) { success = 0; }
266 else if(oligo[k] == 'Y' && (rawChunk[k] != 'C' && rawChunk[k] != 'T')) { success = 0; }
267 else if(oligo[k] == 'M' && (rawChunk[k] != 'C' && rawChunk[k] != 'A')) { success = 0; }
268 else if(oligo[k] == 'K' && (rawChunk[k] != 'T' && rawChunk[k] != 'G')) { success = 0; }
269 else if(oligo[k] == 'W' && (rawChunk[k] != 'T' && rawChunk[k] != 'A')) { success = 0; }
270 else if(oligo[k] == 'S' && (rawChunk[k] != 'C' && rawChunk[k] != 'G')) { success = 0; }
271 else if(oligo[k] == 'B' && (rawChunk[k] != 'C' && rawChunk[k] != 'T' && rawChunk[k] != 'G')) { success = 0; }
272 else if(oligo[k] == 'D' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'G')) { success = 0; }
273 else if(oligo[k] == 'H' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'C')) { success = 0; }
274 else if(oligo[k] == 'V' && (rawChunk[k] != 'A' && rawChunk[k] != 'C' && rawChunk[k] != 'G')) { success = 0; }
276 if(success == 0) { break; }
283 ////////////////////////////////////////////////////////////////////
286 primerEnd = primerStart + olength;
293 if (!good) { primerStart = 0; primerEnd = 0; }
295 ///////////////////////////////////////////////////////////////
296 if(!good){ if (pDataArray->nomatch == "reject") { goodSeq = false; } trashCode += "r"; }
300 if (!pDataArray->keepprimer) {
301 if (pDataArray->keepdots) { currSeq.filterFromPos(mapAligned[primerStart]); }
302 else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerStart])); }
305 if (pDataArray->keepdots) { currSeq.filterFromPos(mapAligned[primerEnd]); }
306 else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerEnd])); }
309 if (!pDataArray->keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerStart)); }
310 else { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerEnd)); }
314 }else if (pDataArray->ecolifile != "") {
315 //make sure the seqs are aligned
316 lengths.insert(currSeq.getAligned().length());
317 if (lengths.size() > 1) { pDataArray->m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); pDataArray->m->control_pressed = true; break; }
318 else if (currSeq.getAligned().length() != pDataArray->length) {
319 pDataArray->m->mothurOut("[ERROR]: seqs are not the same length as ecoli seq. When using ecoli option your sequences must be aligned and the same length as the ecoli sequence.\n"); pDataArray->m->control_pressed = true; break;
321 if (pDataArray->keepdots) {
322 currSeq.filterToPos(pDataArray->start);
323 currSeq.filterFromPos(pDataArray->end);
325 string seqString = currSeq.getAligned().substr(0, pDataArray->end);
326 seqString = seqString.substr(pDataArray->start);
327 currSeq.setAligned(seqString);
330 }else{ //using start and end to trim
331 //make sure the seqs are aligned
332 lengths.insert(currSeq.getAligned().length());
333 if (lengths.size() > 1) { pDataArray->m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); pDataArray->m->control_pressed = true; break; }
335 if (pDataArray->end != -1) {
336 if (pDataArray->end > currSeq.getAligned().length()) { pDataArray->m->mothurOut("[ERROR]: end is longer than your sequence length, aborting.\n"); pDataArray->m->control_pressed = true; break; }
338 if (pDataArray->keepdots) { currSeq.filterFromPos(pDataArray->end); }
340 string seqString = currSeq.getAligned().substr(0, pDataArray->end);
341 currSeq.setAligned(seqString);
345 if (pDataArray->start != -1) {
346 if (pDataArray->keepdots) { currSeq.filterToPos(pDataArray->start); }
348 string seqString = currSeq.getAligned().substr(pDataArray->start);
349 currSeq.setAligned(seqString);
356 if(goodSeq == 1) { currSeq.printSequence(goodFile); }
358 pDataArray->badSeqNames.insert(currSeq.getName());
359 currSeq.setName(currSeq.getName() + '|' + trashCode);
360 currSeq.printSequence(badFile);
365 if((i+1) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(i+1)); pDataArray->m->mothurOutEndLine(); }
368 if((pDataArray->count) % 100 != 0){ pDataArray->m->mothurOut("Thread Processing sequence: " + toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
377 catch(exception& e) {
378 pDataArray->m->errorOut(e, "PcrSeqsCommand", "MyPcrThreadFunction");
385 /**************************************************************************************************/