]> git.donarmstrong.com Git - mothur.git/blob - trimseqscommand.h
ba4e61411b8498820139e8ae2b1ed71cb37b0432
[mothur.git] / trimseqscommand.h
1 #ifndef TRIMSEQSCOMMAND_H
2 #define TRIMSEQSCOMMAND_H
3
4 /*
5  *  trimseqscommand.h
6  *  Mothur
7  *
8  *  Created by Pat Schloss on 6/6/09.
9  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
10  *
11  */
12
13 #include "mothur.h"
14 #include "command.hpp"
15 #include "sequence.hpp"
16 #include "qualityscores.h"
17 #include "groupmap.h"
18 #include "trimoligos.h"
19
20
21 class TrimSeqsCommand : public Command {
22 public:
23         TrimSeqsCommand(string);
24         TrimSeqsCommand();
25         ~TrimSeqsCommand(){}
26         
27         vector<string> setParameters();
28         string getCommandName()                 { return "trim.seqs";   }
29         string getCommandCategory()             { return "Sequence Processing";         }
30         string getHelpString(); 
31         string getCitation() { return "http://www.mothur.org/wiki/Trim.seqs"; }
32         string getDescription()         { return "provides the preprocessing features needed to screen and sort pyrosequences"; }
33
34         int execute(); 
35         void help() { m->mothurOut(getHelpString()); }  
36         
37 private:
38         
39         GroupMap* groupMap;
40     
41     struct linePair {
42         unsigned long long start;
43         unsigned long long end;
44         linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
45         linePair() {}
46     };
47
48         bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
49         bool keepFirstTrim(Sequence&, QualityScores&);
50         bool removeLastTrim(Sequence&, QualityScores&);
51         bool cullLength(Sequence&);
52         bool cullHomoP(Sequence&);
53         bool cullAmbigs(Sequence&);
54     string reverseOligo(string);
55
56         bool abort, createGroup;
57         string fastaFile, oligoFile, qFileName, groupfile, nameFile, outputDir;
58         
59         bool flip, allFiles, qtrim, keepforward;
60         int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts;
61         int qWindowSize, qWindowStep, keepFirst, removeLast;
62         double qRollAverage, qThreshold, qWindowAverage, qAverage;
63         vector<string> revPrimer, outputNames;
64         set<string> filesToRemove;
65         map<string, int> barcodes;
66     map<string, int> rbarcodes;
67         vector<string> groupVector;
68         map<string, int> primers;
69     vector<string>  linker;
70     vector<string>  spacer;
71         map<string, int> combos;
72         map<string, int> groupToIndex;
73         vector<string> primerNameVector;        //needed here?
74         vector<string> barcodeNameVector;       //needed here?
75         map<string, int> groupCounts;  
76         map<string, string> nameMap;
77
78         vector<int> processIDS;   //processid
79         vector<linePair> lines;
80         vector<linePair> qLines;
81         
82         int driverCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair, linePair);    
83         int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >);
84         int setLines(string, string);
85 };
86
87 /**************************************************************************************************/
88 //custom data structure for threads to use.
89 // This is passed by void pointer so it can be any data type
90 // that can be passed using a single void pointer (LPVOID).
91 struct trimData {
92     unsigned long long start, end;
93     MothurOut* m;
94     string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, groupFileName, nameFile;
95         vector<vector<string> > fastaFileNames;
96     vector<vector<string> > qualFileNames;
97     vector<vector<string> > nameFileNames;
98     unsigned long long lineStart, lineEnd, qlineStart, qlineEnd;
99     bool flip, allFiles, qtrim, keepforward, createGroup;
100         int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
101         int qWindowSize, qWindowStep, keepFirst, removeLast, count;
102         double qRollAverage, qThreshold, qWindowAverage, qAverage;
103     vector<string> revPrimer;
104         map<string, int> barcodes;
105     map<string, int> rbarcodes;
106         map<string, int> primers;
107     vector<string>  linker;
108     vector<string>  spacer;
109         map<string, int> combos;
110         vector<string> primerNameVector;        
111         vector<string> barcodeNameVector;       
112         map<string, int> groupCounts;  
113         map<string, string> nameMap;
114     
115         trimData(){}
116         trimData(string fn, string qn, string nf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend,  MothurOut* mout,
117                       int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, map<string, int> rbar, vector<string> revP, vector<string> li, vector<string> spa, 
118                       vector<string> priNameVector, vector<string> barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
119                       int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage,
120                       int minL, int maxA, int maxH, int maxL, bool fli, map<string, string> nm) {
121         filename = fn;
122         qFileName = qn;
123         nameFile = nf;
124         trimFileName = tn;
125         scrapFileName = sn;
126         trimQFileName = tqn;
127         scrapQFileName = sqn;
128         trimNFileName = tnn;
129         scrapNFileName = snn;
130         groupFileName = gn;
131         fastaFileNames = ffn;
132         qualFileNames = qfn;
133         nameFileNames = nfn;
134         lineStart = lstart;
135         lineEnd = lend;
136         qlineStart = qstart;
137         qlineEnd = qend;
138                 m = mout;
139         
140         pdiffs = pd;
141         bdiffs = bd;
142         ldiffs = ld;
143         sdiffs = sd;
144         tdiffs = td;
145         barcodes = bar;
146         rbarcodes = rbar;
147         primers = pri;      numFPrimers = primers.size();
148         revPrimer = revP;   numRPrimers = revPrimer.size();
149         linker = li;        numLinkers = linker.size();
150         spacer = spa;       numSpacers = spacer.size();
151         primerNameVector = priNameVector;
152         barcodeNameVector = barNameVector;
153         
154         createGroup = cGroup;
155         allFiles = aFiles;
156         keepforward = keepF;
157         keepFirst = keepfi;
158         removeLast = removeL;
159         qWindowStep = WindowStep;
160         qWindowSize = WindowSize;
161         qWindowAverage = WindowAverage;
162         qtrim = trim;
163         qThreshold = Threshold;
164         qAverage = Average;
165         qRollAverage = RollAverage;
166         minLength = minL;
167         maxAmbig = maxA;
168         maxHomoP = maxH;
169         maxLength = maxL;
170         flip = fli;
171         nameMap = nm;
172         count = 0;
173         }
174 };
175 /**************************************************************************************************/
176 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
177 #else
178 static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ 
179         trimData* pDataArray;
180         pDataArray = (trimData*)lpParam;
181         
182         try {
183         ofstream trimFASTAFile;
184                 pDataArray->m->openOutputFile(pDataArray->trimFileName, trimFASTAFile);
185                 
186                 ofstream scrapFASTAFile;
187                 pDataArray->m->openOutputFile(pDataArray->scrapFileName, scrapFASTAFile);
188                 
189                 ofstream trimQualFile;
190                 ofstream scrapQualFile;
191                 if(pDataArray->qFileName != ""){
192                         pDataArray->m->openOutputFile(pDataArray->trimQFileName, trimQualFile);
193                         pDataArray->m->openOutputFile(pDataArray->scrapQFileName, scrapQualFile);
194                 }
195                 
196                 ofstream trimNameFile;
197                 ofstream scrapNameFile;
198                 if(pDataArray->nameFile != ""){
199                         pDataArray->m->openOutputFile(pDataArray->trimNFileName, trimNameFile);
200                         pDataArray->m->openOutputFile(pDataArray->scrapNFileName, scrapNameFile);
201                 }
202                 
203                 
204                 ofstream outGroupsFile;
205                 if (pDataArray->createGroup){   pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile);   }
206                 if(pDataArray->allFiles){
207                         for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
208                                 for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
209                                         if (pDataArray->fastaFileNames[i][j] != "") {
210                                                 ofstream temp;
211                                                 pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp);                  temp.close();
212                                                 if(pDataArray->qFileName != ""){
213                                                         pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp);                   temp.close();
214                                                 }
215                                                 
216                                                 if(pDataArray->nameFile != ""){
217                                                         pDataArray->m->openOutputFile(pDataArray->nameFileNames[i][j], temp);                   temp.close();
218                                                 }
219                                         }
220                                 }
221                         }
222                 }
223                 
224                 ifstream inFASTA;
225                 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
226                 if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) {
227                         inFASTA.seekg(0);
228                 }else { //this accounts for the difference in line endings. 
229                         inFASTA.seekg(pDataArray->lineStart-1); pDataArray->m->gobble(inFASTA); 
230                 }
231                 
232                 ifstream qFile;
233                 if(pDataArray->qFileName != "") {
234                         pDataArray->m->openInputFile(pDataArray->qFileName, qFile);
235                         if ((pDataArray->qlineStart == 0) || (pDataArray->qlineStart == 1)) {
236                 qFile.seekg(0);
237             }else { //this accounts for the difference in line endings. 
238                 qFile.seekg(pDataArray->qlineStart-1); pDataArray->m->gobble(qFile); 
239             } 
240                 }
241                 
242                 
243                 TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->rbarcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
244         
245                 pDataArray->count = pDataArray->lineEnd;
246                 for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process
247                                    
248                         if (pDataArray->m->control_pressed) { 
249                                 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
250                                 if (pDataArray->createGroup) {   outGroupsFile.close();   }
251                                 if(pDataArray->qFileName != ""){ qFile.close(); }
252                                 return 0;
253                         }
254                         
255                         int success = 1;
256                         string trashCode = "";
257                         int currentSeqsDiffs = 0;
258             
259                         Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
260                         
261                         QualityScores currQual;
262                         if(pDataArray->qFileName != ""){
263                                 currQual = QualityScores(qFile);  pDataArray->m->gobble(qFile);
264                         }
265                         
266                         string origSeq = currSeq.getUnaligned();
267                         if (origSeq != "") {
268                                 
269                                 int barcodeIndex = 0;
270                                 int primerIndex = 0;
271                                 
272                 if(pDataArray->numLinkers != 0){
273                                         success = trimOligos.stripLinker(currSeq, currQual);
274                                         if(success > pDataArray->ldiffs)                {       trashCode += 'k';       }
275                                         else{ currentSeqsDiffs += success;  }
276                                 }
277                 
278                                 if(pDataArray->barcodes.size() != 0){
279                                         success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
280                                         if(success > pDataArray->bdiffs)                {       trashCode += 'b';       }
281                                         else{ currentSeqsDiffs += success;  }
282                                 }
283                 
284                                 if(pDataArray->rbarcodes.size() != 0){
285                                         success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex);
286                                         if(success > pDataArray->bdiffs)                {       trashCode += 'b';       }
287                                         else{ currentSeqsDiffs += success;  }
288                                 }
289                 
290                 if(pDataArray->numSpacers != 0){
291                                         success = trimOligos.stripSpacer(currSeq, currQual);
292                                         if(success > pDataArray->sdiffs)                {       trashCode += 's';       }
293                                         else{ currentSeqsDiffs += success;  }
294
295                                 }
296                 
297                                 if(pDataArray->numFPrimers != 0){
298                                         success = trimOligos.stripForward(currSeq, currQual, primerIndex, pDataArray->keepforward);
299                                         if(success > pDataArray->pdiffs)                {       trashCode += 'f';       }
300                                         else{ currentSeqsDiffs += success;  }
301                                 }
302                                 
303                                 if (currentSeqsDiffs > pDataArray->tdiffs)      {       trashCode += 't';   }
304                                 
305                                 if(pDataArray->numRPrimers != 0){
306                                         success = trimOligos.stripReverse(currSeq, currQual);
307                                         if(!success)                            {       trashCode += 'r';       }
308                                 }
309                 
310                                 if(pDataArray->keepFirst != 0){
311                                         //success = keepFirstTrim(currSeq, currQual);
312                     success = 1;
313                     if(currQual.getName() != ""){
314                         currQual.trimQScores(-1, pDataArray->keepFirst);
315                     }
316                     currSeq.trim(pDataArray->keepFirst);
317                                 }
318                                 
319                                 if(pDataArray->removeLast != 0){
320                                         //success = removeLastTrim(currSeq, currQual);
321                     success = 0;
322                     int length = currSeq.getNumBases() - pDataArray->removeLast;
323                     
324                     if(length > 0){
325                         if(currQual.getName() != ""){
326                             currQual.trimQScores(-1, length);
327                         }
328                         currSeq.trim(length);
329                         success = 1;
330                     }
331                     else{ success = 0; }
332                     
333                                         if(!success)                            {       trashCode += 'l';       }
334                                 }
335                 
336                                 
337                                 if(pDataArray->qFileName != ""){
338                                         int origLength = currSeq.getNumBases();
339                                         
340                                         if(pDataArray->qThreshold != 0)                 {       success = currQual.stripQualThreshold(currSeq, pDataArray->qThreshold);                 }
341                                         else if(pDataArray->qAverage != 0)              {       success = currQual.cullQualAverage(currSeq, pDataArray->qAverage);                              }
342                                         else if(pDataArray->qRollAverage != 0)  {       success = currQual.stripQualRollingAverage(currSeq, pDataArray->qRollAverage);  }
343                                         else if(pDataArray->qWindowAverage != 0){       success = currQual.stripQualWindowAverage(currSeq, pDataArray->qWindowStep, pDataArray->qWindowSize, pDataArray->qWindowAverage);       }
344                                         else                                            {       success = 1;                            }
345                                         
346                                         //you don't want to trim, if it fails above then scrap it
347                                         if ((!pDataArray->qtrim) && (origLength != currSeq.getNumBases())) {  success = 0; }
348                                         
349                                         if(!success)                            {       trashCode += 'q';       }
350                                 }                               
351                 
352                                 if(pDataArray->minLength > 0 || pDataArray->maxLength > 0){
353                                         //success = cullLength(currSeq);
354                     int length = currSeq.getNumBases();
355                     success = 0;        //guilty until proven innocent
356                     if(length >= pDataArray->minLength && pDataArray->maxLength == 0)                   {       success = 1;    }
357                     else if(length >= pDataArray->minLength && length <= pDataArray->maxLength) {       success = 1;    }
358                     else                                                                                                {       success = 0;    }
359                     
360                                         if(!success)                            {       trashCode += 'l';       }
361                                 }
362                                 if(pDataArray->maxHomoP > 0){
363                                         //success = cullHomoP(currSeq);
364                     int longHomoP = currSeq.getLongHomoPolymer();
365                     success = 0;        //guilty until proven innocent
366                     if(longHomoP <= pDataArray->maxHomoP){      success = 1;    }
367                     else                                        {       success = 0;    }
368                     
369                                         if(!success)                            {       trashCode += 'h';       }
370                                 }
371                                 if(pDataArray->maxAmbig != -1){
372                                         //success = cullAmbigs(currSeq);
373                     int numNs = currSeq.getAmbigBases();
374                     success = 0;        //guilty until proven innocent
375                     if(numNs <= pDataArray->maxAmbig)   {       success = 1;    }
376                     else                                        {       success = 0;    }
377                                         if(!success)                            {       trashCode += 'n';       }
378                                 }
379                                 
380                                 if(pDataArray->flip){           // should go last                       
381                                         currSeq.reverseComplement();
382                                         if(pDataArray->qFileName != ""){
383                                                 currQual.flipQScores(); 
384                                         }
385                                 }
386                                 
387                                 if(trashCode.length() == 0){
388                                         currSeq.setAligned(currSeq.getUnaligned());
389                                         currSeq.printSequence(trimFASTAFile);
390                                         
391                                         if(pDataArray->qFileName != ""){
392                                                 currQual.printQScores(trimQualFile);
393                                         }
394                                         
395                                         if(pDataArray->nameFile != ""){
396                                                 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
397                                                 if (itName != pDataArray->nameMap.end()) {  trimNameFile << itName->first << '\t' << itName->second << endl; }
398                                                 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
399                                         }
400                                         
401                                         if (pDataArray->createGroup) {
402                                                 if(pDataArray->barcodes.size() != 0){
403                                                         string thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
404                                                         if (pDataArray->primers.size() != 0) { 
405                                                                 if (pDataArray->primerNameVector[primerIndex] != "") { 
406                                                                         if(thisGroup != "") {
407                                                                                 thisGroup += "." + pDataArray->primerNameVector[primerIndex]; 
408                                                                         }else {
409                                                                                 thisGroup = pDataArray->primerNameVector[primerIndex]; 
410                                                                         }
411                                                                 } 
412                                                         }
413                                                         
414                                                         outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
415                                                         
416                                                         if (pDataArray->nameFile != "") {
417                                                                 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
418                                                                 if (itName != pDataArray->nameMap.end()) { 
419                                                                         vector<string> thisSeqsNames; 
420                                                                         pDataArray->m->splitAtChar(itName->second, thisSeqsNames, ',');
421                                                                         for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
422                                                                                 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
423                                                                         }
424                                                                 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }                                                   
425                                                         }
426                                                         
427                                                         map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
428                                                         if (it == pDataArray->groupCounts.end()) {      pDataArray->groupCounts[thisGroup] = 1; }
429                                                         else { pDataArray->groupCounts[it->first]++; }
430                             
431                                                 }
432                                         }
433                                         
434                                         if(pDataArray->allFiles){
435                                                 ofstream output;
436                                                 pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
437                                                 currSeq.printSequence(output);
438                                                 output.close();
439                                                 
440                                                 if(pDataArray->qFileName != ""){
441                                                         pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
442                                                         currQual.printQScores(output);
443                                                         output.close();                                                 
444                                                 }
445                                                 
446                                                 if(pDataArray->nameFile != ""){
447                                                         map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
448                                                         if (itName != pDataArray->nameMap.end()) { 
449                                                                 pDataArray->m->openOutputFileAppend(pDataArray->nameFileNames[barcodeIndex][primerIndex], output);
450                                                                 output << itName->first << '\t' << itName->second << endl; 
451                                                                 output.close();
452                                                         }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
453                                                 }
454                                         }
455                                 }
456                                 else{
457                                         if(pDataArray->nameFile != ""){ //needs to be before the currSeq name is changed
458                                                 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
459                                                 if (itName != pDataArray->nameMap.end()) {  scrapNameFile << itName->first << '\t' << itName->second << endl; }
460                                                 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
461                                         }
462                                         currSeq.setName(currSeq.getName() + '|' + trashCode);
463                                         currSeq.setUnaligned(origSeq);
464                                         currSeq.setAligned(origSeq);
465                                         currSeq.printSequence(scrapFASTAFile);
466                                         if(pDataArray->qFileName != ""){
467                                                 currQual.printQScores(scrapQualFile);
468                                         }
469                                 }
470                                 
471                         }
472                         
473                         //report progress
474                         if((i) % 1000 == 0){    pDataArray->m->mothurOut(toString(i)); pDataArray->m->mothurOutEndLine();               }
475                         
476                 }
477                 //report progress
478                 if((pDataArray->count) % 1000 != 0){    pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine();               }
479                 
480                 
481                 inFASTA.close();
482                 trimFASTAFile.close();
483                 scrapFASTAFile.close();
484                 if (pDataArray->createGroup) {   outGroupsFile.close();   }
485                 if(pDataArray->qFileName != "") {       qFile.close();  scrapQualFile.close(); trimQualFile.close();    }
486                 if(pDataArray->nameFile != "")  {       scrapNameFile.close(); trimNameFile.close();    }
487                 
488         return 0;
489             
490         }
491         catch(exception& e) {
492             pDataArray->m->errorOut(e, "TrimSeqsCommand", "MyTrimThreadFunction");
493             exit(1);
494         }
495     } 
496 #endif
497     
498
499 /**************************************************************************************************/
500
501 #endif