]> git.donarmstrong.com Git - mothur.git/blob - trimseqscommand.h
5006ce923d00183b0bc22ed08adbfaa6240df247
[mothur.git] / trimseqscommand.h
1 #ifndef TRIMSEQSCOMMAND_H
2 #define TRIMSEQSCOMMAND_H
3
4 /*
5  *  trimseqscommand.h
6  *  Mothur
7  *
8  *  Created by Pat Schloss on 6/6/09.
9  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
10  *
11  */
12
13 #include "mothur.h"
14 #include "command.hpp"
15 #include "sequence.hpp"
16 #include "qualityscores.h"
17 #include "groupmap.h"
18 #include "trimoligos.h"
19
20
21 class TrimSeqsCommand : public Command {
22 public:
23         TrimSeqsCommand(string);
24         TrimSeqsCommand();
25         ~TrimSeqsCommand(){}
26         
27         vector<string> setParameters();
28         string getCommandName()                 { return "trim.seqs";   }
29         string getCommandCategory()             { return "Sequence Processing";         }
30         string getHelpString(); 
31         string getCitation() { return "http://www.mothur.org/wiki/Trim.seqs"; }
32         string getDescription()         { return "provides the preprocessing features needed to screen and sort pyrosequences"; }
33
34         int execute(); 
35         void help() { m->mothurOut(getHelpString()); }  
36         
37 private:
38         
39         GroupMap* groupMap;
40     
41     struct linePair {
42         unsigned long long start;
43         unsigned long long end;
44         linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
45         linePair() {}
46     };
47
48         bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
49         bool keepFirstTrim(Sequence&, QualityScores&);
50         bool removeLastTrim(Sequence&, QualityScores&);
51         bool cullLength(Sequence&);
52         bool cullHomoP(Sequence&);
53         bool cullAmbigs(Sequence&);
54
55         bool abort, createGroup;
56         string fastaFile, oligoFile, qFileName, groupfile, nameFile, outputDir;
57         
58         bool flip, allFiles, qtrim, keepforward;
59         int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts;
60         int qWindowSize, qWindowStep, keepFirst, removeLast;
61         double qRollAverage, qThreshold, qWindowAverage, qAverage;
62         vector<string> revPrimer, outputNames;
63         set<string> filesToRemove;
64         map<string, int> barcodes;
65         vector<string> groupVector;
66         map<string, int> primers;
67     vector<string>  linker;
68     vector<string>  spacer;
69         map<string, int> combos;
70         map<string, int> groupToIndex;
71         vector<string> primerNameVector;        //needed here?
72         vector<string> barcodeNameVector;       //needed here?
73         map<string, int> groupCounts;  
74         map<string, string> nameMap;
75
76         vector<int> processIDS;   //processid
77         vector<linePair> lines;
78         vector<linePair> qLines;
79         
80         int driverCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair, linePair);    
81         int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >);
82         int setLines(string, string);
83 };
84
85 /**************************************************************************************************/
86 //custom data structure for threads to use.
87 // This is passed by void pointer so it can be any data type
88 // that can be passed using a single void pointer (LPVOID).
89 struct trimData {
90     unsigned long long start, end;
91     MothurOut* m;
92     string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, groupFileName, nameFile;
93         vector<vector<string> > fastaFileNames;
94     vector<vector<string> > qualFileNames;
95     vector<vector<string> > nameFileNames;
96     unsigned long long lineStart, lineEnd, qlineStart, qlineEnd;
97     bool flip, allFiles, qtrim, keepforward, createGroup;
98         int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
99         int qWindowSize, qWindowStep, keepFirst, removeLast, count;
100         double qRollAverage, qThreshold, qWindowAverage, qAverage;
101     vector<string> revPrimer;
102         map<string, int> barcodes;
103         map<string, int> primers;
104     vector<string>  linker;
105     vector<string>  spacer;
106         map<string, int> combos;
107         vector<string> primerNameVector;        
108         vector<string> barcodeNameVector;       
109         map<string, int> groupCounts;  
110         map<string, string> nameMap;
111     
112         trimData(){}
113         trimData(string fn, string qn, string nf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend,  MothurOut* mout,
114                       int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, vector<string> revP, vector<string> li, vector<string> spa, 
115                       vector<string> priNameVector, vector<string> barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
116                       int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage,
117                       int minL, int maxA, int maxH, int maxL, bool fli, map<string, string> nm) {
118         filename = fn;
119         qFileName = qn;
120         nameFile = nf;
121         trimFileName = tn;
122         scrapFileName = sn;
123         trimQFileName = tqn;
124         scrapQFileName = sqn;
125         trimNFileName = tnn;
126         scrapNFileName = snn;
127         groupFileName = gn;
128         fastaFileNames = ffn;
129         qualFileNames = qfn;
130         nameFileNames = nfn;
131         lineStart = lstart;
132         lineEnd = lend;
133         qlineStart = qstart;
134         qlineEnd = qend;
135                 m = mout;
136         
137         pdiffs = pd;
138         bdiffs = bd;
139         ldiffs = ld;
140         sdiffs = sd;
141         tdiffs = td;
142         barcodes = bar;
143         primers = pri;      numFPrimers = primers.size();
144         revPrimer = revP;   numRPrimers = revPrimer.size();
145         linker = li;        numLinkers = linker.size();
146         spacer = spa;       numSpacers = spacer.size();
147         primerNameVector = priNameVector;
148         barcodeNameVector = barNameVector;
149         
150         createGroup = cGroup;
151         allFiles = aFiles;
152         keepforward = keepF;
153         keepFirst = keepfi;
154         removeLast = removeL;
155         qWindowStep = WindowStep;
156         qWindowSize = WindowSize;
157         qWindowAverage = WindowAverage;
158         qtrim = trim;
159         qThreshold = Threshold;
160         qAverage = Average;
161         qRollAverage = RollAverage;
162         minLength = minL;
163         maxAmbig = maxA;
164         maxHomoP = maxH;
165         maxLength = maxL;
166         flip = fli;
167         nameMap = nm;
168         count = 0;
169         }
170 };
171 /**************************************************************************************************/
172 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
173 #else
174 static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ 
175         trimData* pDataArray;
176         pDataArray = (trimData*)lpParam;
177         
178         try {
179         ofstream trimFASTAFile;
180                 pDataArray->m->openOutputFile(pDataArray->trimFileName, trimFASTAFile);
181                 
182                 ofstream scrapFASTAFile;
183                 pDataArray->m->openOutputFile(pDataArray->scrapFileName, scrapFASTAFile);
184                 
185                 ofstream trimQualFile;
186                 ofstream scrapQualFile;
187                 if(pDataArray->qFileName != ""){
188                         pDataArray->m->openOutputFile(pDataArray->trimQFileName, trimQualFile);
189                         pDataArray->m->openOutputFile(pDataArray->scrapQFileName, scrapQualFile);
190                 }
191                 
192                 ofstream trimNameFile;
193                 ofstream scrapNameFile;
194                 if(pDataArray->nameFile != ""){
195                         pDataArray->m->openOutputFile(pDataArray->trimNFileName, trimNameFile);
196                         pDataArray->m->openOutputFile(pDataArray->scrapNFileName, scrapNameFile);
197                 }
198                 
199                 
200                 ofstream outGroupsFile;
201                 if (pDataArray->createGroup){   pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile);   }
202                 if(pDataArray->allFiles){
203                         for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
204                                 for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
205                                         if (pDataArray->fastaFileNames[i][j] != "") {
206                                                 ofstream temp;
207                                                 pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp);                  temp.close();
208                                                 if(pDataArray->qFileName != ""){
209                                                         pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp);                   temp.close();
210                                                 }
211                                                 
212                                                 if(pDataArray->nameFile != ""){
213                                                         pDataArray->m->openOutputFile(pDataArray->nameFileNames[i][j], temp);                   temp.close();
214                                                 }
215                                         }
216                                 }
217                         }
218                 }
219                 
220                 ifstream inFASTA;
221                 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
222                 if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) {
223                         inFASTA.seekg(0);
224                 }else { //this accounts for the difference in line endings. 
225                         inFASTA.seekg(pDataArray->lineStart-1); pDataArray->m->gobble(inFASTA); 
226                 }
227                 
228                 ifstream qFile;
229                 if(pDataArray->qFileName != "") {
230                         pDataArray->m->openInputFile(pDataArray->qFileName, qFile);
231                         if ((pDataArray->qlineStart == 0) || (pDataArray->qlineStart == 1)) {
232                 qFile.seekg(0);
233             }else { //this accounts for the difference in line endings. 
234                 qFile.seekg(pDataArray->qlineStart-1); pDataArray->m->gobble(qFile); 
235             } 
236                 }
237                 
238                 
239                 TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
240         
241                 pDataArray->count = pDataArray->lineEnd;
242                 for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process
243                                    
244                         if (pDataArray->m->control_pressed) { 
245                                 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
246                                 if (pDataArray->createGroup) {   outGroupsFile.close();   }
247                                 if(pDataArray->qFileName != ""){ qFile.close(); }
248                                 return 0;
249                         }
250                         
251                         int success = 1;
252                         string trashCode = "";
253                         int currentSeqsDiffs = 0;
254             
255                         Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
256                         
257                         QualityScores currQual;
258                         if(pDataArray->qFileName != ""){
259                                 currQual = QualityScores(qFile);  pDataArray->m->gobble(qFile);
260                         }
261                         
262                         string origSeq = currSeq.getUnaligned();
263                         if (origSeq != "") {
264                                 
265                                 int barcodeIndex = 0;
266                                 int primerIndex = 0;
267                                 
268                 if(pDataArray->numLinkers != 0){
269                                         success = trimOligos.stripLinker(currSeq, currQual);
270                                         if(success > pDataArray->ldiffs)                {       trashCode += 'k';       }
271                                         else{ currentSeqsDiffs += success;  }
272                                 }
273                 
274                                 if(pDataArray->barcodes.size() != 0){
275                                         success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
276                                         if(success > pDataArray->bdiffs)                {       trashCode += 'b';       }
277                                         else{ currentSeqsDiffs += success;  }
278                                 }
279                                 
280                 if(pDataArray->numSpacers != 0){
281                                         success = trimOligos.stripSpacer(currSeq, currQual);
282                                         if(success > pDataArray->sdiffs)                {       trashCode += 's';       }
283                                         else{ currentSeqsDiffs += success;  }
284
285                                 }
286                 
287                                 if(pDataArray->numFPrimers != 0){
288                                         success = trimOligos.stripForward(currSeq, currQual, primerIndex, pDataArray->keepforward);
289                                         if(success > pDataArray->pdiffs)                {       trashCode += 'f';       }
290                                         else{ currentSeqsDiffs += success;  }
291                                 }
292                                 
293                                 if (currentSeqsDiffs > pDataArray->tdiffs)      {       trashCode += 't';   }
294                                 
295                                 if(pDataArray->numRPrimers != 0){
296                                         success = trimOligos.stripReverse(currSeq, currQual);
297                                         if(!success)                            {       trashCode += 'r';       }
298                                 }
299                 
300                                 if(pDataArray->keepFirst != 0){
301                                         //success = keepFirstTrim(currSeq, currQual);
302                     success = 1;
303                     if(currQual.getName() != ""){
304                         currQual.trimQScores(-1, pDataArray->keepFirst);
305                     }
306                     currSeq.trim(pDataArray->keepFirst);
307                                 }
308                                 
309                                 if(pDataArray->removeLast != 0){
310                                         //success = removeLastTrim(currSeq, currQual);
311                     success = 0;
312                     int length = currSeq.getNumBases() - pDataArray->removeLast;
313                     
314                     if(length > 0){
315                         if(currQual.getName() != ""){
316                             currQual.trimQScores(-1, length);
317                         }
318                         currSeq.trim(length);
319                         success = 1;
320                     }
321                     else{ success = 0; }
322                     
323                                         if(!success)                            {       trashCode += 'l';       }
324                                 }
325                 
326                                 
327                                 if(pDataArray->qFileName != ""){
328                                         int origLength = currSeq.getNumBases();
329                                         
330                                         if(pDataArray->qThreshold != 0)                 {       success = currQual.stripQualThreshold(currSeq, pDataArray->qThreshold);                 }
331                                         else if(pDataArray->qAverage != 0)              {       success = currQual.cullQualAverage(currSeq, pDataArray->qAverage);                              }
332                                         else if(pDataArray->qRollAverage != 0)  {       success = currQual.stripQualRollingAverage(currSeq, pDataArray->qRollAverage);  }
333                                         else if(pDataArray->qWindowAverage != 0){       success = currQual.stripQualWindowAverage(currSeq, pDataArray->qWindowStep, pDataArray->qWindowSize, pDataArray->qWindowAverage);       }
334                                         else                                            {       success = 1;                            }
335                                         
336                                         //you don't want to trim, if it fails above then scrap it
337                                         if ((!pDataArray->qtrim) && (origLength != currSeq.getNumBases())) {  success = 0; }
338                                         
339                                         if(!success)                            {       trashCode += 'q';       }
340                                 }                               
341                 
342                                 if(pDataArray->minLength > 0 || pDataArray->maxLength > 0){
343                                         //success = cullLength(currSeq);
344                     int length = currSeq.getNumBases();
345                     success = 0;        //guilty until proven innocent
346                     if(length >= pDataArray->minLength && pDataArray->maxLength == 0)                   {       success = 1;    }
347                     else if(length >= pDataArray->minLength && length <= pDataArray->maxLength) {       success = 1;    }
348                     else                                                                                                {       success = 0;    }
349                     
350                                         if(!success)                            {       trashCode += 'l';       }
351                                 }
352                                 if(pDataArray->maxHomoP > 0){
353                                         //success = cullHomoP(currSeq);
354                     int longHomoP = currSeq.getLongHomoPolymer();
355                     success = 0;        //guilty until proven innocent
356                     if(longHomoP <= pDataArray->maxHomoP){      success = 1;    }
357                     else                                        {       success = 0;    }
358                     
359                                         if(!success)                            {       trashCode += 'h';       }
360                                 }
361                                 if(pDataArray->maxAmbig != -1){
362                                         //success = cullAmbigs(currSeq);
363                     int numNs = currSeq.getAmbigBases();
364                     success = 0;        //guilty until proven innocent
365                     if(numNs <= pDataArray->maxAmbig)   {       success = 1;    }
366                     else                                        {       success = 0;    }
367                                         if(!success)                            {       trashCode += 'n';       }
368                                 }
369                                 
370                                 if(pDataArray->flip){           // should go last                       
371                                         currSeq.reverseComplement();
372                                         if(pDataArray->qFileName != ""){
373                                                 currQual.flipQScores(); 
374                                         }
375                                 }
376                                 
377                                 if(trashCode.length() == 0){
378                                         currSeq.setAligned(currSeq.getUnaligned());
379                                         currSeq.printSequence(trimFASTAFile);
380                                         
381                                         if(pDataArray->qFileName != ""){
382                                                 currQual.printQScores(trimQualFile);
383                                         }
384                                         
385                                         if(pDataArray->nameFile != ""){
386                                                 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
387                                                 if (itName != pDataArray->nameMap.end()) {  trimNameFile << itName->first << '\t' << itName->second << endl; }
388                                                 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
389                                         }
390                                         
391                                         if (pDataArray->createGroup) {
392                                                 if(pDataArray->barcodes.size() != 0){
393                                                         string thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
394                                                         if (pDataArray->primers.size() != 0) { 
395                                                                 if (pDataArray->primerNameVector[primerIndex] != "") { 
396                                                                         if(thisGroup != "") {
397                                                                                 thisGroup += "." + pDataArray->primerNameVector[primerIndex]; 
398                                                                         }else {
399                                                                                 thisGroup = pDataArray->primerNameVector[primerIndex]; 
400                                                                         }
401                                                                 } 
402                                                         }
403                                                         
404                                                         outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
405                                                         
406                                                         if (pDataArray->nameFile != "") {
407                                                                 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
408                                                                 if (itName != pDataArray->nameMap.end()) { 
409                                                                         vector<string> thisSeqsNames; 
410                                                                         pDataArray->m->splitAtChar(itName->second, thisSeqsNames, ',');
411                                                                         for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
412                                                                                 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
413                                                                         }
414                                                                 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }                                                   
415                                                         }
416                                                         
417                                                         map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
418                                                         if (it == pDataArray->groupCounts.end()) {      pDataArray->groupCounts[thisGroup] = 1; }
419                                                         else { pDataArray->groupCounts[it->first]++; }
420                             
421                                                 }
422                                         }
423                                         
424                                         if(pDataArray->allFiles){
425                                                 ofstream output;
426                                                 pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
427                                                 currSeq.printSequence(output);
428                                                 output.close();
429                                                 
430                                                 if(pDataArray->qFileName != ""){
431                                                         pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
432                                                         currQual.printQScores(output);
433                                                         output.close();                                                 
434                                                 }
435                                                 
436                                                 if(pDataArray->nameFile != ""){
437                                                         map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
438                                                         if (itName != pDataArray->nameMap.end()) { 
439                                                                 pDataArray->m->openOutputFileAppend(pDataArray->nameFileNames[barcodeIndex][primerIndex], output);
440                                                                 output << itName->first << '\t' << itName->second << endl; 
441                                                                 output.close();
442                                                         }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
443                                                 }
444                                         }
445                                 }
446                                 else{
447                                         if(pDataArray->nameFile != ""){ //needs to be before the currSeq name is changed
448                                                 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
449                                                 if (itName != pDataArray->nameMap.end()) {  scrapNameFile << itName->first << '\t' << itName->second << endl; }
450                                                 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
451                                         }
452                                         currSeq.setName(currSeq.getName() + '|' + trashCode);
453                                         currSeq.setUnaligned(origSeq);
454                                         currSeq.setAligned(origSeq);
455                                         currSeq.printSequence(scrapFASTAFile);
456                                         if(pDataArray->qFileName != ""){
457                                                 currQual.printQScores(scrapQualFile);
458                                         }
459                                 }
460                                 
461                         }
462                         
463                         //report progress
464                         if((i) % 1000 == 0){    pDataArray->m->mothurOut(toString(i)); pDataArray->m->mothurOutEndLine();               }
465                         
466                 }
467                 //report progress
468                 if((pDataArray->count) % 1000 != 0){    pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine();               }
469                 
470                 
471                 inFASTA.close();
472                 trimFASTAFile.close();
473                 scrapFASTAFile.close();
474                 if (pDataArray->createGroup) {   outGroupsFile.close();   }
475                 if(pDataArray->qFileName != "") {       qFile.close();  scrapQualFile.close(); trimQualFile.close();    }
476                 if(pDataArray->nameFile != "")  {       scrapNameFile.close(); trimNameFile.close();    }
477                 
478         return 0;
479             
480         }
481         catch(exception& e) {
482             pDataArray->m->errorOut(e, "TrimSeqsCommand", "MyTrimThreadFunction");
483             exit(1);
484         }
485     } 
486 #endif
487     
488
489 /**************************************************************************************************/
490
491 #endif