]> git.donarmstrong.com Git - mothur.git/blob - trimseqscommand.h
added load.logfile command. changed summary.single output for subsample=t.
[mothur.git] / trimseqscommand.h
1 #ifndef TRIMSEQSCOMMAND_H
2 #define TRIMSEQSCOMMAND_H
3
4 /*
5  *  trimseqscommand.h
6  *  Mothur
7  *
8  *  Created by Pat Schloss on 6/6/09.
9  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
10  *
11  */
12
13 #include "mothur.h"
14 #include "command.hpp"
15 #include "sequence.hpp"
16 #include "qualityscores.h"
17 #include "groupmap.h"
18 #include "trimoligos.h"
19
20
21 class TrimSeqsCommand : public Command {
22 public:
23         TrimSeqsCommand(string);
24         TrimSeqsCommand();
25         ~TrimSeqsCommand(){}
26         
27         vector<string> setParameters();
28         string getCommandName()                 { return "trim.seqs";   }
29         string getCommandCategory()             { return "Sequence Processing";         }
30         string getOutputFileNameTag(string, string);
31         string getHelpString(); 
32         string getCitation() { return "http://www.mothur.org/wiki/Trim.seqs"; }
33         string getDescription()         { return "provides the preprocessing features needed to screen and sort pyrosequences"; }
34
35         int execute(); 
36         void help() { m->mothurOut(getHelpString()); }  
37         
38 private:
39         
40         GroupMap* groupMap;
41     
42     struct linePair {
43         unsigned long long start;
44         unsigned long long end;
45         linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
46         linePair() {}
47     };
48
49         bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
50         bool keepFirstTrim(Sequence&, QualityScores&);
51         bool removeLastTrim(Sequence&, QualityScores&);
52         bool cullLength(Sequence&);
53         bool cullHomoP(Sequence&);
54         bool cullAmbigs(Sequence&);
55     string reverseOligo(string);
56
57         bool abort, createGroup;
58         string fastaFile, oligoFile, qFileName, groupfile, nameFile, outputDir;
59         
60         bool flip, allFiles, qtrim, keepforward;
61         int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts;
62         int qWindowSize, qWindowStep, keepFirst, removeLast;
63         double qRollAverage, qThreshold, qWindowAverage, qAverage;
64         vector<string> revPrimer, outputNames;
65         set<string> filesToRemove;
66         map<string, int> barcodes;
67     map<string, int> rbarcodes;
68         vector<string> groupVector;
69         map<string, int> primers;
70     vector<string>  linker;
71     vector<string>  spacer;
72         map<string, int> combos;
73         map<string, int> groupToIndex;
74         vector<string> primerNameVector;        //needed here?
75         vector<string> barcodeNameVector;       //needed here?
76         map<string, int> groupCounts;  
77         map<string, string> nameMap;
78
79         vector<int> processIDS;   //processid
80         vector<linePair> lines;
81         vector<linePair> qLines;
82         
83         int driverCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair, linePair);    
84         int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >);
85         int setLines(string, string);
86 };
87
88 /**************************************************************************************************/
89 //custom data structure for threads to use.
90 // This is passed by void pointer so it can be any data type
91 // that can be passed using a single void pointer (LPVOID).
92 struct trimData {
93     unsigned long long start, end;
94     MothurOut* m;
95     string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, groupFileName, nameFile;
96         vector<vector<string> > fastaFileNames;
97     vector<vector<string> > qualFileNames;
98     vector<vector<string> > nameFileNames;
99     unsigned long long lineStart, lineEnd, qlineStart, qlineEnd;
100     bool flip, allFiles, qtrim, keepforward, createGroup;
101         int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
102         int qWindowSize, qWindowStep, keepFirst, removeLast, count;
103         double qRollAverage, qThreshold, qWindowAverage, qAverage;
104     vector<string> revPrimer;
105         map<string, int> barcodes;
106     map<string, int> rbarcodes;
107         map<string, int> primers;
108     vector<string>  linker;
109     vector<string>  spacer;
110         map<string, int> combos;
111         vector<string> primerNameVector;        
112         vector<string> barcodeNameVector;       
113         map<string, int> groupCounts;  
114         map<string, string> nameMap;
115     
116         trimData(){}
117         trimData(string fn, string qn, string nf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend,  MothurOut* mout,
118                       int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, map<string, int> rbar, vector<string> revP, vector<string> li, vector<string> spa, 
119                       vector<string> priNameVector, vector<string> barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
120                       int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage,
121                       int minL, int maxA, int maxH, int maxL, bool fli, map<string, string> nm) {
122         filename = fn;
123         qFileName = qn;
124         nameFile = nf;
125         trimFileName = tn;
126         scrapFileName = sn;
127         trimQFileName = tqn;
128         scrapQFileName = sqn;
129         trimNFileName = tnn;
130         scrapNFileName = snn;
131         groupFileName = gn;
132         fastaFileNames = ffn;
133         qualFileNames = qfn;
134         nameFileNames = nfn;
135         lineStart = lstart;
136         lineEnd = lend;
137         qlineStart = qstart;
138         qlineEnd = qend;
139                 m = mout;
140         
141         pdiffs = pd;
142         bdiffs = bd;
143         ldiffs = ld;
144         sdiffs = sd;
145         tdiffs = td;
146         barcodes = bar;
147         rbarcodes = rbar;
148         primers = pri;      numFPrimers = primers.size();
149         revPrimer = revP;   numRPrimers = revPrimer.size();
150         linker = li;        numLinkers = linker.size();
151         spacer = spa;       numSpacers = spacer.size();
152         primerNameVector = priNameVector;
153         barcodeNameVector = barNameVector;
154         
155         createGroup = cGroup;
156         allFiles = aFiles;
157         keepforward = keepF;
158         keepFirst = keepfi;
159         removeLast = removeL;
160         qWindowStep = WindowStep;
161         qWindowSize = WindowSize;
162         qWindowAverage = WindowAverage;
163         qtrim = trim;
164         qThreshold = Threshold;
165         qAverage = Average;
166         qRollAverage = RollAverage;
167         minLength = minL;
168         maxAmbig = maxA;
169         maxHomoP = maxH;
170         maxLength = maxL;
171         flip = fli;
172         nameMap = nm;
173         count = 0;
174         }
175 };
176 /**************************************************************************************************/
177 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
178 #else
179 static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ 
180         trimData* pDataArray;
181         pDataArray = (trimData*)lpParam;
182         
183         try {
184         ofstream trimFASTAFile;
185                 pDataArray->m->openOutputFile(pDataArray->trimFileName, trimFASTAFile);
186                 
187                 ofstream scrapFASTAFile;
188                 pDataArray->m->openOutputFile(pDataArray->scrapFileName, scrapFASTAFile);
189                 
190                 ofstream trimQualFile;
191                 ofstream scrapQualFile;
192                 if(pDataArray->qFileName != ""){
193                         pDataArray->m->openOutputFile(pDataArray->trimQFileName, trimQualFile);
194                         pDataArray->m->openOutputFile(pDataArray->scrapQFileName, scrapQualFile);
195                 }
196                 
197                 ofstream trimNameFile;
198                 ofstream scrapNameFile;
199                 if(pDataArray->nameFile != ""){
200                         pDataArray->m->openOutputFile(pDataArray->trimNFileName, trimNameFile);
201                         pDataArray->m->openOutputFile(pDataArray->scrapNFileName, scrapNameFile);
202                 }
203                 
204                 
205                 ofstream outGroupsFile;
206                 if (pDataArray->createGroup){   pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile);   }
207                 if(pDataArray->allFiles){
208                         for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
209                                 for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
210                                         if (pDataArray->fastaFileNames[i][j] != "") {
211                                                 ofstream temp;
212                                                 pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp);                  temp.close();
213                                                 if(pDataArray->qFileName != ""){
214                                                         pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp);                   temp.close();
215                                                 }
216                                                 
217                                                 if(pDataArray->nameFile != ""){
218                                                         pDataArray->m->openOutputFile(pDataArray->nameFileNames[i][j], temp);                   temp.close();
219                                                 }
220                                         }
221                                 }
222                         }
223                 }
224                 
225                 ifstream inFASTA;
226                 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
227                 if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) {
228                         inFASTA.seekg(0);
229                 }else { //this accounts for the difference in line endings. 
230                         inFASTA.seekg(pDataArray->lineStart-1); pDataArray->m->gobble(inFASTA); 
231                 }
232                 
233                 ifstream qFile;
234                 if(pDataArray->qFileName != "") {
235                         pDataArray->m->openInputFile(pDataArray->qFileName, qFile);
236                         if ((pDataArray->qlineStart == 0) || (pDataArray->qlineStart == 1)) {
237                 qFile.seekg(0);
238             }else { //this accounts for the difference in line endings. 
239                 qFile.seekg(pDataArray->qlineStart-1); pDataArray->m->gobble(qFile); 
240             } 
241                 }
242                 
243                 
244                 TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->rbarcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
245         
246                 pDataArray->count = pDataArray->lineEnd;
247                 for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process
248                                    
249                         if (pDataArray->m->control_pressed) { 
250                                 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
251                                 if (pDataArray->createGroup) {   outGroupsFile.close();   }
252                                 if(pDataArray->qFileName != ""){ qFile.close(); }
253                                 return 0;
254                         }
255                         
256                         int success = 1;
257                         string trashCode = "";
258                         int currentSeqsDiffs = 0;
259             
260                         Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
261                         
262                         QualityScores currQual;
263                         if(pDataArray->qFileName != ""){
264                                 currQual = QualityScores(qFile);  pDataArray->m->gobble(qFile);
265                         }
266                         
267                         string origSeq = currSeq.getUnaligned();
268                         if (origSeq != "") {
269                                 
270                                 int barcodeIndex = 0;
271                                 int primerIndex = 0;
272                                 
273                 if(pDataArray->numLinkers != 0){
274                                         success = trimOligos.stripLinker(currSeq, currQual);
275                                         if(success > pDataArray->ldiffs)                {       trashCode += 'k';       }
276                                         else{ currentSeqsDiffs += success;  }
277                                 }
278                 
279                                 if(pDataArray->barcodes.size() != 0){
280                                         success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
281                                         if(success > pDataArray->bdiffs)                {       trashCode += 'b';       }
282                                         else{ currentSeqsDiffs += success;  }
283                                 }
284                 
285                                 if(pDataArray->rbarcodes.size() != 0){
286                                         success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex);
287                                         if(success > pDataArray->bdiffs)                {       trashCode += 'b';       }
288                                         else{ currentSeqsDiffs += success;  }
289                                 }
290                 
291                 if(pDataArray->numSpacers != 0){
292                                         success = trimOligos.stripSpacer(currSeq, currQual);
293                                         if(success > pDataArray->sdiffs)                {       trashCode += 's';       }
294                                         else{ currentSeqsDiffs += success;  }
295
296                                 }
297                 
298                                 if(pDataArray->numFPrimers != 0){
299                                         success = trimOligos.stripForward(currSeq, currQual, primerIndex, pDataArray->keepforward);
300                                         if(success > pDataArray->pdiffs)                {       trashCode += 'f';       }
301                                         else{ currentSeqsDiffs += success;  }
302                                 }
303                                 
304                                 if (currentSeqsDiffs > pDataArray->tdiffs)      {       trashCode += 't';   }
305                                 
306                                 if(pDataArray->numRPrimers != 0){
307                                         success = trimOligos.stripReverse(currSeq, currQual);
308                                         if(!success)                            {       trashCode += 'r';       }
309                                 }
310                 
311                                 if(pDataArray->keepFirst != 0){
312                                         //success = keepFirstTrim(currSeq, currQual);
313                     success = 1;
314                     if(currQual.getName() != ""){
315                         currQual.trimQScores(-1, pDataArray->keepFirst);
316                     }
317                     currSeq.trim(pDataArray->keepFirst);
318                                 }
319                                 
320                                 if(pDataArray->removeLast != 0){
321                                         //success = removeLastTrim(currSeq, currQual);
322                     success = 0;
323                     int length = currSeq.getNumBases() - pDataArray->removeLast;
324                     
325                     if(length > 0){
326                         if(currQual.getName() != ""){
327                             currQual.trimQScores(-1, length);
328                         }
329                         currSeq.trim(length);
330                         success = 1;
331                     }
332                     else{ success = 0; }
333                     
334                                         if(!success)                            {       trashCode += 'l';       }
335                                 }
336                 
337                                 
338                                 if(pDataArray->qFileName != ""){
339                                         int origLength = currSeq.getNumBases();
340                                         
341                                         if(pDataArray->qThreshold != 0)                 {       success = currQual.stripQualThreshold(currSeq, pDataArray->qThreshold);                 }
342                                         else if(pDataArray->qAverage != 0)              {       success = currQual.cullQualAverage(currSeq, pDataArray->qAverage);                              }
343                                         else if(pDataArray->qRollAverage != 0)  {       success = currQual.stripQualRollingAverage(currSeq, pDataArray->qRollAverage);  }
344                                         else if(pDataArray->qWindowAverage != 0){       success = currQual.stripQualWindowAverage(currSeq, pDataArray->qWindowStep, pDataArray->qWindowSize, pDataArray->qWindowAverage);       }
345                                         else                                            {       success = 1;                            }
346                                         
347                                         //you don't want to trim, if it fails above then scrap it
348                                         if ((!pDataArray->qtrim) && (origLength != currSeq.getNumBases())) {  success = 0; }
349                                         
350                                         if(!success)                            {       trashCode += 'q';       }
351                                 }                               
352                 
353                                 if(pDataArray->minLength > 0 || pDataArray->maxLength > 0){
354                                         //success = cullLength(currSeq);
355                     int length = currSeq.getNumBases();
356                     success = 0;        //guilty until proven innocent
357                     if(length >= pDataArray->minLength && pDataArray->maxLength == 0)                   {       success = 1;    }
358                     else if(length >= pDataArray->minLength && length <= pDataArray->maxLength) {       success = 1;    }
359                     else                                                                                                {       success = 0;    }
360                     
361                                         if(!success)                            {       trashCode += 'l';       }
362                                 }
363                                 if(pDataArray->maxHomoP > 0){
364                                         //success = cullHomoP(currSeq);
365                     int longHomoP = currSeq.getLongHomoPolymer();
366                     success = 0;        //guilty until proven innocent
367                     if(longHomoP <= pDataArray->maxHomoP){      success = 1;    }
368                     else                                        {       success = 0;    }
369                     
370                                         if(!success)                            {       trashCode += 'h';       }
371                                 }
372                                 if(pDataArray->maxAmbig != -1){
373                                         //success = cullAmbigs(currSeq);
374                     int numNs = currSeq.getAmbigBases();
375                     success = 0;        //guilty until proven innocent
376                     if(numNs <= pDataArray->maxAmbig)   {       success = 1;    }
377                     else                                        {       success = 0;    }
378                                         if(!success)                            {       trashCode += 'n';       }
379                                 }
380                                 
381                                 if(pDataArray->flip){           // should go last                       
382                                         currSeq.reverseComplement();
383                                         if(pDataArray->qFileName != ""){
384                                                 currQual.flipQScores(); 
385                                         }
386                                 }
387                                 
388                                 if(trashCode.length() == 0){
389                                         currSeq.setAligned(currSeq.getUnaligned());
390                                         currSeq.printSequence(trimFASTAFile);
391                                         
392                                         if(pDataArray->qFileName != ""){
393                                                 currQual.printQScores(trimQualFile);
394                                         }
395                                         
396                                         if(pDataArray->nameFile != ""){
397                                                 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
398                                                 if (itName != pDataArray->nameMap.end()) {  trimNameFile << itName->first << '\t' << itName->second << endl; }
399                                                 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
400                                         }
401                                         
402                                         if (pDataArray->createGroup) {
403                                                 if(pDataArray->barcodes.size() != 0){
404                                                         string thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
405                                                         if (pDataArray->primers.size() != 0) { 
406                                                                 if (pDataArray->primerNameVector[primerIndex] != "") { 
407                                                                         if(thisGroup != "") {
408                                                                                 thisGroup += "." + pDataArray->primerNameVector[primerIndex]; 
409                                                                         }else {
410                                                                                 thisGroup = pDataArray->primerNameVector[primerIndex]; 
411                                                                         }
412                                                                 } 
413                                                         }
414                                                         
415                                                         outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
416                                                         
417                             int numRedundants = 0;
418                                                         if (pDataArray->nameFile != "") {
419                                                                 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
420                                                                 if (itName != pDataArray->nameMap.end()) { 
421                                                                         vector<string> thisSeqsNames; 
422                                                                         pDataArray->m->splitAtChar(itName->second, thisSeqsNames, ',');
423                                     numRedundants = thisSeqsNames.size()-1; //we already include ourselves below
424                                                                         for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
425                                                                                 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
426                                                                         }
427                                                                 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }                                                   
428                                                         }
429                                                         
430                                                         map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
431                                                         if (it == pDataArray->groupCounts.end()) {      pDataArray->groupCounts[thisGroup] = 1 + numRedundants; }
432                                                         else { pDataArray->groupCounts[it->first] += (1 + numRedundants); }
433                             
434                                                 }
435                                         }
436                                         
437                                         if(pDataArray->allFiles){
438                                                 ofstream output;
439                                                 pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
440                                                 currSeq.printSequence(output);
441                                                 output.close();
442                                                 
443                                                 if(pDataArray->qFileName != ""){
444                                                         pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
445                                                         currQual.printQScores(output);
446                                                         output.close();                                                 
447                                                 }
448                                                 
449                                                 if(pDataArray->nameFile != ""){
450                                                         map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
451                                                         if (itName != pDataArray->nameMap.end()) { 
452                                                                 pDataArray->m->openOutputFileAppend(pDataArray->nameFileNames[barcodeIndex][primerIndex], output);
453                                                                 output << itName->first << '\t' << itName->second << endl; 
454                                                                 output.close();
455                                                         }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
456                                                 }
457                                         }
458                                 }
459                                 else{
460                                         if(pDataArray->nameFile != ""){ //needs to be before the currSeq name is changed
461                                                 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
462                                                 if (itName != pDataArray->nameMap.end()) {  scrapNameFile << itName->first << '\t' << itName->second << endl; }
463                                                 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
464                                         }
465                                         currSeq.setName(currSeq.getName() + '|' + trashCode);
466                                         currSeq.setUnaligned(origSeq);
467                                         currSeq.setAligned(origSeq);
468                                         currSeq.printSequence(scrapFASTAFile);
469                                         if(pDataArray->qFileName != ""){
470                                                 currQual.printQScores(scrapQualFile);
471                                         }
472                                 }
473                                 
474                         }
475                         
476                         //report progress
477                         if((i) % 1000 == 0){    pDataArray->m->mothurOut(toString(i)); pDataArray->m->mothurOutEndLine();               }
478                         
479                 }
480                 //report progress
481                 if((pDataArray->count) % 1000 != 0){    pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine();               }
482                 
483                 
484                 inFASTA.close();
485                 trimFASTAFile.close();
486                 scrapFASTAFile.close();
487                 if (pDataArray->createGroup) {   outGroupsFile.close();   }
488                 if(pDataArray->qFileName != "") {       qFile.close();  scrapQualFile.close(); trimQualFile.close();    }
489                 if(pDataArray->nameFile != "")  {       scrapNameFile.close(); trimNameFile.close();    }
490                 
491         return 0;
492             
493         }
494         catch(exception& e) {
495             pDataArray->m->errorOut(e, "TrimSeqsCommand", "MyTrimThreadFunction");
496             exit(1);
497         }
498     } 
499 #endif
500     
501
502 /**************************************************************************************************/
503
504 #endif