]> git.donarmstrong.com Git - mothur.git/blob - trimseqscommand.h
removing chime source files from mother project.
[mothur.git] / trimseqscommand.h
1 #ifndef TRIMSEQSCOMMAND_H
2 #define TRIMSEQSCOMMAND_H
3
4 /*
5  *  trimseqscommand.h
6  *  Mothur
7  *
8  *  Created by Pat Schloss on 6/6/09.
9  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
10  *
11  */
12
13 #include "mothur.h"
14 #include "command.hpp"
15 #include "sequence.hpp"
16 #include "qualityscores.h"
17 #include "groupmap.h"
18 #include "trimoligos.h"
19
20
21 class TrimSeqsCommand : public Command {
22 public:
23         TrimSeqsCommand(string);
24         TrimSeqsCommand();
25         ~TrimSeqsCommand(){}
26         
27         vector<string> setParameters();
28         string getCommandName()                 { return "trim.seqs";   }
29         string getCommandCategory()             { return "Sequence Processing";         }
30         string getHelpString(); 
31         string getCitation() { return "http://www.mothur.org/wiki/Trim.seqs"; }
32         string getDescription()         { return "provides the preprocessing features needed to screen and sort pyrosequences"; }
33
34         int execute(); 
35         void help() { m->mothurOut(getHelpString()); }  
36         
37 private:
38         
39         GroupMap* groupMap;
40     
41     struct linePair {
42         unsigned long long start;
43         unsigned long long end;
44         linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
45         linePair() {}
46     };
47
48         bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
49         bool keepFirstTrim(Sequence&, QualityScores&);
50         bool removeLastTrim(Sequence&, QualityScores&);
51         bool cullLength(Sequence&);
52         bool cullHomoP(Sequence&);
53         bool cullAmbigs(Sequence&);
54     string reverseOligo(string);
55
56         bool abort, createGroup;
57         string fastaFile, oligoFile, qFileName, groupfile, nameFile, outputDir;
58         
59         bool flip, allFiles, qtrim, keepforward;
60         int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts;
61         int qWindowSize, qWindowStep, keepFirst, removeLast;
62         double qRollAverage, qThreshold, qWindowAverage, qAverage;
63         vector<string> revPrimer, outputNames;
64         set<string> filesToRemove;
65         map<string, int> barcodes;
66         vector<string> groupVector;
67         map<string, int> primers;
68     vector<string>  linker;
69     vector<string>  spacer;
70         map<string, int> combos;
71         map<string, int> groupToIndex;
72         vector<string> primerNameVector;        //needed here?
73         vector<string> barcodeNameVector;       //needed here?
74         map<string, int> groupCounts;  
75         map<string, string> nameMap;
76
77         vector<int> processIDS;   //processid
78         vector<linePair> lines;
79         vector<linePair> qLines;
80         
81         int driverCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair, linePair);    
82         int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >);
83         int setLines(string, string);
84 };
85
86 /**************************************************************************************************/
87 //custom data structure for threads to use.
88 // This is passed by void pointer so it can be any data type
89 // that can be passed using a single void pointer (LPVOID).
90 struct trimData {
91     unsigned long long start, end;
92     MothurOut* m;
93     string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, groupFileName, nameFile;
94         vector<vector<string> > fastaFileNames;
95     vector<vector<string> > qualFileNames;
96     vector<vector<string> > nameFileNames;
97     unsigned long long lineStart, lineEnd, qlineStart, qlineEnd;
98     bool flip, allFiles, qtrim, keepforward, createGroup;
99         int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
100         int qWindowSize, qWindowStep, keepFirst, removeLast, count;
101         double qRollAverage, qThreshold, qWindowAverage, qAverage;
102     vector<string> revPrimer;
103         map<string, int> barcodes;
104         map<string, int> primers;
105     vector<string>  linker;
106     vector<string>  spacer;
107         map<string, int> combos;
108         vector<string> primerNameVector;        
109         vector<string> barcodeNameVector;       
110         map<string, int> groupCounts;  
111         map<string, string> nameMap;
112     
113         trimData(){}
114         trimData(string fn, string qn, string nf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend,  MothurOut* mout,
115                       int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, vector<string> revP, vector<string> li, vector<string> spa, 
116                       vector<string> priNameVector, vector<string> barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
117                       int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage,
118                       int minL, int maxA, int maxH, int maxL, bool fli, map<string, string> nm) {
119         filename = fn;
120         qFileName = qn;
121         nameFile = nf;
122         trimFileName = tn;
123         scrapFileName = sn;
124         trimQFileName = tqn;
125         scrapQFileName = sqn;
126         trimNFileName = tnn;
127         scrapNFileName = snn;
128         groupFileName = gn;
129         fastaFileNames = ffn;
130         qualFileNames = qfn;
131         nameFileNames = nfn;
132         lineStart = lstart;
133         lineEnd = lend;
134         qlineStart = qstart;
135         qlineEnd = qend;
136                 m = mout;
137         
138         pdiffs = pd;
139         bdiffs = bd;
140         ldiffs = ld;
141         sdiffs = sd;
142         tdiffs = td;
143         barcodes = bar;
144         primers = pri;      numFPrimers = primers.size();
145         revPrimer = revP;   numRPrimers = revPrimer.size();
146         linker = li;        numLinkers = linker.size();
147         spacer = spa;       numSpacers = spacer.size();
148         primerNameVector = priNameVector;
149         barcodeNameVector = barNameVector;
150         
151         createGroup = cGroup;
152         allFiles = aFiles;
153         keepforward = keepF;
154         keepFirst = keepfi;
155         removeLast = removeL;
156         qWindowStep = WindowStep;
157         qWindowSize = WindowSize;
158         qWindowAverage = WindowAverage;
159         qtrim = trim;
160         qThreshold = Threshold;
161         qAverage = Average;
162         qRollAverage = RollAverage;
163         minLength = minL;
164         maxAmbig = maxA;
165         maxHomoP = maxH;
166         maxLength = maxL;
167         flip = fli;
168         nameMap = nm;
169         count = 0;
170         }
171 };
172 /**************************************************************************************************/
173 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
174 #else
175 static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ 
176         trimData* pDataArray;
177         pDataArray = (trimData*)lpParam;
178         
179         try {
180         ofstream trimFASTAFile;
181                 pDataArray->m->openOutputFile(pDataArray->trimFileName, trimFASTAFile);
182                 
183                 ofstream scrapFASTAFile;
184                 pDataArray->m->openOutputFile(pDataArray->scrapFileName, scrapFASTAFile);
185                 
186                 ofstream trimQualFile;
187                 ofstream scrapQualFile;
188                 if(pDataArray->qFileName != ""){
189                         pDataArray->m->openOutputFile(pDataArray->trimQFileName, trimQualFile);
190                         pDataArray->m->openOutputFile(pDataArray->scrapQFileName, scrapQualFile);
191                 }
192                 
193                 ofstream trimNameFile;
194                 ofstream scrapNameFile;
195                 if(pDataArray->nameFile != ""){
196                         pDataArray->m->openOutputFile(pDataArray->trimNFileName, trimNameFile);
197                         pDataArray->m->openOutputFile(pDataArray->scrapNFileName, scrapNameFile);
198                 }
199                 
200                 
201                 ofstream outGroupsFile;
202                 if (pDataArray->createGroup){   pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile);   }
203                 if(pDataArray->allFiles){
204                         for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
205                                 for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
206                                         if (pDataArray->fastaFileNames[i][j] != "") {
207                                                 ofstream temp;
208                                                 pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp);                  temp.close();
209                                                 if(pDataArray->qFileName != ""){
210                                                         pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp);                   temp.close();
211                                                 }
212                                                 
213                                                 if(pDataArray->nameFile != ""){
214                                                         pDataArray->m->openOutputFile(pDataArray->nameFileNames[i][j], temp);                   temp.close();
215                                                 }
216                                         }
217                                 }
218                         }
219                 }
220                 
221                 ifstream inFASTA;
222                 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
223                 if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) {
224                         inFASTA.seekg(0);
225                 }else { //this accounts for the difference in line endings. 
226                         inFASTA.seekg(pDataArray->lineStart-1); pDataArray->m->gobble(inFASTA); 
227                 }
228                 
229                 ifstream qFile;
230                 if(pDataArray->qFileName != "") {
231                         pDataArray->m->openInputFile(pDataArray->qFileName, qFile);
232                         if ((pDataArray->qlineStart == 0) || (pDataArray->qlineStart == 1)) {
233                 qFile.seekg(0);
234             }else { //this accounts for the difference in line endings. 
235                 qFile.seekg(pDataArray->qlineStart-1); pDataArray->m->gobble(qFile); 
236             } 
237                 }
238                 
239                 
240                 TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
241         
242                 pDataArray->count = pDataArray->lineEnd;
243                 for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process
244                                    
245                         if (pDataArray->m->control_pressed) { 
246                                 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
247                                 if (pDataArray->createGroup) {   outGroupsFile.close();   }
248                                 if(pDataArray->qFileName != ""){ qFile.close(); }
249                                 return 0;
250                         }
251                         
252                         int success = 1;
253                         string trashCode = "";
254                         int currentSeqsDiffs = 0;
255             
256                         Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
257                         
258                         QualityScores currQual;
259                         if(pDataArray->qFileName != ""){
260                                 currQual = QualityScores(qFile);  pDataArray->m->gobble(qFile);
261                         }
262                         
263                         string origSeq = currSeq.getUnaligned();
264                         if (origSeq != "") {
265                                 
266                                 int barcodeIndex = 0;
267                                 int primerIndex = 0;
268                                 
269                 if(pDataArray->numLinkers != 0){
270                                         success = trimOligos.stripLinker(currSeq, currQual);
271                                         if(success > pDataArray->ldiffs)                {       trashCode += 'k';       }
272                                         else{ currentSeqsDiffs += success;  }
273                                 }
274                 
275                                 if(pDataArray->barcodes.size() != 0){
276                                         success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
277                                         if(success > pDataArray->bdiffs)                {       trashCode += 'b';       }
278                                         else{ currentSeqsDiffs += success;  }
279                                 }
280                                 
281                 if(pDataArray->numSpacers != 0){
282                                         success = trimOligos.stripSpacer(currSeq, currQual);
283                                         if(success > pDataArray->sdiffs)                {       trashCode += 's';       }
284                                         else{ currentSeqsDiffs += success;  }
285
286                                 }
287                 
288                                 if(pDataArray->numFPrimers != 0){
289                                         success = trimOligos.stripForward(currSeq, currQual, primerIndex, pDataArray->keepforward);
290                                         if(success > pDataArray->pdiffs)                {       trashCode += 'f';       }
291                                         else{ currentSeqsDiffs += success;  }
292                                 }
293                                 
294                                 if (currentSeqsDiffs > pDataArray->tdiffs)      {       trashCode += 't';   }
295                                 
296                                 if(pDataArray->numRPrimers != 0){
297                                         success = trimOligos.stripReverse(currSeq, currQual);
298                                         if(!success)                            {       trashCode += 'r';       }
299                                 }
300                 
301                                 if(pDataArray->keepFirst != 0){
302                                         //success = keepFirstTrim(currSeq, currQual);
303                     success = 1;
304                     if(currQual.getName() != ""){
305                         currQual.trimQScores(-1, pDataArray->keepFirst);
306                     }
307                     currSeq.trim(pDataArray->keepFirst);
308                                 }
309                                 
310                                 if(pDataArray->removeLast != 0){
311                                         //success = removeLastTrim(currSeq, currQual);
312                     success = 0;
313                     int length = currSeq.getNumBases() - pDataArray->removeLast;
314                     
315                     if(length > 0){
316                         if(currQual.getName() != ""){
317                             currQual.trimQScores(-1, length);
318                         }
319                         currSeq.trim(length);
320                         success = 1;
321                     }
322                     else{ success = 0; }
323                     
324                                         if(!success)                            {       trashCode += 'l';       }
325                                 }
326                 
327                                 
328                                 if(pDataArray->qFileName != ""){
329                                         int origLength = currSeq.getNumBases();
330                                         
331                                         if(pDataArray->qThreshold != 0)                 {       success = currQual.stripQualThreshold(currSeq, pDataArray->qThreshold);                 }
332                                         else if(pDataArray->qAverage != 0)              {       success = currQual.cullQualAverage(currSeq, pDataArray->qAverage);                              }
333                                         else if(pDataArray->qRollAverage != 0)  {       success = currQual.stripQualRollingAverage(currSeq, pDataArray->qRollAverage);  }
334                                         else if(pDataArray->qWindowAverage != 0){       success = currQual.stripQualWindowAverage(currSeq, pDataArray->qWindowStep, pDataArray->qWindowSize, pDataArray->qWindowAverage);       }
335                                         else                                            {       success = 1;                            }
336                                         
337                                         //you don't want to trim, if it fails above then scrap it
338                                         if ((!pDataArray->qtrim) && (origLength != currSeq.getNumBases())) {  success = 0; }
339                                         
340                                         if(!success)                            {       trashCode += 'q';       }
341                                 }                               
342                 
343                                 if(pDataArray->minLength > 0 || pDataArray->maxLength > 0){
344                                         //success = cullLength(currSeq);
345                     int length = currSeq.getNumBases();
346                     success = 0;        //guilty until proven innocent
347                     if(length >= pDataArray->minLength && pDataArray->maxLength == 0)                   {       success = 1;    }
348                     else if(length >= pDataArray->minLength && length <= pDataArray->maxLength) {       success = 1;    }
349                     else                                                                                                {       success = 0;    }
350                     
351                                         if(!success)                            {       trashCode += 'l';       }
352                                 }
353                                 if(pDataArray->maxHomoP > 0){
354                                         //success = cullHomoP(currSeq);
355                     int longHomoP = currSeq.getLongHomoPolymer();
356                     success = 0;        //guilty until proven innocent
357                     if(longHomoP <= pDataArray->maxHomoP){      success = 1;    }
358                     else                                        {       success = 0;    }
359                     
360                                         if(!success)                            {       trashCode += 'h';       }
361                                 }
362                                 if(pDataArray->maxAmbig != -1){
363                                         //success = cullAmbigs(currSeq);
364                     int numNs = currSeq.getAmbigBases();
365                     success = 0;        //guilty until proven innocent
366                     if(numNs <= pDataArray->maxAmbig)   {       success = 1;    }
367                     else                                        {       success = 0;    }
368                                         if(!success)                            {       trashCode += 'n';       }
369                                 }
370                                 
371                                 if(pDataArray->flip){           // should go last                       
372                                         currSeq.reverseComplement();
373                                         if(pDataArray->qFileName != ""){
374                                                 currQual.flipQScores(); 
375                                         }
376                                 }
377                                 
378                                 if(trashCode.length() == 0){
379                                         currSeq.setAligned(currSeq.getUnaligned());
380                                         currSeq.printSequence(trimFASTAFile);
381                                         
382                                         if(pDataArray->qFileName != ""){
383                                                 currQual.printQScores(trimQualFile);
384                                         }
385                                         
386                                         if(pDataArray->nameFile != ""){
387                                                 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
388                                                 if (itName != pDataArray->nameMap.end()) {  trimNameFile << itName->first << '\t' << itName->second << endl; }
389                                                 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
390                                         }
391                                         
392                                         if (pDataArray->createGroup) {
393                                                 if(pDataArray->barcodes.size() != 0){
394                                                         string thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
395                                                         if (pDataArray->primers.size() != 0) { 
396                                                                 if (pDataArray->primerNameVector[primerIndex] != "") { 
397                                                                         if(thisGroup != "") {
398                                                                                 thisGroup += "." + pDataArray->primerNameVector[primerIndex]; 
399                                                                         }else {
400                                                                                 thisGroup = pDataArray->primerNameVector[primerIndex]; 
401                                                                         }
402                                                                 } 
403                                                         }
404                                                         
405                                                         outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
406                                                         
407                                                         if (pDataArray->nameFile != "") {
408                                                                 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
409                                                                 if (itName != pDataArray->nameMap.end()) { 
410                                                                         vector<string> thisSeqsNames; 
411                                                                         pDataArray->m->splitAtChar(itName->second, thisSeqsNames, ',');
412                                                                         for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
413                                                                                 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
414                                                                         }
415                                                                 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }                                                   
416                                                         }
417                                                         
418                                                         map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
419                                                         if (it == pDataArray->groupCounts.end()) {      pDataArray->groupCounts[thisGroup] = 1; }
420                                                         else { pDataArray->groupCounts[it->first]++; }
421                             
422                                                 }
423                                         }
424                                         
425                                         if(pDataArray->allFiles){
426                                                 ofstream output;
427                                                 pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
428                                                 currSeq.printSequence(output);
429                                                 output.close();
430                                                 
431                                                 if(pDataArray->qFileName != ""){
432                                                         pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
433                                                         currQual.printQScores(output);
434                                                         output.close();                                                 
435                                                 }
436                                                 
437                                                 if(pDataArray->nameFile != ""){
438                                                         map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
439                                                         if (itName != pDataArray->nameMap.end()) { 
440                                                                 pDataArray->m->openOutputFileAppend(pDataArray->nameFileNames[barcodeIndex][primerIndex], output);
441                                                                 output << itName->first << '\t' << itName->second << endl; 
442                                                                 output.close();
443                                                         }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
444                                                 }
445                                         }
446                                 }
447                                 else{
448                                         if(pDataArray->nameFile != ""){ //needs to be before the currSeq name is changed
449                                                 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
450                                                 if (itName != pDataArray->nameMap.end()) {  scrapNameFile << itName->first << '\t' << itName->second << endl; }
451                                                 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
452                                         }
453                                         currSeq.setName(currSeq.getName() + '|' + trashCode);
454                                         currSeq.setUnaligned(origSeq);
455                                         currSeq.setAligned(origSeq);
456                                         currSeq.printSequence(scrapFASTAFile);
457                                         if(pDataArray->qFileName != ""){
458                                                 currQual.printQScores(scrapQualFile);
459                                         }
460                                 }
461                                 
462                         }
463                         
464                         //report progress
465                         if((i) % 1000 == 0){    pDataArray->m->mothurOut(toString(i)); pDataArray->m->mothurOutEndLine();               }
466                         
467                 }
468                 //report progress
469                 if((pDataArray->count) % 1000 != 0){    pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine();               }
470                 
471                 
472                 inFASTA.close();
473                 trimFASTAFile.close();
474                 scrapFASTAFile.close();
475                 if (pDataArray->createGroup) {   outGroupsFile.close();   }
476                 if(pDataArray->qFileName != "") {       qFile.close();  scrapQualFile.close(); trimQualFile.close();    }
477                 if(pDataArray->nameFile != "")  {       scrapNameFile.close(); trimNameFile.close();    }
478                 
479         return 0;
480             
481         }
482         catch(exception& e) {
483             pDataArray->m->errorOut(e, "TrimSeqsCommand", "MyTrimThreadFunction");
484             exit(1);
485         }
486     } 
487 #endif
488     
489
490 /**************************************************************************************************/
491
492 #endif