1 #ifndef TRIMSEQSCOMMAND_H
2 #define TRIMSEQSCOMMAND_H
8 * Created by Pat Schloss on 6/6/09.
9 * Copyright 2009 Patrick D. Schloss. All rights reserved.
14 #include "command.hpp"
15 #include "sequence.hpp"
16 #include "qualityscores.h"
17 #include "trimoligos.h"
18 #include "counttable.h"
21 class TrimSeqsCommand : public Command {
23 TrimSeqsCommand(string);
27 vector<string> setParameters();
28 string getCommandName() { return "trim.seqs"; }
29 string getCommandCategory() { return "Sequence Processing"; }
31 string getHelpString();
32 string getOutputPattern(string);
33 string getCitation() { return "http://www.mothur.org/wiki/Trim.seqs"; }
34 string getDescription() { return "provides the preprocessing features needed to screen and sort pyrosequences"; }
37 void help() { m->mothurOut(getHelpString()); }
41 unsigned long long start;
42 unsigned long long end;
43 linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
47 bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
48 bool keepFirstTrim(Sequence&, QualityScores&);
49 bool removeLastTrim(Sequence&, QualityScores&);
50 bool cullLength(Sequence&);
51 bool cullHomoP(Sequence&);
52 bool cullAmbigs(Sequence&);
53 string reverseOligo(string);
55 bool abort, createGroup;
56 string fastaFile, oligoFile, qFileName, groupfile, nameFile, countfile, outputDir;
58 bool flip, allFiles, qtrim, keepforward, pairedOligos, reorient;
59 int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts;
60 int qWindowSize, qWindowStep, keepFirst, removeLast;
61 double qRollAverage, qThreshold, qWindowAverage, qAverage;
62 vector<string> revPrimer, outputNames;
63 set<string> filesToRemove;
64 map<int, oligosPair> pairedBarcodes;
65 map<int, oligosPair> pairedPrimers;
66 map<string, int> barcodes;
67 vector<string> groupVector;
68 map<string, int> primers;
69 vector<string> linker;
70 vector<string> spacer;
71 map<string, int> combos;
72 map<string, int> groupToIndex;
73 vector<string> primerNameVector; //needed here?
74 vector<string> barcodeNameVector; //needed here?
75 map<string, int> groupCounts;
76 map<string, string> nameMap;
77 map<string, int> nameCount; //for countfile name -> repCount
78 map<string, string> groupMap; //for countfile name -> group
80 vector<int> processIDS; //processid
81 vector<linePair> lines;
82 vector<linePair> qLines;
84 int driverCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair, linePair);
85 int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >);
86 int setLines(string, string);
89 /**************************************************************************************************/
90 //custom data structure for threads to use.
91 // This is passed by void pointer so it can be any data type
92 // that can be passed using a single void pointer (LPVOID).
94 unsigned long long start, end;
96 string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, trimCFileName, scrapCFileName, groupFileName, nameFile, countfile;
97 vector<vector<string> > fastaFileNames;
98 vector<vector<string> > qualFileNames;
99 vector<vector<string> > nameFileNames;
100 unsigned long long lineStart, lineEnd, qlineStart, qlineEnd;
101 bool flip, allFiles, qtrim, keepforward, createGroup, pairedOligos, reorient;
102 int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
103 int qWindowSize, qWindowStep, keepFirst, removeLast, count;
104 double qRollAverage, qThreshold, qWindowAverage, qAverage;
105 vector<string> revPrimer;
106 map<string, int> barcodes;
107 map<string, int> primers;
108 map<string, int> nameCount;
109 vector<string> linker;
110 vector<string> spacer;
111 map<string, int> combos;
112 vector<string> primerNameVector;
113 vector<string> barcodeNameVector;
114 map<string, int> groupCounts;
115 map<string, string> nameMap;
116 map<string, string> groupMap;
117 map<int, oligosPair> pairedBarcodes;
118 map<int, oligosPair> pairedPrimers;
121 trimData(string fn, string qn, string nf, string cf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string tcn, string scn,string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout,
122 int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, vector<string> revP, vector<string> li, vector<string> spa, map<int, oligosPair> pbr, map<int, oligosPair> ppr, bool po,
123 vector<string> priNameVector, vector<string> barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
124 int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage,
125 int minL, int maxA, int maxH, int maxL, bool fli, bool reo, map<string, string> nm, map<string, int> ncount) {
133 scrapQFileName = sqn;
135 scrapNFileName = snn;
137 scrapCFileName = scn;
139 fastaFileNames = ffn;
156 pairedBarcodes = pbr;
158 primers = pri; numFPrimers = primers.size();
159 revPrimer = revP; numRPrimers = revPrimer.size();
160 linker = li; numLinkers = linker.size();
161 spacer = spa; numSpacers = spacer.size();
162 primerNameVector = priNameVector;
163 barcodeNameVector = barNameVector;
165 createGroup = cGroup;
169 removeLast = removeL;
170 qWindowStep = WindowStep;
171 qWindowSize = WindowSize;
172 qWindowAverage = WindowAverage;
174 qThreshold = Threshold;
176 qRollAverage = RollAverage;
187 /**************************************************************************************************/
188 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
190 static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
191 trimData* pDataArray;
192 pDataArray = (trimData*)lpParam;
195 ofstream trimFASTAFile;
196 pDataArray->m->openOutputFile(pDataArray->trimFileName, trimFASTAFile);
198 ofstream scrapFASTAFile;
199 pDataArray->m->openOutputFile(pDataArray->scrapFileName, scrapFASTAFile);
201 ofstream trimQualFile;
202 ofstream scrapQualFile;
203 if(pDataArray->qFileName != ""){
204 pDataArray->m->openOutputFile(pDataArray->trimQFileName, trimQualFile);
205 pDataArray->m->openOutputFile(pDataArray->scrapQFileName, scrapQualFile);
208 ofstream trimNameFile;
209 ofstream scrapNameFile;
210 if(pDataArray->nameFile != ""){
211 pDataArray->m->openOutputFile(pDataArray->trimNFileName, trimNameFile);
212 pDataArray->m->openOutputFile(pDataArray->scrapNFileName, scrapNameFile);
216 ofstream outGroupsFile;
217 if ((pDataArray->createGroup) && (pDataArray->countfile == "")){ pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile); }
218 if(pDataArray->allFiles){
219 for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
220 for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
221 if (pDataArray->fastaFileNames[i][j] != "") {
223 pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp); temp.close();
224 if(pDataArray->qFileName != ""){
225 pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp); temp.close();
228 if(pDataArray->nameFile != ""){
229 pDataArray->m->openOutputFile(pDataArray->nameFileNames[i][j], temp); temp.close();
236 ofstream trimCountFile;
237 ofstream scrapCountFile;
238 if(pDataArray->countfile != ""){
239 pDataArray->m->openOutputFile(pDataArray->trimCFileName, trimCountFile);
240 pDataArray->m->openOutputFile(pDataArray->scrapCFileName, scrapCountFile);
241 if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) { trimCountFile << "Representative_Sequence\ttotal" << endl; scrapCountFile << "Representative_Sequence\ttotal" << endl; }
245 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
246 if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) {
248 }else { //this accounts for the difference in line endings.
249 inFASTA.seekg(pDataArray->lineStart-1); pDataArray->m->gobble(inFASTA);
253 if(pDataArray->qFileName != "") {
254 pDataArray->m->openInputFile(pDataArray->qFileName, qFile);
255 if ((pDataArray->qlineStart == 0) || (pDataArray->qlineStart == 1)) {
257 }else { //this accounts for the difference in line endings.
258 qFile.seekg(pDataArray->qlineStart-1); pDataArray->m->gobble(qFile);
262 TrimOligos* trimOligos = NULL;
263 int numBarcodes = pDataArray->barcodes.size();
264 if (pDataArray->pairedOligos) { trimOligos = new TrimOligos(pDataArray->pdiffs, pDataArray->bdiffs, 0, 0, pDataArray->pairedPrimers, pDataArray->pairedBarcodes); numBarcodes = pDataArray->pairedBarcodes.size(); pDataArray->numFPrimers = pDataArray->pairedPrimers.size(); }
265 else { trimOligos = new TrimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer); }
267 TrimOligos* rtrimOligos = NULL;
268 if (pDataArray->reorient) {
269 //create reoriented primer and barcode pairs
270 map<int, oligosPair> rpairedPrimers, rpairedBarcodes;
271 for (map<int, oligosPair>::iterator it = pDataArray->pairedPrimers.begin(); it != pDataArray->pairedPrimers.end(); it++) {
272 oligosPair tempPair(trimOligos->reverseOligo((it->second).reverse), (trimOligos->reverseOligo((it->second).forward))); //reversePrimer, rc ForwardPrimer
273 rpairedPrimers[it->first] = tempPair;
275 for (map<int, oligosPair>::iterator it = pDataArray->pairedBarcodes.begin(); it != pDataArray->pairedBarcodes.end(); it++) {
276 oligosPair tempPair(trimOligos->reverseOligo((it->second).reverse), (trimOligos->reverseOligo((it->second).forward))); //reverseBarcode, rc ForwardBarcode
277 rpairedBarcodes[it->first] = tempPair;
279 rtrimOligos = new TrimOligos(pDataArray->pdiffs, pDataArray->bdiffs, 0, 0, rpairedPrimers, rpairedBarcodes); numBarcodes = rpairedBarcodes.size();
282 pDataArray->count = 0;
283 for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process
285 if (pDataArray->m->control_pressed) {
286 delete trimOligos; if (pDataArray->reorient) { delete rtrimOligos; }
287 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
288 if ((pDataArray->createGroup) && (pDataArray->countfile == "")) { outGroupsFile.close(); }
289 if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
290 if(pDataArray->nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
291 if(pDataArray->countfile != "") { scrapCountFile.close(); trimCountFile.close(); }
293 if(pDataArray->qFileName != ""){ qFile.close(); }
298 string trashCode = "";
299 int currentSeqsDiffs = 0;
301 Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
303 QualityScores currQual;
304 if(pDataArray->qFileName != ""){
305 currQual = QualityScores(qFile); pDataArray->m->gobble(qFile);
308 string origSeq = currSeq.getUnaligned();
312 int barcodeIndex = 0;
315 if(pDataArray->numLinkers != 0){
316 success = trimOligos->stripLinker(currSeq, currQual);
317 if(success > pDataArray->ldiffs) { trashCode += 'k'; }
318 else{ currentSeqsDiffs += success; }
321 if(numBarcodes != 0){
322 success = trimOligos->stripBarcode(currSeq, currQual, barcodeIndex);
323 if(success > pDataArray->bdiffs) { trashCode += 'b'; }
324 else{ currentSeqsDiffs += success; }
327 if(pDataArray->numSpacers != 0){
328 success = trimOligos->stripSpacer(currSeq, currQual);
329 if(success > pDataArray->sdiffs) { trashCode += 's'; }
330 else{ currentSeqsDiffs += success; }
334 if(pDataArray->numFPrimers != 0){
335 success = trimOligos->stripForward(currSeq, currQual, primerIndex, pDataArray->keepforward);
336 if(success > pDataArray->pdiffs) { trashCode += 'f'; }
337 else{ currentSeqsDiffs += success; }
340 if (currentSeqsDiffs > pDataArray->tdiffs) { trashCode += 't'; }
342 if(pDataArray->numRPrimers != 0){
343 success = trimOligos->stripReverse(currSeq, currQual);
344 if(!success) { trashCode += 'r'; }
347 if (pDataArray->reorient && (trashCode != "")) { //if you failed and want to check the reverse
349 string thisTrashCode = "";
350 int thisCurrentSeqsDiffs = 0;
352 int thisBarcodeIndex = 0;
353 int thisPrimerIndex = 0;
355 if(numBarcodes != 0){
356 thisSuccess = rtrimOligos->stripBarcode(currSeq, currQual, thisBarcodeIndex);
357 if(thisSuccess > pDataArray->bdiffs) { thisTrashCode += 'b'; }
358 else{ thisCurrentSeqsDiffs += thisSuccess; }
361 if(pDataArray->numFPrimers != 0){
362 thisSuccess = rtrimOligos->stripForward(currSeq, currQual, thisPrimerIndex, pDataArray->keepforward);
363 if(thisSuccess > pDataArray->pdiffs) { thisTrashCode += 'f'; }
364 else{ thisCurrentSeqsDiffs += thisSuccess; }
367 if (thisCurrentSeqsDiffs > pDataArray->tdiffs) { thisTrashCode += 't'; }
369 if (thisTrashCode == "") {
370 trashCode = thisTrashCode;
371 success = thisSuccess;
372 currentSeqsDiffs = thisCurrentSeqsDiffs;
373 barcodeIndex = thisBarcodeIndex;
374 primerIndex = thisPrimerIndex;
375 currSeq.reverseComplement();
376 if(pDataArray->qFileName != ""){
377 currQual.flipQScores();
383 if(pDataArray->keepFirst != 0){
384 //success = keepFirstTrim(currSeq, currQual);
386 if(currQual.getName() != ""){
387 currQual.trimQScores(-1, pDataArray->keepFirst);
389 currSeq.trim(pDataArray->keepFirst);
392 if(pDataArray->removeLast != 0){
393 //success = removeLastTrim(currSeq, currQual);
395 int length = currSeq.getNumBases() - pDataArray->removeLast;
398 if(currQual.getName() != ""){
399 currQual.trimQScores(-1, length);
401 currSeq.trim(length);
406 if(!success) { trashCode += 'l'; }
410 if(pDataArray->qFileName != ""){
411 int origLength = currSeq.getNumBases();
413 if(pDataArray->qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, pDataArray->qThreshold); }
414 else if(pDataArray->qAverage != 0) { success = currQual.cullQualAverage(currSeq, pDataArray->qAverage); }
415 else if(pDataArray->qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, pDataArray->qRollAverage); }
416 else if(pDataArray->qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, pDataArray->qWindowStep, pDataArray->qWindowSize, pDataArray->qWindowAverage); }
417 else { success = 1; }
419 //you don't want to trim, if it fails above then scrap it
420 if ((!pDataArray->qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
422 if(!success) { trashCode += 'q'; }
425 if(pDataArray->minLength > 0 || pDataArray->maxLength > 0){
426 //success = cullLength(currSeq);
427 int length = currSeq.getNumBases();
428 success = 0; //guilty until proven innocent
429 if(length >= pDataArray->minLength && pDataArray->maxLength == 0) { success = 1; }
430 else if(length >= pDataArray->minLength && length <= pDataArray->maxLength) { success = 1; }
431 else { success = 0; }
433 if(!success) { trashCode += 'l'; }
435 if(pDataArray->maxHomoP > 0){
436 //success = cullHomoP(currSeq);
437 int longHomoP = currSeq.getLongHomoPolymer();
438 success = 0; //guilty until proven innocent
439 if(longHomoP <= pDataArray->maxHomoP){ success = 1; }
440 else { success = 0; }
442 if(!success) { trashCode += 'h'; }
444 if(pDataArray->maxAmbig != -1){
445 //success = cullAmbigs(currSeq);
446 int numNs = currSeq.getAmbigBases();
447 success = 0; //guilty until proven innocent
448 if(numNs <= pDataArray->maxAmbig) { success = 1; }
449 else { success = 0; }
450 if(!success) { trashCode += 'n'; }
453 if(pDataArray->flip){ // should go last
454 currSeq.reverseComplement();
455 if(pDataArray->qFileName != ""){
456 currQual.flipQScores();
460 if(trashCode.length() == 0){
461 string thisGroup = "";
462 if (pDataArray->createGroup) {
463 if(numBarcodes != 0){
464 thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
465 if (pDataArray->numFPrimers != 0) {
466 if (pDataArray->primerNameVector[primerIndex] != "") {
467 if(thisGroup != "") {
468 thisGroup += "." + pDataArray->primerNameVector[primerIndex];
470 thisGroup = pDataArray->primerNameVector[primerIndex];
477 int pos = thisGroup.find("ignore");
478 if (pos == string::npos) {
480 currSeq.setAligned(currSeq.getUnaligned());
481 currSeq.printSequence(trimFASTAFile);
483 if(pDataArray->qFileName != ""){
484 currQual.printQScores(trimQualFile);
487 if(pDataArray->nameFile != ""){
488 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
489 if (itName != pDataArray->nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
490 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
493 int numRedundants = 0;
494 if (pDataArray->countfile != "") {
495 map<string, int>::iterator itCount = pDataArray->nameCount.find(currSeq.getName());
496 if (itCount != pDataArray->nameCount.end()) {
497 trimCountFile << itCount->first << '\t' << itCount->second << endl;
498 numRedundants = itCount->second-1;
499 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
502 if (pDataArray->createGroup) {
503 if(numBarcodes != 0){
505 if (pDataArray->countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; }
506 else { pDataArray->groupMap[currSeq.getName()] = thisGroup; }
508 if (pDataArray->nameFile != "") {
509 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
510 if (itName != pDataArray->nameMap.end()) {
511 vector<string> thisSeqsNames;
512 pDataArray->m->splitAtChar(itName->second, thisSeqsNames, ',');
513 numRedundants = thisSeqsNames.size()-1; //we already include ourselves below
514 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
515 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
517 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
520 map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
521 if (it == pDataArray->groupCounts.end()) { pDataArray->groupCounts[thisGroup] = 1 + numRedundants; }
522 else { pDataArray->groupCounts[it->first] += (1 + numRedundants); }
527 if(pDataArray->allFiles){
529 pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
530 currSeq.printSequence(output);
533 if(pDataArray->qFileName != ""){
534 pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
535 currQual.printQScores(output);
539 if(pDataArray->nameFile != ""){
540 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
541 if (itName != pDataArray->nameMap.end()) {
542 pDataArray->m->openOutputFileAppend(pDataArray->nameFileNames[barcodeIndex][primerIndex], output);
543 output << itName->first << '\t' << itName->second << endl;
545 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
551 if(pDataArray->nameFile != ""){ //needs to be before the currSeq name is changed
552 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
553 if (itName != pDataArray->nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
554 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
556 if (pDataArray->countfile != "") {
557 map<string, int>::iterator itCount = pDataArray->nameCount.find(currSeq.getName());
558 if (itCount != pDataArray->nameCount.end()) {
559 trimCountFile << itCount->first << '\t' << itCount->second << endl;
560 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
562 currSeq.setName(currSeq.getName() + '|' + trashCode);
563 currSeq.setUnaligned(origSeq);
564 currSeq.setAligned(origSeq);
565 currSeq.printSequence(scrapFASTAFile);
566 if(pDataArray->qFileName != ""){
567 currQual.printQScores(scrapQualFile);
574 if((pDataArray->count) % 1000 == 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
578 if((pDataArray->count) % 1000 != 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
580 if (pDataArray->reorient) { delete rtrimOligos; }
583 trimFASTAFile.close();
584 scrapFASTAFile.close();
585 if (pDataArray->createGroup) { outGroupsFile.close(); }
586 if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
587 if(pDataArray->nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
592 catch(exception& e) {
593 pDataArray->m->errorOut(e, "TrimSeqsCommand", "MyTrimThreadFunction");
600 /**************************************************************************************************/