1 #ifndef TRIMSEQSCOMMAND_H
2 #define TRIMSEQSCOMMAND_H
8 * Created by Pat Schloss on 6/6/09.
9 * Copyright 2009 Patrick D. Schloss. All rights reserved.
14 #include "command.hpp"
15 #include "sequence.hpp"
16 #include "qualityscores.h"
17 #include "trimoligos.h"
18 #include "counttable.h"
21 class TrimSeqsCommand : public Command {
23 TrimSeqsCommand(string);
27 vector<string> setParameters();
28 string getCommandName() { return "trim.seqs"; }
29 string getCommandCategory() { return "Sequence Processing"; }
31 string getHelpString();
32 string getOutputPattern(string);
33 string getCitation() { return "http://www.mothur.org/wiki/Trim.seqs"; }
34 string getDescription() { return "provides the preprocessing features needed to screen and sort pyrosequences"; }
37 void help() { m->mothurOut(getHelpString()); }
41 unsigned long long start;
42 unsigned long long end;
43 linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
47 bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
48 bool keepFirstTrim(Sequence&, QualityScores&);
49 bool removeLastTrim(Sequence&, QualityScores&);
50 bool cullLength(Sequence&);
51 bool cullHomoP(Sequence&);
52 bool cullAmbigs(Sequence&);
53 string reverseOligo(string);
55 bool abort, createGroup;
56 string fastaFile, oligoFile, qFileName, groupfile, nameFile, countfile, outputDir;
58 bool flip, allFiles, qtrim, keepforward, pairedOligos, reorient;
59 int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts;
60 int qWindowSize, qWindowStep, keepFirst, removeLast;
61 double qRollAverage, qThreshold, qWindowAverage, qAverage;
62 vector<string> revPrimer, outputNames;
63 set<string> filesToRemove;
64 map<int, oligosPair> pairedBarcodes;
65 map<int, oligosPair> pairedPrimers;
66 map<string, int> barcodes;
67 vector<string> groupVector;
68 map<string, int> primers;
69 vector<string> linker;
70 vector<string> spacer;
71 map<string, int> combos;
72 map<string, int> groupToIndex;
73 vector<string> primerNameVector; //needed here?
74 vector<string> barcodeNameVector; //needed here?
75 map<string, int> groupCounts;
76 map<string, string> nameMap;
77 map<string, int> nameCount; //for countfile name -> repCount
78 map<string, string> groupMap; //for countfile name -> group
80 vector<int> processIDS; //processid
81 vector<linePair> lines;
82 vector<linePair> qLines;
84 int driverCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair, linePair);
85 int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >);
86 int setLines(string, string);
89 /**************************************************************************************************/
90 //custom data structure for threads to use.
91 // This is passed by void pointer so it can be any data type
92 // that can be passed using a single void pointer (LPVOID).
94 unsigned long long start, end;
96 string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, trimCFileName, scrapCFileName, groupFileName, nameFile, countfile;
97 vector<vector<string> > fastaFileNames;
98 vector<vector<string> > qualFileNames;
99 vector<vector<string> > nameFileNames;
100 unsigned long long lineStart, lineEnd, qlineStart, qlineEnd;
101 bool flip, allFiles, qtrim, keepforward, createGroup, pairedOligos, reorient;
102 int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
103 int qWindowSize, qWindowStep, keepFirst, removeLast, count;
104 double qRollAverage, qThreshold, qWindowAverage, qAverage;
105 vector<string> revPrimer;
106 map<string, int> barcodes;
107 map<string, int> primers;
108 map<string, int> nameCount;
109 vector<string> linker;
110 vector<string> spacer;
111 map<string, int> combos;
112 vector<string> primerNameVector;
113 vector<string> barcodeNameVector;
114 map<string, int> groupCounts;
115 map<string, string> nameMap;
116 map<string, string> groupMap;
117 map<int, oligosPair> pairedBarcodes;
118 map<int, oligosPair> pairedPrimers;
121 trimData(string fn, string qn, string nf, string cf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string tcn, string scn,string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout,
122 int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, vector<string> revP, vector<string> li, vector<string> spa, map<int, oligosPair> pbr, map<int, oligosPair> ppr, bool po,
123 vector<string> priNameVector, vector<string> barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
124 int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage,
125 int minL, int maxA, int maxH, int maxL, bool fli, bool reo, map<string, string> nm, map<string, int> ncount) {
133 scrapQFileName = sqn;
135 scrapNFileName = snn;
137 scrapCFileName = scn;
139 fastaFileNames = ffn;
156 pairedBarcodes = pbr;
158 primers = pri; numFPrimers = primers.size();
159 revPrimer = revP; numRPrimers = revPrimer.size();
160 linker = li; numLinkers = linker.size();
161 spacer = spa; numSpacers = spacer.size();
162 primerNameVector = priNameVector;
163 barcodeNameVector = barNameVector;
165 createGroup = cGroup;
169 removeLast = removeL;
170 qWindowStep = WindowStep;
171 qWindowSize = WindowSize;
172 qWindowAverage = WindowAverage;
174 qThreshold = Threshold;
176 qRollAverage = RollAverage;
187 /**************************************************************************************************/
188 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
190 static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
191 trimData* pDataArray;
192 pDataArray = (trimData*)lpParam;
195 ofstream trimFASTAFile;
196 pDataArray->m->openOutputFile(pDataArray->trimFileName, trimFASTAFile);
198 ofstream scrapFASTAFile;
199 pDataArray->m->openOutputFile(pDataArray->scrapFileName, scrapFASTAFile);
201 ofstream trimQualFile;
202 ofstream scrapQualFile;
203 if(pDataArray->qFileName != ""){
204 pDataArray->m->openOutputFile(pDataArray->trimQFileName, trimQualFile);
205 pDataArray->m->openOutputFile(pDataArray->scrapQFileName, scrapQualFile);
208 ofstream trimNameFile;
209 ofstream scrapNameFile;
210 if(pDataArray->nameFile != ""){
211 pDataArray->m->openOutputFile(pDataArray->trimNFileName, trimNameFile);
212 pDataArray->m->openOutputFile(pDataArray->scrapNFileName, scrapNameFile);
216 ofstream outGroupsFile;
217 if ((pDataArray->createGroup) && (pDataArray->countfile == "")){ pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile); }
218 if(pDataArray->allFiles){
219 for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
220 for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
221 if (pDataArray->fastaFileNames[i][j] != "") {
223 pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp); temp.close();
224 if(pDataArray->qFileName != ""){
225 pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp); temp.close();
228 if(pDataArray->nameFile != ""){
229 pDataArray->m->openOutputFile(pDataArray->nameFileNames[i][j], temp); temp.close();
236 ofstream trimCountFile;
237 ofstream scrapCountFile;
238 if(pDataArray->countfile != ""){
239 pDataArray->m->openOutputFile(pDataArray->trimCFileName, trimCountFile);
240 pDataArray->m->openOutputFile(pDataArray->scrapCFileName, scrapCountFile);
241 if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) { trimCountFile << "Representative_Sequence\ttotal" << endl; scrapCountFile << "Representative_Sequence\ttotal" << endl; }
245 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
246 if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) {
248 }else { //this accounts for the difference in line endings.
249 inFASTA.seekg(pDataArray->lineStart-1); pDataArray->m->gobble(inFASTA);
253 if(pDataArray->qFileName != "") {
254 pDataArray->m->openInputFile(pDataArray->qFileName, qFile);
255 if ((pDataArray->qlineStart == 0) || (pDataArray->qlineStart == 1)) {
257 }else { //this accounts for the difference in line endings.
258 qFile.seekg(pDataArray->qlineStart-1); pDataArray->m->gobble(qFile);
262 TrimOligos* trimOligos = NULL;
263 int numBarcodes = pDataArray->barcodes.size();
264 if (pDataArray->pairedOligos) { trimOligos = new TrimOligos(pDataArray->pdiffs, pDataArray->bdiffs, 0, 0, pDataArray->pairedPrimers, pDataArray->pairedBarcodes); numBarcodes = pDataArray->pairedBarcodes.size(); }
265 else { trimOligos = new TrimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer); }
267 TrimOligos* rtrimOligos = NULL;
268 if (pDataArray->reorient) {
269 //create reoriented primer and barcode pairs
270 map<int, oligosPair> rpairedPrimers, rpairedBarcodes;
271 for (map<int, oligosPair>::iterator it = pDataArray->pairedPrimers.begin(); it != pDataArray->pairedPrimers.end(); it++) {
272 oligosPair tempPair(trimOligos->reverseOligo((it->second).reverse), (trimOligos->reverseOligo((it->second).forward))); //reversePrimer, rc ForwardPrimer
273 rpairedPrimers[it->first] = tempPair;
275 for (map<int, oligosPair>::iterator it = pDataArray->pairedBarcodes.begin(); it != pDataArray->pairedBarcodes.end(); it++) {
276 oligosPair tempPair(trimOligos->reverseOligo((it->second).reverse), (trimOligos->reverseOligo((it->second).forward))); //reverseBarcode, rc ForwardBarcode
277 rpairedBarcodes[it->first] = tempPair;
279 rtrimOligos = new TrimOligos(pDataArray->pdiffs, pDataArray->bdiffs, 0, 0, rpairedPrimers, rpairedBarcodes); numBarcodes = rpairedBarcodes.size();
282 pDataArray->count = 0;
283 for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process
285 if (pDataArray->m->control_pressed) {
287 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
288 if ((pDataArray->createGroup) && (pDataArray->countfile == "")) { outGroupsFile.close(); }
289 if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
290 if(pDataArray->nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
291 if(pDataArray->countfile != "") { scrapCountFile.close(); trimCountFile.close(); }
293 if(pDataArray->qFileName != ""){ qFile.close(); }
298 string trashCode = "";
299 int currentSeqsDiffs = 0;
301 Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
303 QualityScores currQual;
304 if(pDataArray->qFileName != ""){
305 currQual = QualityScores(qFile); pDataArray->m->gobble(qFile);
308 string origSeq = currSeq.getUnaligned();
312 int barcodeIndex = 0;
315 if(pDataArray->numLinkers != 0){
316 success = trimOligos->stripLinker(currSeq, currQual);
317 if(success > pDataArray->ldiffs) { trashCode += 'k'; }
318 else{ currentSeqsDiffs += success; }
321 if(numBarcodes != 0){
322 success = trimOligos->stripBarcode(currSeq, currQual, barcodeIndex);
323 if(success > pDataArray->bdiffs) { trashCode += 'b'; }
324 else{ currentSeqsDiffs += success; }
327 if(pDataArray->numSpacers != 0){
328 success = trimOligos->stripSpacer(currSeq, currQual);
329 if(success > pDataArray->sdiffs) { trashCode += 's'; }
330 else{ currentSeqsDiffs += success; }
334 if(pDataArray->numFPrimers != 0){
335 success = trimOligos->stripForward(currSeq, currQual, primerIndex, pDataArray->keepforward);
336 if(success > pDataArray->pdiffs) { trashCode += 'f'; }
337 else{ currentSeqsDiffs += success; }
340 if (currentSeqsDiffs > pDataArray->tdiffs) { trashCode += 't'; }
342 if(pDataArray->numRPrimers != 0){
343 success = trimOligos->stripReverse(currSeq, currQual);
344 if(!success) { trashCode += 'r'; }
347 if (pDataArray->reorient && (trashCode != "")) { //if you failed and want to check the reverse
349 string thisTrashCode = "";
350 int thisCurrentSeqsDiffs = 0;
352 int thisBarcodeIndex = 0;
353 int thisPrimerIndex = 0;
355 if(numBarcodes != 0){
356 thisSuccess = rtrimOligos->stripBarcode(currSeq, currQual, thisBarcodeIndex);
357 if(thisSuccess > pDataArray->bdiffs) { thisTrashCode += 'b'; }
358 else{ thisCurrentSeqsDiffs += thisSuccess; }
361 if(pDataArray->numFPrimers != 0){
362 thisSuccess = rtrimOligos->stripForward(currSeq, currQual, thisPrimerIndex, pDataArray->keepforward);
363 if(thisSuccess > pDataArray->pdiffs) { thisTrashCode += 'f'; }
364 else{ thisCurrentSeqsDiffs += thisSuccess; }
367 if (thisCurrentSeqsDiffs > pDataArray->tdiffs) { thisTrashCode += 't'; }
369 if (thisTrashCode == "") {
370 trashCode = thisTrashCode;
371 success = thisSuccess;
372 currentSeqsDiffs = thisCurrentSeqsDiffs;
373 barcodeIndex = thisBarcodeIndex;
374 primerIndex = thisPrimerIndex;
379 if(pDataArray->keepFirst != 0){
380 //success = keepFirstTrim(currSeq, currQual);
382 if(currQual.getName() != ""){
383 currQual.trimQScores(-1, pDataArray->keepFirst);
385 currSeq.trim(pDataArray->keepFirst);
388 if(pDataArray->removeLast != 0){
389 //success = removeLastTrim(currSeq, currQual);
391 int length = currSeq.getNumBases() - pDataArray->removeLast;
394 if(currQual.getName() != ""){
395 currQual.trimQScores(-1, length);
397 currSeq.trim(length);
402 if(!success) { trashCode += 'l'; }
406 if(pDataArray->qFileName != ""){
407 int origLength = currSeq.getNumBases();
409 if(pDataArray->qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, pDataArray->qThreshold); }
410 else if(pDataArray->qAverage != 0) { success = currQual.cullQualAverage(currSeq, pDataArray->qAverage); }
411 else if(pDataArray->qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, pDataArray->qRollAverage); }
412 else if(pDataArray->qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, pDataArray->qWindowStep, pDataArray->qWindowSize, pDataArray->qWindowAverage); }
413 else { success = 1; }
415 //you don't want to trim, if it fails above then scrap it
416 if ((!pDataArray->qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
418 if(!success) { trashCode += 'q'; }
421 if(pDataArray->minLength > 0 || pDataArray->maxLength > 0){
422 //success = cullLength(currSeq);
423 int length = currSeq.getNumBases();
424 success = 0; //guilty until proven innocent
425 if(length >= pDataArray->minLength && pDataArray->maxLength == 0) { success = 1; }
426 else if(length >= pDataArray->minLength && length <= pDataArray->maxLength) { success = 1; }
427 else { success = 0; }
429 if(!success) { trashCode += 'l'; }
431 if(pDataArray->maxHomoP > 0){
432 //success = cullHomoP(currSeq);
433 int longHomoP = currSeq.getLongHomoPolymer();
434 success = 0; //guilty until proven innocent
435 if(longHomoP <= pDataArray->maxHomoP){ success = 1; }
436 else { success = 0; }
438 if(!success) { trashCode += 'h'; }
440 if(pDataArray->maxAmbig != -1){
441 //success = cullAmbigs(currSeq);
442 int numNs = currSeq.getAmbigBases();
443 success = 0; //guilty until proven innocent
444 if(numNs <= pDataArray->maxAmbig) { success = 1; }
445 else { success = 0; }
446 if(!success) { trashCode += 'n'; }
449 if(pDataArray->flip){ // should go last
450 currSeq.reverseComplement();
451 if(pDataArray->qFileName != ""){
452 currQual.flipQScores();
456 if(trashCode.length() == 0){
457 string thisGroup = "";
458 if (pDataArray->createGroup) {
459 if(numBarcodes != 0){
460 thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
461 if (pDataArray->numFPrimers != 0) {
462 if (pDataArray->primerNameVector[primerIndex] != "") {
463 if(thisGroup != "") {
464 thisGroup += "." + pDataArray->primerNameVector[primerIndex];
466 thisGroup = pDataArray->primerNameVector[primerIndex];
473 int pos = thisGroup.find("ignore");
474 if (pos == string::npos) {
476 currSeq.setAligned(currSeq.getUnaligned());
477 currSeq.printSequence(trimFASTAFile);
479 if(pDataArray->qFileName != ""){
480 currQual.printQScores(trimQualFile);
483 if(pDataArray->nameFile != ""){
484 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
485 if (itName != pDataArray->nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
486 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
489 int numRedundants = 0;
490 if (pDataArray->countfile != "") {
491 map<string, int>::iterator itCount = pDataArray->nameCount.find(currSeq.getName());
492 if (itCount != pDataArray->nameCount.end()) {
493 trimCountFile << itCount->first << '\t' << itCount->second << endl;
494 numRedundants = itCount->second-1;
495 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
498 if (pDataArray->createGroup) {
499 if(numBarcodes != 0){
501 if (pDataArray->countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; }
502 else { pDataArray->groupMap[currSeq.getName()] = thisGroup; }
504 if (pDataArray->nameFile != "") {
505 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
506 if (itName != pDataArray->nameMap.end()) {
507 vector<string> thisSeqsNames;
508 pDataArray->m->splitAtChar(itName->second, thisSeqsNames, ',');
509 numRedundants = thisSeqsNames.size()-1; //we already include ourselves below
510 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
511 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
513 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
516 map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
517 if (it == pDataArray->groupCounts.end()) { pDataArray->groupCounts[thisGroup] = 1 + numRedundants; }
518 else { pDataArray->groupCounts[it->first] += (1 + numRedundants); }
523 if(pDataArray->allFiles){
525 pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
526 currSeq.printSequence(output);
529 if(pDataArray->qFileName != ""){
530 pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
531 currQual.printQScores(output);
535 if(pDataArray->nameFile != ""){
536 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
537 if (itName != pDataArray->nameMap.end()) {
538 pDataArray->m->openOutputFileAppend(pDataArray->nameFileNames[barcodeIndex][primerIndex], output);
539 output << itName->first << '\t' << itName->second << endl;
541 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
547 if(pDataArray->nameFile != ""){ //needs to be before the currSeq name is changed
548 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
549 if (itName != pDataArray->nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
550 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
552 if (pDataArray->countfile != "") {
553 map<string, int>::iterator itCount = pDataArray->nameCount.find(currSeq.getName());
554 if (itCount != pDataArray->nameCount.end()) {
555 trimCountFile << itCount->first << '\t' << itCount->second << endl;
556 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
558 currSeq.setName(currSeq.getName() + '|' + trashCode);
559 currSeq.setUnaligned(origSeq);
560 currSeq.setAligned(origSeq);
561 currSeq.printSequence(scrapFASTAFile);
562 if(pDataArray->qFileName != ""){
563 currQual.printQScores(scrapQualFile);
570 if((i) % 1000 == 0){ pDataArray->m->mothurOut(toString(i)); pDataArray->m->mothurOutEndLine(); }
574 if((pDataArray->count) % 1000 != 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
578 trimFASTAFile.close();
579 scrapFASTAFile.close();
580 if (pDataArray->createGroup) { outGroupsFile.close(); }
581 if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
582 if(pDataArray->nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
587 catch(exception& e) {
588 pDataArray->m->errorOut(e, "TrimSeqsCommand", "MyTrimThreadFunction");
595 /**************************************************************************************************/