1 #ifndef TRIMSEQSCOMMAND_H
2 #define TRIMSEQSCOMMAND_H
8 * Created by Pat Schloss on 6/6/09.
9 * Copyright 2009 Patrick D. Schloss. All rights reserved.
14 #include "command.hpp"
15 #include "sequence.hpp"
16 #include "qualityscores.h"
18 #include "trimoligos.h"
21 class TrimSeqsCommand : public Command {
23 TrimSeqsCommand(string);
27 vector<string> setParameters();
28 string getCommandName() { return "trim.seqs"; }
29 string getCommandCategory() { return "Sequence Processing"; }
30 string getOutputFileNameTag(string, string);
31 string getHelpString();
32 string getCitation() { return "http://www.mothur.org/wiki/Trim.seqs"; }
33 string getDescription() { return "provides the preprocessing features needed to screen and sort pyrosequences"; }
36 void help() { m->mothurOut(getHelpString()); }
43 unsigned long long start;
44 unsigned long long end;
45 linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
49 bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
50 bool keepFirstTrim(Sequence&, QualityScores&);
51 bool removeLastTrim(Sequence&, QualityScores&);
52 bool cullLength(Sequence&);
53 bool cullHomoP(Sequence&);
54 bool cullAmbigs(Sequence&);
55 string reverseOligo(string);
57 bool abort, createGroup;
58 string fastaFile, oligoFile, qFileName, groupfile, nameFile, outputDir;
60 bool flip, allFiles, qtrim, keepforward;
61 int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts;
62 int qWindowSize, qWindowStep, keepFirst, removeLast;
63 double qRollAverage, qThreshold, qWindowAverage, qAverage;
64 vector<string> revPrimer, outputNames;
65 set<string> filesToRemove;
66 map<string, int> barcodes;
67 map<string, int> rbarcodes;
68 vector<string> groupVector;
69 map<string, int> primers;
70 vector<string> linker;
71 vector<string> spacer;
72 map<string, int> combos;
73 map<string, int> groupToIndex;
74 vector<string> primerNameVector; //needed here?
75 vector<string> barcodeNameVector; //needed here?
76 map<string, int> groupCounts;
77 map<string, string> nameMap;
79 vector<int> processIDS; //processid
80 vector<linePair> lines;
81 vector<linePair> qLines;
83 int driverCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair, linePair);
84 int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >);
85 int setLines(string, string);
88 /**************************************************************************************************/
89 //custom data structure for threads to use.
90 // This is passed by void pointer so it can be any data type
91 // that can be passed using a single void pointer (LPVOID).
93 unsigned long long start, end;
95 string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, groupFileName, nameFile;
96 vector<vector<string> > fastaFileNames;
97 vector<vector<string> > qualFileNames;
98 vector<vector<string> > nameFileNames;
99 unsigned long long lineStart, lineEnd, qlineStart, qlineEnd;
100 bool flip, allFiles, qtrim, keepforward, createGroup;
101 int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
102 int qWindowSize, qWindowStep, keepFirst, removeLast, count;
103 double qRollAverage, qThreshold, qWindowAverage, qAverage;
104 vector<string> revPrimer;
105 map<string, int> barcodes;
106 map<string, int> rbarcodes;
107 map<string, int> primers;
108 vector<string> linker;
109 vector<string> spacer;
110 map<string, int> combos;
111 vector<string> primerNameVector;
112 vector<string> barcodeNameVector;
113 map<string, int> groupCounts;
114 map<string, string> nameMap;
117 trimData(string fn, string qn, string nf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout,
118 int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, map<string, int> rbar, vector<string> revP, vector<string> li, vector<string> spa,
119 vector<string> priNameVector, vector<string> barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
120 int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage,
121 int minL, int maxA, int maxH, int maxL, bool fli, map<string, string> nm) {
128 scrapQFileName = sqn;
130 scrapNFileName = snn;
132 fastaFileNames = ffn;
148 primers = pri; numFPrimers = primers.size();
149 revPrimer = revP; numRPrimers = revPrimer.size();
150 linker = li; numLinkers = linker.size();
151 spacer = spa; numSpacers = spacer.size();
152 primerNameVector = priNameVector;
153 barcodeNameVector = barNameVector;
155 createGroup = cGroup;
159 removeLast = removeL;
160 qWindowStep = WindowStep;
161 qWindowSize = WindowSize;
162 qWindowAverage = WindowAverage;
164 qThreshold = Threshold;
166 qRollAverage = RollAverage;
176 /**************************************************************************************************/
177 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
179 static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
180 trimData* pDataArray;
181 pDataArray = (trimData*)lpParam;
184 ofstream trimFASTAFile;
185 pDataArray->m->openOutputFile(pDataArray->trimFileName, trimFASTAFile);
187 ofstream scrapFASTAFile;
188 pDataArray->m->openOutputFile(pDataArray->scrapFileName, scrapFASTAFile);
190 ofstream trimQualFile;
191 ofstream scrapQualFile;
192 if(pDataArray->qFileName != ""){
193 pDataArray->m->openOutputFile(pDataArray->trimQFileName, trimQualFile);
194 pDataArray->m->openOutputFile(pDataArray->scrapQFileName, scrapQualFile);
197 ofstream trimNameFile;
198 ofstream scrapNameFile;
199 if(pDataArray->nameFile != ""){
200 pDataArray->m->openOutputFile(pDataArray->trimNFileName, trimNameFile);
201 pDataArray->m->openOutputFile(pDataArray->scrapNFileName, scrapNameFile);
205 ofstream outGroupsFile;
206 if (pDataArray->createGroup){ pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile); }
207 if(pDataArray->allFiles){
208 for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
209 for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
210 if (pDataArray->fastaFileNames[i][j] != "") {
212 pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp); temp.close();
213 if(pDataArray->qFileName != ""){
214 pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp); temp.close();
217 if(pDataArray->nameFile != ""){
218 pDataArray->m->openOutputFile(pDataArray->nameFileNames[i][j], temp); temp.close();
226 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
227 if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) {
229 }else { //this accounts for the difference in line endings.
230 inFASTA.seekg(pDataArray->lineStart-1); pDataArray->m->gobble(inFASTA);
234 if(pDataArray->qFileName != "") {
235 pDataArray->m->openInputFile(pDataArray->qFileName, qFile);
236 if ((pDataArray->qlineStart == 0) || (pDataArray->qlineStart == 1)) {
238 }else { //this accounts for the difference in line endings.
239 qFile.seekg(pDataArray->qlineStart-1); pDataArray->m->gobble(qFile);
244 TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->rbarcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
246 pDataArray->count = pDataArray->lineEnd;
247 for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process
249 if (pDataArray->m->control_pressed) {
250 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
251 if (pDataArray->createGroup) { outGroupsFile.close(); }
252 if(pDataArray->qFileName != ""){ qFile.close(); }
257 string trashCode = "";
258 int currentSeqsDiffs = 0;
260 Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
262 QualityScores currQual;
263 if(pDataArray->qFileName != ""){
264 currQual = QualityScores(qFile); pDataArray->m->gobble(qFile);
267 string origSeq = currSeq.getUnaligned();
270 int barcodeIndex = 0;
273 if(pDataArray->numLinkers != 0){
274 success = trimOligos.stripLinker(currSeq, currQual);
275 if(success > pDataArray->ldiffs) { trashCode += 'k'; }
276 else{ currentSeqsDiffs += success; }
279 if(pDataArray->barcodes.size() != 0){
280 success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
281 if(success > pDataArray->bdiffs) { trashCode += 'b'; }
282 else{ currentSeqsDiffs += success; }
285 if(pDataArray->rbarcodes.size() != 0){
286 success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex);
287 if(success > pDataArray->bdiffs) { trashCode += 'b'; }
288 else{ currentSeqsDiffs += success; }
291 if(pDataArray->numSpacers != 0){
292 success = trimOligos.stripSpacer(currSeq, currQual);
293 if(success > pDataArray->sdiffs) { trashCode += 's'; }
294 else{ currentSeqsDiffs += success; }
298 if(pDataArray->numFPrimers != 0){
299 success = trimOligos.stripForward(currSeq, currQual, primerIndex, pDataArray->keepforward);
300 if(success > pDataArray->pdiffs) { trashCode += 'f'; }
301 else{ currentSeqsDiffs += success; }
304 if (currentSeqsDiffs > pDataArray->tdiffs) { trashCode += 't'; }
306 if(pDataArray->numRPrimers != 0){
307 success = trimOligos.stripReverse(currSeq, currQual);
308 if(!success) { trashCode += 'r'; }
311 if(pDataArray->keepFirst != 0){
312 //success = keepFirstTrim(currSeq, currQual);
314 if(currQual.getName() != ""){
315 currQual.trimQScores(-1, pDataArray->keepFirst);
317 currSeq.trim(pDataArray->keepFirst);
320 if(pDataArray->removeLast != 0){
321 //success = removeLastTrim(currSeq, currQual);
323 int length = currSeq.getNumBases() - pDataArray->removeLast;
326 if(currQual.getName() != ""){
327 currQual.trimQScores(-1, length);
329 currSeq.trim(length);
334 if(!success) { trashCode += 'l'; }
338 if(pDataArray->qFileName != ""){
339 int origLength = currSeq.getNumBases();
341 if(pDataArray->qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, pDataArray->qThreshold); }
342 else if(pDataArray->qAverage != 0) { success = currQual.cullQualAverage(currSeq, pDataArray->qAverage); }
343 else if(pDataArray->qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, pDataArray->qRollAverage); }
344 else if(pDataArray->qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, pDataArray->qWindowStep, pDataArray->qWindowSize, pDataArray->qWindowAverage); }
345 else { success = 1; }
347 //you don't want to trim, if it fails above then scrap it
348 if ((!pDataArray->qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
350 if(!success) { trashCode += 'q'; }
353 if(pDataArray->minLength > 0 || pDataArray->maxLength > 0){
354 //success = cullLength(currSeq);
355 int length = currSeq.getNumBases();
356 success = 0; //guilty until proven innocent
357 if(length >= pDataArray->minLength && pDataArray->maxLength == 0) { success = 1; }
358 else if(length >= pDataArray->minLength && length <= pDataArray->maxLength) { success = 1; }
359 else { success = 0; }
361 if(!success) { trashCode += 'l'; }
363 if(pDataArray->maxHomoP > 0){
364 //success = cullHomoP(currSeq);
365 int longHomoP = currSeq.getLongHomoPolymer();
366 success = 0; //guilty until proven innocent
367 if(longHomoP <= pDataArray->maxHomoP){ success = 1; }
368 else { success = 0; }
370 if(!success) { trashCode += 'h'; }
372 if(pDataArray->maxAmbig != -1){
373 //success = cullAmbigs(currSeq);
374 int numNs = currSeq.getAmbigBases();
375 success = 0; //guilty until proven innocent
376 if(numNs <= pDataArray->maxAmbig) { success = 1; }
377 else { success = 0; }
378 if(!success) { trashCode += 'n'; }
381 if(pDataArray->flip){ // should go last
382 currSeq.reverseComplement();
383 if(pDataArray->qFileName != ""){
384 currQual.flipQScores();
388 if(trashCode.length() == 0){
389 currSeq.setAligned(currSeq.getUnaligned());
390 currSeq.printSequence(trimFASTAFile);
392 if(pDataArray->qFileName != ""){
393 currQual.printQScores(trimQualFile);
396 if(pDataArray->nameFile != ""){
397 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
398 if (itName != pDataArray->nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
399 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
402 if (pDataArray->createGroup) {
403 if(pDataArray->barcodes.size() != 0){
404 string thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
405 if (pDataArray->primers.size() != 0) {
406 if (pDataArray->primerNameVector[primerIndex] != "") {
407 if(thisGroup != "") {
408 thisGroup += "." + pDataArray->primerNameVector[primerIndex];
410 thisGroup = pDataArray->primerNameVector[primerIndex];
415 outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
417 int numRedundants = 0;
418 if (pDataArray->nameFile != "") {
419 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
420 if (itName != pDataArray->nameMap.end()) {
421 vector<string> thisSeqsNames;
422 pDataArray->m->splitAtChar(itName->second, thisSeqsNames, ',');
423 numRedundants = thisSeqsNames.size()-1; //we already include ourselves below
424 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
425 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
427 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
430 map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
431 if (it == pDataArray->groupCounts.end()) { pDataArray->groupCounts[thisGroup] = 1 + numRedundants; }
432 else { pDataArray->groupCounts[it->first] += (1 + numRedundants); }
437 if(pDataArray->allFiles){
439 pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
440 currSeq.printSequence(output);
443 if(pDataArray->qFileName != ""){
444 pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
445 currQual.printQScores(output);
449 if(pDataArray->nameFile != ""){
450 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
451 if (itName != pDataArray->nameMap.end()) {
452 pDataArray->m->openOutputFileAppend(pDataArray->nameFileNames[barcodeIndex][primerIndex], output);
453 output << itName->first << '\t' << itName->second << endl;
455 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
460 if(pDataArray->nameFile != ""){ //needs to be before the currSeq name is changed
461 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
462 if (itName != pDataArray->nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
463 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
465 currSeq.setName(currSeq.getName() + '|' + trashCode);
466 currSeq.setUnaligned(origSeq);
467 currSeq.setAligned(origSeq);
468 currSeq.printSequence(scrapFASTAFile);
469 if(pDataArray->qFileName != ""){
470 currQual.printQScores(scrapQualFile);
477 if((i) % 1000 == 0){ pDataArray->m->mothurOut(toString(i)); pDataArray->m->mothurOutEndLine(); }
481 if((pDataArray->count) % 1000 != 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
485 trimFASTAFile.close();
486 scrapFASTAFile.close();
487 if (pDataArray->createGroup) { outGroupsFile.close(); }
488 if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
489 if(pDataArray->nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
494 catch(exception& e) {
495 pDataArray->m->errorOut(e, "TrimSeqsCommand", "MyTrimThreadFunction");
502 /**************************************************************************************************/