1 #ifndef TRIMSEQSCOMMAND_H
2 #define TRIMSEQSCOMMAND_H
8 * Created by Pat Schloss on 6/6/09.
9 * Copyright 2009 Patrick D. Schloss. All rights reserved.
14 #include "command.hpp"
15 #include "sequence.hpp"
16 #include "qualityscores.h"
18 #include "trimoligos.h"
21 class TrimSeqsCommand : public Command {
23 TrimSeqsCommand(string);
27 vector<string> setParameters();
28 string getCommandName() { return "trim.seqs"; }
29 string getCommandCategory() { return "Sequence Processing"; }
30 string getHelpString();
31 string getCitation() { return "http://www.mothur.org/wiki/Trim.seqs"; }
32 string getDescription() { return "provides the preprocessing features needed to screen and sort pyrosequences"; }
35 void help() { m->mothurOut(getHelpString()); }
42 unsigned long long start;
43 unsigned long long end;
44 linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
48 bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
49 bool keepFirstTrim(Sequence&, QualityScores&);
50 bool removeLastTrim(Sequence&, QualityScores&);
51 bool cullLength(Sequence&);
52 bool cullHomoP(Sequence&);
53 bool cullAmbigs(Sequence&);
54 string reverseOligo(string);
56 bool abort, createGroup;
57 string fastaFile, oligoFile, qFileName, groupfile, nameFile, outputDir;
59 bool flip, allFiles, qtrim, keepforward;
60 int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts;
61 int qWindowSize, qWindowStep, keepFirst, removeLast;
62 double qRollAverage, qThreshold, qWindowAverage, qAverage;
63 vector<string> revPrimer, outputNames;
64 set<string> filesToRemove;
65 map<string, int> barcodes;
66 map<string, int> rbarcodes;
67 vector<string> groupVector;
68 map<string, int> primers;
69 vector<string> linker;
70 vector<string> spacer;
71 map<string, int> combos;
72 map<string, int> groupToIndex;
73 vector<string> primerNameVector; //needed here?
74 vector<string> barcodeNameVector; //needed here?
75 map<string, int> groupCounts;
76 map<string, string> nameMap;
78 vector<int> processIDS; //processid
79 vector<linePair> lines;
80 vector<linePair> qLines;
82 int driverCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair, linePair);
83 int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >);
84 int setLines(string, string);
87 /**************************************************************************************************/
88 //custom data structure for threads to use.
89 // This is passed by void pointer so it can be any data type
90 // that can be passed using a single void pointer (LPVOID).
92 unsigned long long start, end;
94 string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, groupFileName, nameFile;
95 vector<vector<string> > fastaFileNames;
96 vector<vector<string> > qualFileNames;
97 vector<vector<string> > nameFileNames;
98 unsigned long long lineStart, lineEnd, qlineStart, qlineEnd;
99 bool flip, allFiles, qtrim, keepforward, createGroup;
100 int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
101 int qWindowSize, qWindowStep, keepFirst, removeLast, count;
102 double qRollAverage, qThreshold, qWindowAverage, qAverage;
103 vector<string> revPrimer;
104 map<string, int> barcodes;
105 map<string, int> rbarcodes;
106 map<string, int> primers;
107 vector<string> linker;
108 vector<string> spacer;
109 map<string, int> combos;
110 vector<string> primerNameVector;
111 vector<string> barcodeNameVector;
112 map<string, int> groupCounts;
113 map<string, string> nameMap;
116 trimData(string fn, string qn, string nf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout,
117 int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, map<string, int> rbar, vector<string> revP, vector<string> li, vector<string> spa,
118 vector<string> priNameVector, vector<string> barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
119 int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage,
120 int minL, int maxA, int maxH, int maxL, bool fli, map<string, string> nm) {
127 scrapQFileName = sqn;
129 scrapNFileName = snn;
131 fastaFileNames = ffn;
147 primers = pri; numFPrimers = primers.size();
148 revPrimer = revP; numRPrimers = revPrimer.size();
149 linker = li; numLinkers = linker.size();
150 spacer = spa; numSpacers = spacer.size();
151 primerNameVector = priNameVector;
152 barcodeNameVector = barNameVector;
154 createGroup = cGroup;
158 removeLast = removeL;
159 qWindowStep = WindowStep;
160 qWindowSize = WindowSize;
161 qWindowAverage = WindowAverage;
163 qThreshold = Threshold;
165 qRollAverage = RollAverage;
175 /**************************************************************************************************/
176 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
178 static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
179 trimData* pDataArray;
180 pDataArray = (trimData*)lpParam;
183 ofstream trimFASTAFile;
184 pDataArray->m->openOutputFile(pDataArray->trimFileName, trimFASTAFile);
186 ofstream scrapFASTAFile;
187 pDataArray->m->openOutputFile(pDataArray->scrapFileName, scrapFASTAFile);
189 ofstream trimQualFile;
190 ofstream scrapQualFile;
191 if(pDataArray->qFileName != ""){
192 pDataArray->m->openOutputFile(pDataArray->trimQFileName, trimQualFile);
193 pDataArray->m->openOutputFile(pDataArray->scrapQFileName, scrapQualFile);
196 ofstream trimNameFile;
197 ofstream scrapNameFile;
198 if(pDataArray->nameFile != ""){
199 pDataArray->m->openOutputFile(pDataArray->trimNFileName, trimNameFile);
200 pDataArray->m->openOutputFile(pDataArray->scrapNFileName, scrapNameFile);
204 ofstream outGroupsFile;
205 if (pDataArray->createGroup){ pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile); }
206 if(pDataArray->allFiles){
207 for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
208 for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
209 if (pDataArray->fastaFileNames[i][j] != "") {
211 pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp); temp.close();
212 if(pDataArray->qFileName != ""){
213 pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp); temp.close();
216 if(pDataArray->nameFile != ""){
217 pDataArray->m->openOutputFile(pDataArray->nameFileNames[i][j], temp); temp.close();
225 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
226 if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) {
228 }else { //this accounts for the difference in line endings.
229 inFASTA.seekg(pDataArray->lineStart-1); pDataArray->m->gobble(inFASTA);
233 if(pDataArray->qFileName != "") {
234 pDataArray->m->openInputFile(pDataArray->qFileName, qFile);
235 if ((pDataArray->qlineStart == 0) || (pDataArray->qlineStart == 1)) {
237 }else { //this accounts for the difference in line endings.
238 qFile.seekg(pDataArray->qlineStart-1); pDataArray->m->gobble(qFile);
243 TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->rbarcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
245 pDataArray->count = pDataArray->lineEnd;
246 for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process
248 if (pDataArray->m->control_pressed) {
249 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
250 if (pDataArray->createGroup) { outGroupsFile.close(); }
251 if(pDataArray->qFileName != ""){ qFile.close(); }
256 string trashCode = "";
257 int currentSeqsDiffs = 0;
259 Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
261 QualityScores currQual;
262 if(pDataArray->qFileName != ""){
263 currQual = QualityScores(qFile); pDataArray->m->gobble(qFile);
266 string origSeq = currSeq.getUnaligned();
269 int barcodeIndex = 0;
272 if(pDataArray->numLinkers != 0){
273 success = trimOligos.stripLinker(currSeq, currQual);
274 if(success > pDataArray->ldiffs) { trashCode += 'k'; }
275 else{ currentSeqsDiffs += success; }
278 if(pDataArray->barcodes.size() != 0){
279 success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
280 if(success > pDataArray->bdiffs) { trashCode += 'b'; }
281 else{ currentSeqsDiffs += success; }
284 if(pDataArray->rbarcodes.size() != 0){
285 success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex);
286 if(success > pDataArray->bdiffs) { trashCode += 'b'; }
287 else{ currentSeqsDiffs += success; }
290 if(pDataArray->numSpacers != 0){
291 success = trimOligos.stripSpacer(currSeq, currQual);
292 if(success > pDataArray->sdiffs) { trashCode += 's'; }
293 else{ currentSeqsDiffs += success; }
297 if(pDataArray->numFPrimers != 0){
298 success = trimOligos.stripForward(currSeq, currQual, primerIndex, pDataArray->keepforward);
299 if(success > pDataArray->pdiffs) { trashCode += 'f'; }
300 else{ currentSeqsDiffs += success; }
303 if (currentSeqsDiffs > pDataArray->tdiffs) { trashCode += 't'; }
305 if(pDataArray->numRPrimers != 0){
306 success = trimOligos.stripReverse(currSeq, currQual);
307 if(!success) { trashCode += 'r'; }
310 if(pDataArray->keepFirst != 0){
311 //success = keepFirstTrim(currSeq, currQual);
313 if(currQual.getName() != ""){
314 currQual.trimQScores(-1, pDataArray->keepFirst);
316 currSeq.trim(pDataArray->keepFirst);
319 if(pDataArray->removeLast != 0){
320 //success = removeLastTrim(currSeq, currQual);
322 int length = currSeq.getNumBases() - pDataArray->removeLast;
325 if(currQual.getName() != ""){
326 currQual.trimQScores(-1, length);
328 currSeq.trim(length);
333 if(!success) { trashCode += 'l'; }
337 if(pDataArray->qFileName != ""){
338 int origLength = currSeq.getNumBases();
340 if(pDataArray->qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, pDataArray->qThreshold); }
341 else if(pDataArray->qAverage != 0) { success = currQual.cullQualAverage(currSeq, pDataArray->qAverage); }
342 else if(pDataArray->qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, pDataArray->qRollAverage); }
343 else if(pDataArray->qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, pDataArray->qWindowStep, pDataArray->qWindowSize, pDataArray->qWindowAverage); }
344 else { success = 1; }
346 //you don't want to trim, if it fails above then scrap it
347 if ((!pDataArray->qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
349 if(!success) { trashCode += 'q'; }
352 if(pDataArray->minLength > 0 || pDataArray->maxLength > 0){
353 //success = cullLength(currSeq);
354 int length = currSeq.getNumBases();
355 success = 0; //guilty until proven innocent
356 if(length >= pDataArray->minLength && pDataArray->maxLength == 0) { success = 1; }
357 else if(length >= pDataArray->minLength && length <= pDataArray->maxLength) { success = 1; }
358 else { success = 0; }
360 if(!success) { trashCode += 'l'; }
362 if(pDataArray->maxHomoP > 0){
363 //success = cullHomoP(currSeq);
364 int longHomoP = currSeq.getLongHomoPolymer();
365 success = 0; //guilty until proven innocent
366 if(longHomoP <= pDataArray->maxHomoP){ success = 1; }
367 else { success = 0; }
369 if(!success) { trashCode += 'h'; }
371 if(pDataArray->maxAmbig != -1){
372 //success = cullAmbigs(currSeq);
373 int numNs = currSeq.getAmbigBases();
374 success = 0; //guilty until proven innocent
375 if(numNs <= pDataArray->maxAmbig) { success = 1; }
376 else { success = 0; }
377 if(!success) { trashCode += 'n'; }
380 if(pDataArray->flip){ // should go last
381 currSeq.reverseComplement();
382 if(pDataArray->qFileName != ""){
383 currQual.flipQScores();
387 if(trashCode.length() == 0){
388 currSeq.setAligned(currSeq.getUnaligned());
389 currSeq.printSequence(trimFASTAFile);
391 if(pDataArray->qFileName != ""){
392 currQual.printQScores(trimQualFile);
395 if(pDataArray->nameFile != ""){
396 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
397 if (itName != pDataArray->nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
398 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
401 if (pDataArray->createGroup) {
402 if(pDataArray->barcodes.size() != 0){
403 string thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
404 if (pDataArray->primers.size() != 0) {
405 if (pDataArray->primerNameVector[primerIndex] != "") {
406 if(thisGroup != "") {
407 thisGroup += "." + pDataArray->primerNameVector[primerIndex];
409 thisGroup = pDataArray->primerNameVector[primerIndex];
414 outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
416 if (pDataArray->nameFile != "") {
417 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
418 if (itName != pDataArray->nameMap.end()) {
419 vector<string> thisSeqsNames;
420 pDataArray->m->splitAtChar(itName->second, thisSeqsNames, ',');
421 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
422 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
424 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
427 map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
428 if (it == pDataArray->groupCounts.end()) { pDataArray->groupCounts[thisGroup] = 1; }
429 else { pDataArray->groupCounts[it->first]++; }
434 if(pDataArray->allFiles){
436 pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
437 currSeq.printSequence(output);
440 if(pDataArray->qFileName != ""){
441 pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
442 currQual.printQScores(output);
446 if(pDataArray->nameFile != ""){
447 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
448 if (itName != pDataArray->nameMap.end()) {
449 pDataArray->m->openOutputFileAppend(pDataArray->nameFileNames[barcodeIndex][primerIndex], output);
450 output << itName->first << '\t' << itName->second << endl;
452 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
457 if(pDataArray->nameFile != ""){ //needs to be before the currSeq name is changed
458 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
459 if (itName != pDataArray->nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
460 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
462 currSeq.setName(currSeq.getName() + '|' + trashCode);
463 currSeq.setUnaligned(origSeq);
464 currSeq.setAligned(origSeq);
465 currSeq.printSequence(scrapFASTAFile);
466 if(pDataArray->qFileName != ""){
467 currQual.printQScores(scrapQualFile);
474 if((i) % 1000 == 0){ pDataArray->m->mothurOut(toString(i)); pDataArray->m->mothurOutEndLine(); }
478 if((pDataArray->count) % 1000 != 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
482 trimFASTAFile.close();
483 scrapFASTAFile.close();
484 if (pDataArray->createGroup) { outGroupsFile.close(); }
485 if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
486 if(pDataArray->nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
491 catch(exception& e) {
492 pDataArray->m->errorOut(e, "TrimSeqsCommand", "MyTrimThreadFunction");
499 /**************************************************************************************************/