1 #ifndef TRIMSEQSCOMMAND_H
2 #define TRIMSEQSCOMMAND_H
8 * Created by Pat Schloss on 6/6/09.
9 * Copyright 2009 Patrick D. Schloss. All rights reserved.
14 #include "command.hpp"
15 #include "sequence.hpp"
16 #include "qualityscores.h"
17 #include "trimoligos.h"
18 #include "counttable.h"
21 class TrimSeqsCommand : public Command {
23 TrimSeqsCommand(string);
27 vector<string> setParameters();
28 string getCommandName() { return "trim.seqs"; }
29 string getCommandCategory() { return "Sequence Processing"; }
31 string getHelpString();
32 string getOutputPattern(string);
33 string getCitation() { return "http://www.mothur.org/wiki/Trim.seqs"; }
34 string getDescription() { return "provides the preprocessing features needed to screen and sort pyrosequences"; }
37 void help() { m->mothurOut(getHelpString()); }
41 unsigned long long start;
42 unsigned long long end;
43 linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
47 bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
48 bool keepFirstTrim(Sequence&, QualityScores&);
49 bool removeLastTrim(Sequence&, QualityScores&);
50 bool cullLength(Sequence&);
51 bool cullHomoP(Sequence&);
52 bool cullAmbigs(Sequence&);
53 string reverseOligo(string);
55 bool abort, createGroup;
56 string fastaFile, oligoFile, qFileName, groupfile, nameFile, countfile, outputDir;
58 bool flip, allFiles, qtrim, keepforward;
59 int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts;
60 int qWindowSize, qWindowStep, keepFirst, removeLast;
61 double qRollAverage, qThreshold, qWindowAverage, qAverage;
62 vector<string> revPrimer, outputNames;
63 set<string> filesToRemove;
64 map<string, int> barcodes;
65 vector<string> groupVector;
66 map<string, int> primers;
67 vector<string> linker;
68 vector<string> spacer;
69 map<string, int> combos;
70 map<string, int> groupToIndex;
71 vector<string> primerNameVector; //needed here?
72 vector<string> barcodeNameVector; //needed here?
73 map<string, int> groupCounts;
74 map<string, string> nameMap;
75 map<string, int> nameCount; //for countfile name -> repCount
76 map<string, string> groupMap; //for countfile name -> group
78 vector<int> processIDS; //processid
79 vector<linePair> lines;
80 vector<linePair> qLines;
82 int driverCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair, linePair);
83 int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >);
84 int setLines(string, string);
87 /**************************************************************************************************/
88 //custom data structure for threads to use.
89 // This is passed by void pointer so it can be any data type
90 // that can be passed using a single void pointer (LPVOID).
92 unsigned long long start, end;
94 string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, trimCFileName, scrapCFileName, groupFileName, nameFile, countfile;
95 vector<vector<string> > fastaFileNames;
96 vector<vector<string> > qualFileNames;
97 vector<vector<string> > nameFileNames;
98 unsigned long long lineStart, lineEnd, qlineStart, qlineEnd;
99 bool flip, allFiles, qtrim, keepforward, createGroup;
100 int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
101 int qWindowSize, qWindowStep, keepFirst, removeLast, count;
102 double qRollAverage, qThreshold, qWindowAverage, qAverage;
103 vector<string> revPrimer;
104 map<string, int> barcodes;
105 map<string, int> primers;
106 map<string, int> nameCount;
107 vector<string> linker;
108 vector<string> spacer;
109 map<string, int> combos;
110 vector<string> primerNameVector;
111 vector<string> barcodeNameVector;
112 map<string, int> groupCounts;
113 map<string, string> nameMap;
114 map<string, string> groupMap;
117 trimData(string fn, string qn, string nf, string cf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string tcn, string scn,string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout,
118 int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, vector<string> revP, vector<string> li, vector<string> spa,
119 vector<string> priNameVector, vector<string> barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
120 int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage,
121 int minL, int maxA, int maxH, int maxL, bool fli, map<string, string> nm, map<string, int> ncount) {
129 scrapQFileName = sqn;
131 scrapNFileName = snn;
133 scrapCFileName = scn;
135 fastaFileNames = ffn;
151 primers = pri; numFPrimers = primers.size();
152 revPrimer = revP; numRPrimers = revPrimer.size();
153 linker = li; numLinkers = linker.size();
154 spacer = spa; numSpacers = spacer.size();
155 primerNameVector = priNameVector;
156 barcodeNameVector = barNameVector;
158 createGroup = cGroup;
162 removeLast = removeL;
163 qWindowStep = WindowStep;
164 qWindowSize = WindowSize;
165 qWindowAverage = WindowAverage;
167 qThreshold = Threshold;
169 qRollAverage = RollAverage;
179 /**************************************************************************************************/
180 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
182 static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
183 trimData* pDataArray;
184 pDataArray = (trimData*)lpParam;
187 ofstream trimFASTAFile;
188 pDataArray->m->openOutputFile(pDataArray->trimFileName, trimFASTAFile);
190 ofstream scrapFASTAFile;
191 pDataArray->m->openOutputFile(pDataArray->scrapFileName, scrapFASTAFile);
193 ofstream trimQualFile;
194 ofstream scrapQualFile;
195 if(pDataArray->qFileName != ""){
196 pDataArray->m->openOutputFile(pDataArray->trimQFileName, trimQualFile);
197 pDataArray->m->openOutputFile(pDataArray->scrapQFileName, scrapQualFile);
200 ofstream trimNameFile;
201 ofstream scrapNameFile;
202 if(pDataArray->nameFile != ""){
203 pDataArray->m->openOutputFile(pDataArray->trimNFileName, trimNameFile);
204 pDataArray->m->openOutputFile(pDataArray->scrapNFileName, scrapNameFile);
208 ofstream outGroupsFile;
209 if ((pDataArray->createGroup) && (pDataArray->countfile == "")){ pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile); }
210 if(pDataArray->allFiles){
211 for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
212 for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
213 if (pDataArray->fastaFileNames[i][j] != "") {
215 pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp); temp.close();
216 if(pDataArray->qFileName != ""){
217 pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp); temp.close();
220 if(pDataArray->nameFile != ""){
221 pDataArray->m->openOutputFile(pDataArray->nameFileNames[i][j], temp); temp.close();
228 ofstream trimCountFile;
229 ofstream scrapCountFile;
230 if(pDataArray->countfile != ""){
231 pDataArray->m->openOutputFile(pDataArray->trimCFileName, trimCountFile);
232 pDataArray->m->openOutputFile(pDataArray->scrapCFileName, scrapCountFile);
233 if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) { trimCountFile << "Representative_Sequence\ttotal" << endl; scrapCountFile << "Representative_Sequence\ttotal" << endl; }
237 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
238 if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) {
240 }else { //this accounts for the difference in line endings.
241 inFASTA.seekg(pDataArray->lineStart-1); pDataArray->m->gobble(inFASTA);
245 if(pDataArray->qFileName != "") {
246 pDataArray->m->openInputFile(pDataArray->qFileName, qFile);
247 if ((pDataArray->qlineStart == 0) || (pDataArray->qlineStart == 1)) {
249 }else { //this accounts for the difference in line endings.
250 qFile.seekg(pDataArray->qlineStart-1); pDataArray->m->gobble(qFile);
255 TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
257 pDataArray->count = pDataArray->lineEnd;
258 for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process
260 if (pDataArray->m->control_pressed) {
261 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
262 if ((pDataArray->createGroup) && (pDataArray->countfile == "")) { outGroupsFile.close(); }
263 if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
264 if(pDataArray->nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
265 if(pDataArray->countfile != "") { scrapCountFile.close(); trimCountFile.close(); }
267 if(pDataArray->qFileName != ""){ qFile.close(); }
272 string trashCode = "";
273 int currentSeqsDiffs = 0;
275 Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
277 QualityScores currQual;
278 if(pDataArray->qFileName != ""){
279 currQual = QualityScores(qFile); pDataArray->m->gobble(qFile);
282 string origSeq = currSeq.getUnaligned();
285 int barcodeIndex = 0;
288 if(pDataArray->numLinkers != 0){
289 success = trimOligos.stripLinker(currSeq, currQual);
290 if(success > pDataArray->ldiffs) { trashCode += 'k'; }
291 else{ currentSeqsDiffs += success; }
294 if(pDataArray->barcodes.size() != 0){
295 success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
296 if(success > pDataArray->bdiffs) { trashCode += 'b'; }
297 else{ currentSeqsDiffs += success; }
300 if(pDataArray->numSpacers != 0){
301 success = trimOligos.stripSpacer(currSeq, currQual);
302 if(success > pDataArray->sdiffs) { trashCode += 's'; }
303 else{ currentSeqsDiffs += success; }
307 if(pDataArray->numFPrimers != 0){
308 success = trimOligos.stripForward(currSeq, currQual, primerIndex, pDataArray->keepforward);
309 if(success > pDataArray->pdiffs) { trashCode += 'f'; }
310 else{ currentSeqsDiffs += success; }
313 if (currentSeqsDiffs > pDataArray->tdiffs) { trashCode += 't'; }
315 if(pDataArray->numRPrimers != 0){
316 success = trimOligos.stripReverse(currSeq, currQual);
317 if(!success) { trashCode += 'r'; }
320 if(pDataArray->keepFirst != 0){
321 //success = keepFirstTrim(currSeq, currQual);
323 if(currQual.getName() != ""){
324 currQual.trimQScores(-1, pDataArray->keepFirst);
326 currSeq.trim(pDataArray->keepFirst);
329 if(pDataArray->removeLast != 0){
330 //success = removeLastTrim(currSeq, currQual);
332 int length = currSeq.getNumBases() - pDataArray->removeLast;
335 if(currQual.getName() != ""){
336 currQual.trimQScores(-1, length);
338 currSeq.trim(length);
343 if(!success) { trashCode += 'l'; }
347 if(pDataArray->qFileName != ""){
348 int origLength = currSeq.getNumBases();
350 if(pDataArray->qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, pDataArray->qThreshold); }
351 else if(pDataArray->qAverage != 0) { success = currQual.cullQualAverage(currSeq, pDataArray->qAverage); }
352 else if(pDataArray->qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, pDataArray->qRollAverage); }
353 else if(pDataArray->qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, pDataArray->qWindowStep, pDataArray->qWindowSize, pDataArray->qWindowAverage); }
354 else { success = 1; }
356 //you don't want to trim, if it fails above then scrap it
357 if ((!pDataArray->qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
359 if(!success) { trashCode += 'q'; }
362 if(pDataArray->minLength > 0 || pDataArray->maxLength > 0){
363 //success = cullLength(currSeq);
364 int length = currSeq.getNumBases();
365 success = 0; //guilty until proven innocent
366 if(length >= pDataArray->minLength && pDataArray->maxLength == 0) { success = 1; }
367 else if(length >= pDataArray->minLength && length <= pDataArray->maxLength) { success = 1; }
368 else { success = 0; }
370 if(!success) { trashCode += 'l'; }
372 if(pDataArray->maxHomoP > 0){
373 //success = cullHomoP(currSeq);
374 int longHomoP = currSeq.getLongHomoPolymer();
375 success = 0; //guilty until proven innocent
376 if(longHomoP <= pDataArray->maxHomoP){ success = 1; }
377 else { success = 0; }
379 if(!success) { trashCode += 'h'; }
381 if(pDataArray->maxAmbig != -1){
382 //success = cullAmbigs(currSeq);
383 int numNs = currSeq.getAmbigBases();
384 success = 0; //guilty until proven innocent
385 if(numNs <= pDataArray->maxAmbig) { success = 1; }
386 else { success = 0; }
387 if(!success) { trashCode += 'n'; }
390 if(pDataArray->flip){ // should go last
391 currSeq.reverseComplement();
392 if(pDataArray->qFileName != ""){
393 currQual.flipQScores();
397 if(trashCode.length() == 0){
398 currSeq.setAligned(currSeq.getUnaligned());
399 currSeq.printSequence(trimFASTAFile);
401 if(pDataArray->qFileName != ""){
402 currQual.printQScores(trimQualFile);
405 if(pDataArray->nameFile != ""){
406 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
407 if (itName != pDataArray->nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
408 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
411 int numRedundants = 0;
412 if (pDataArray->countfile != "") {
413 map<string, int>::iterator itCount = pDataArray->nameCount.find(currSeq.getName());
414 if (itCount != pDataArray->nameCount.end()) {
415 trimCountFile << itCount->first << '\t' << itCount->second << endl;
416 numRedundants = itCount->second-1;
417 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
420 if (pDataArray->createGroup) {
421 if(pDataArray->barcodes.size() != 0){
422 string thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
423 if (pDataArray->primers.size() != 0) {
424 if (pDataArray->primerNameVector[primerIndex] != "") {
425 if(thisGroup != "") {
426 thisGroup += "." + pDataArray->primerNameVector[primerIndex];
428 thisGroup = pDataArray->primerNameVector[primerIndex];
433 if (pDataArray->countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; }
434 else { pDataArray->groupMap[currSeq.getName()] = thisGroup; }
436 if (pDataArray->nameFile != "") {
437 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
438 if (itName != pDataArray->nameMap.end()) {
439 vector<string> thisSeqsNames;
440 pDataArray->m->splitAtChar(itName->second, thisSeqsNames, ',');
441 numRedundants = thisSeqsNames.size()-1; //we already include ourselves below
442 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
443 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
445 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
448 map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
449 if (it == pDataArray->groupCounts.end()) { pDataArray->groupCounts[thisGroup] = 1 + numRedundants; }
450 else { pDataArray->groupCounts[it->first] += (1 + numRedundants); }
455 if(pDataArray->allFiles){
457 pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
458 currSeq.printSequence(output);
461 if(pDataArray->qFileName != ""){
462 pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
463 currQual.printQScores(output);
467 if(pDataArray->nameFile != ""){
468 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
469 if (itName != pDataArray->nameMap.end()) {
470 pDataArray->m->openOutputFileAppend(pDataArray->nameFileNames[barcodeIndex][primerIndex], output);
471 output << itName->first << '\t' << itName->second << endl;
473 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
478 if(pDataArray->nameFile != ""){ //needs to be before the currSeq name is changed
479 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
480 if (itName != pDataArray->nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
481 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
483 if (pDataArray->countfile != "") {
484 map<string, int>::iterator itCount = pDataArray->nameCount.find(currSeq.getName());
485 if (itCount != pDataArray->nameCount.end()) {
486 trimCountFile << itCount->first << '\t' << itCount->second << endl;
487 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
489 currSeq.setName(currSeq.getName() + '|' + trashCode);
490 currSeq.setUnaligned(origSeq);
491 currSeq.setAligned(origSeq);
492 currSeq.printSequence(scrapFASTAFile);
493 if(pDataArray->qFileName != ""){
494 currQual.printQScores(scrapQualFile);
501 if((i) % 1000 == 0){ pDataArray->m->mothurOut(toString(i)); pDataArray->m->mothurOutEndLine(); }
505 if((pDataArray->count) % 1000 != 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
509 trimFASTAFile.close();
510 scrapFASTAFile.close();
511 if (pDataArray->createGroup) { outGroupsFile.close(); }
512 if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
513 if(pDataArray->nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
518 catch(exception& e) {
519 pDataArray->m->errorOut(e, "TrimSeqsCommand", "MyTrimThreadFunction");
526 /**************************************************************************************************/