1 #ifndef TRIMSEQSCOMMAND_H
2 #define TRIMSEQSCOMMAND_H
8 * Created by Pat Schloss on 6/6/09.
9 * Copyright 2009 Patrick D. Schloss. All rights reserved.
14 #include "command.hpp"
15 #include "sequence.hpp"
16 #include "qualityscores.h"
17 #include "trimoligos.h"
18 #include "counttable.h"
21 class TrimSeqsCommand : public Command {
23 TrimSeqsCommand(string);
27 vector<string> setParameters();
28 string getCommandName() { return "trim.seqs"; }
29 string getCommandCategory() { return "Sequence Processing"; }
31 string getHelpString();
32 string getOutputPattern(string);
33 string getCitation() { return "http://www.mothur.org/wiki/Trim.seqs"; }
34 string getDescription() { return "provides the preprocessing features needed to screen and sort pyrosequences"; }
37 void help() { m->mothurOut(getHelpString()); }
41 unsigned long long start;
42 unsigned long long end;
43 linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
47 bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
48 bool keepFirstTrim(Sequence&, QualityScores&);
49 bool removeLastTrim(Sequence&, QualityScores&);
50 bool cullLength(Sequence&);
51 bool cullHomoP(Sequence&);
52 bool cullAmbigs(Sequence&);
53 string reverseOligo(string);
55 bool abort, createGroup;
56 string fastaFile, oligoFile, qFileName, groupfile, nameFile, countfile, outputDir;
58 bool flip, allFiles, qtrim, keepforward;
59 int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts;
60 int qWindowSize, qWindowStep, keepFirst, removeLast;
61 double qRollAverage, qThreshold, qWindowAverage, qAverage;
62 vector<string> revPrimer, outputNames;
63 set<string> filesToRemove;
64 map<string, int> barcodes;
65 vector<string> groupVector;
66 map<string, int> primers;
67 vector<string> linker;
68 vector<string> spacer;
69 map<string, int> combos;
70 map<string, int> groupToIndex;
71 vector<string> primerNameVector; //needed here?
72 vector<string> barcodeNameVector; //needed here?
73 map<string, int> groupCounts;
74 map<string, string> nameMap;
75 map<string, int> nameCount; //for countfile name -> repCount
76 map<string, string> groupMap; //for countfile name -> group
78 vector<int> processIDS; //processid
79 vector<linePair> lines;
80 vector<linePair> qLines;
82 int driverCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair, linePair);
83 int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >);
84 int setLines(string, string);
87 /**************************************************************************************************/
88 //custom data structure for threads to use.
89 // This is passed by void pointer so it can be any data type
90 // that can be passed using a single void pointer (LPVOID).
92 unsigned long long start, end;
94 string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, trimCFileName, scrapCFileName, groupFileName, nameFile, countfile;
95 vector<vector<string> > fastaFileNames;
96 vector<vector<string> > qualFileNames;
97 vector<vector<string> > nameFileNames;
98 unsigned long long lineStart, lineEnd, qlineStart, qlineEnd;
99 bool flip, allFiles, qtrim, keepforward, createGroup;
100 int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
101 int qWindowSize, qWindowStep, keepFirst, removeLast, count;
102 double qRollAverage, qThreshold, qWindowAverage, qAverage;
103 vector<string> revPrimer;
104 map<string, int> barcodes;
105 map<string, int> primers;
106 map<string, int> nameCount;
107 vector<string> linker;
108 vector<string> spacer;
109 map<string, int> combos;
110 vector<string> primerNameVector;
111 vector<string> barcodeNameVector;
112 map<string, int> groupCounts;
113 map<string, string> nameMap;
114 map<string, string> groupMap;
117 trimData(string fn, string qn, string nf, string cf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string tcn, string scn,string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout,
118 int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, vector<string> revP, vector<string> li, vector<string> spa,
119 vector<string> priNameVector, vector<string> barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
120 int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage,
121 int minL, int maxA, int maxH, int maxL, bool fli, map<string, string> nm, map<string, int> ncount) {
129 scrapQFileName = sqn;
131 scrapNFileName = snn;
133 scrapCFileName = scn;
135 fastaFileNames = ffn;
151 primers = pri; numFPrimers = primers.size();
152 revPrimer = revP; numRPrimers = revPrimer.size();
153 linker = li; numLinkers = linker.size();
154 spacer = spa; numSpacers = spacer.size();
155 primerNameVector = priNameVector;
156 barcodeNameVector = barNameVector;
158 createGroup = cGroup;
162 removeLast = removeL;
163 qWindowStep = WindowStep;
164 qWindowSize = WindowSize;
165 qWindowAverage = WindowAverage;
167 qThreshold = Threshold;
169 qRollAverage = RollAverage;
179 /**************************************************************************************************/
180 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
182 static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
183 trimData* pDataArray;
184 pDataArray = (trimData*)lpParam;
187 ofstream trimFASTAFile;
188 pDataArray->m->openOutputFile(pDataArray->trimFileName, trimFASTAFile);
190 ofstream scrapFASTAFile;
191 pDataArray->m->openOutputFile(pDataArray->scrapFileName, scrapFASTAFile);
193 ofstream trimQualFile;
194 ofstream scrapQualFile;
195 if(pDataArray->qFileName != ""){
196 pDataArray->m->openOutputFile(pDataArray->trimQFileName, trimQualFile);
197 pDataArray->m->openOutputFile(pDataArray->scrapQFileName, scrapQualFile);
200 ofstream trimNameFile;
201 ofstream scrapNameFile;
202 if(pDataArray->nameFile != ""){
203 pDataArray->m->openOutputFile(pDataArray->trimNFileName, trimNameFile);
204 pDataArray->m->openOutputFile(pDataArray->scrapNFileName, scrapNameFile);
208 ofstream outGroupsFile;
209 if ((pDataArray->createGroup) && (pDataArray->countfile == "")){ pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile); }
210 if(pDataArray->allFiles){
211 for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
212 for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
213 if (pDataArray->fastaFileNames[i][j] != "") {
215 pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp); temp.close();
216 if(pDataArray->qFileName != ""){
217 pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp); temp.close();
220 if(pDataArray->nameFile != ""){
221 pDataArray->m->openOutputFile(pDataArray->nameFileNames[i][j], temp); temp.close();
228 ofstream trimCountFile;
229 ofstream scrapCountFile;
230 if(pDataArray->countfile != ""){
231 pDataArray->m->openOutputFile(pDataArray->trimCFileName, trimCountFile);
232 pDataArray->m->openOutputFile(pDataArray->scrapCFileName, scrapCountFile);
233 if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) { trimCountFile << "Representative_Sequence\ttotal" << endl; scrapCountFile << "Representative_Sequence\ttotal" << endl; }
237 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
238 if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) {
240 }else { //this accounts for the difference in line endings.
241 inFASTA.seekg(pDataArray->lineStart-1); pDataArray->m->gobble(inFASTA);
245 if(pDataArray->qFileName != "") {
246 pDataArray->m->openInputFile(pDataArray->qFileName, qFile);
247 if ((pDataArray->qlineStart == 0) || (pDataArray->qlineStart == 1)) {
249 }else { //this accounts for the difference in line endings.
250 qFile.seekg(pDataArray->qlineStart-1); pDataArray->m->gobble(qFile);
255 TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
257 pDataArray->count = pDataArray->lineEnd;
258 for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process
260 if (pDataArray->m->control_pressed) {
261 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
262 if ((pDataArray->createGroup) && (pDataArray->countfile == "")) { outGroupsFile.close(); }
263 if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
264 if(pDataArray->nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
265 if(pDataArray->countfile != "") { scrapCountFile.close(); trimCountFile.close(); }
267 if(pDataArray->qFileName != ""){ qFile.close(); }
272 string trashCode = "";
273 int currentSeqsDiffs = 0;
275 Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
277 QualityScores currQual;
278 if(pDataArray->qFileName != ""){
279 currQual = QualityScores(qFile); pDataArray->m->gobble(qFile);
282 string origSeq = currSeq.getUnaligned();
285 int barcodeIndex = 0;
288 if(pDataArray->numLinkers != 0){
289 success = trimOligos.stripLinker(currSeq, currQual);
290 if(success > pDataArray->ldiffs) { trashCode += 'k'; }
291 else{ currentSeqsDiffs += success; }
294 if(pDataArray->barcodes.size() != 0){
295 success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
296 if(success > pDataArray->bdiffs) { trashCode += 'b'; }
297 else{ currentSeqsDiffs += success; }
300 if(pDataArray->numSpacers != 0){
301 success = trimOligos.stripSpacer(currSeq, currQual);
302 if(success > pDataArray->sdiffs) { trashCode += 's'; }
303 else{ currentSeqsDiffs += success; }
307 if(pDataArray->numFPrimers != 0){
308 success = trimOligos.stripForward(currSeq, currQual, primerIndex, pDataArray->keepforward);
309 if(success > pDataArray->pdiffs) { trashCode += 'f'; }
310 else{ currentSeqsDiffs += success; }
313 if (currentSeqsDiffs > pDataArray->tdiffs) { trashCode += 't'; }
315 if(pDataArray->numRPrimers != 0){
316 success = trimOligos.stripReverse(currSeq, currQual);
317 if(!success) { trashCode += 'r'; }
320 if(pDataArray->keepFirst != 0){
321 //success = keepFirstTrim(currSeq, currQual);
323 if(currQual.getName() != ""){
324 currQual.trimQScores(-1, pDataArray->keepFirst);
326 currSeq.trim(pDataArray->keepFirst);
329 if(pDataArray->removeLast != 0){
330 //success = removeLastTrim(currSeq, currQual);
332 int length = currSeq.getNumBases() - pDataArray->removeLast;
335 if(currQual.getName() != ""){
336 currQual.trimQScores(-1, length);
338 currSeq.trim(length);
343 if(!success) { trashCode += 'l'; }
347 if(pDataArray->qFileName != ""){
348 int origLength = currSeq.getNumBases();
350 if(pDataArray->qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, pDataArray->qThreshold); }
351 else if(pDataArray->qAverage != 0) { success = currQual.cullQualAverage(currSeq, pDataArray->qAverage); }
352 else if(pDataArray->qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, pDataArray->qRollAverage); }
353 else if(pDataArray->qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, pDataArray->qWindowStep, pDataArray->qWindowSize, pDataArray->qWindowAverage); }
354 else { success = 1; }
356 //you don't want to trim, if it fails above then scrap it
357 if ((!pDataArray->qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
359 if(!success) { trashCode += 'q'; }
362 if(pDataArray->minLength > 0 || pDataArray->maxLength > 0){
363 //success = cullLength(currSeq);
364 int length = currSeq.getNumBases();
365 success = 0; //guilty until proven innocent
366 if(length >= pDataArray->minLength && pDataArray->maxLength == 0) { success = 1; }
367 else if(length >= pDataArray->minLength && length <= pDataArray->maxLength) { success = 1; }
368 else { success = 0; }
370 if(!success) { trashCode += 'l'; }
372 if(pDataArray->maxHomoP > 0){
373 //success = cullHomoP(currSeq);
374 int longHomoP = currSeq.getLongHomoPolymer();
375 success = 0; //guilty until proven innocent
376 if(longHomoP <= pDataArray->maxHomoP){ success = 1; }
377 else { success = 0; }
379 if(!success) { trashCode += 'h'; }
381 if(pDataArray->maxAmbig != -1){
382 //success = cullAmbigs(currSeq);
383 int numNs = currSeq.getAmbigBases();
384 success = 0; //guilty until proven innocent
385 if(numNs <= pDataArray->maxAmbig) { success = 1; }
386 else { success = 0; }
387 if(!success) { trashCode += 'n'; }
390 if(pDataArray->flip){ // should go last
391 currSeq.reverseComplement();
392 if(pDataArray->qFileName != ""){
393 currQual.flipQScores();
397 if(trashCode.length() == 0){
398 string thisGroup = "";
399 if (pDataArray->createGroup) {
400 if(pDataArray->barcodes.size() != 0){
401 string thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
402 if (pDataArray->primers.size() != 0) {
403 if (pDataArray->primerNameVector[primerIndex] != "") {
404 if(thisGroup != "") {
405 thisGroup += "." + pDataArray->primerNameVector[primerIndex];
407 thisGroup = pDataArray->primerNameVector[primerIndex];
414 int pos = thisGroup.find("ignore");
415 if (pos == string::npos) {
417 currSeq.setAligned(currSeq.getUnaligned());
418 currSeq.printSequence(trimFASTAFile);
420 if(pDataArray->qFileName != ""){
421 currQual.printQScores(trimQualFile);
424 if(pDataArray->nameFile != ""){
425 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
426 if (itName != pDataArray->nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
427 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
430 int numRedundants = 0;
431 if (pDataArray->countfile != "") {
432 map<string, int>::iterator itCount = pDataArray->nameCount.find(currSeq.getName());
433 if (itCount != pDataArray->nameCount.end()) {
434 trimCountFile << itCount->first << '\t' << itCount->second << endl;
435 numRedundants = itCount->second-1;
436 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
439 if (pDataArray->createGroup) {
440 if(pDataArray->barcodes.size() != 0){
442 if (pDataArray->countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; }
443 else { pDataArray->groupMap[currSeq.getName()] = thisGroup; }
445 if (pDataArray->nameFile != "") {
446 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
447 if (itName != pDataArray->nameMap.end()) {
448 vector<string> thisSeqsNames;
449 pDataArray->m->splitAtChar(itName->second, thisSeqsNames, ',');
450 numRedundants = thisSeqsNames.size()-1; //we already include ourselves below
451 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
452 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
454 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
457 map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
458 if (it == pDataArray->groupCounts.end()) { pDataArray->groupCounts[thisGroup] = 1 + numRedundants; }
459 else { pDataArray->groupCounts[it->first] += (1 + numRedundants); }
464 if(pDataArray->allFiles){
466 pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
467 currSeq.printSequence(output);
470 if(pDataArray->qFileName != ""){
471 pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
472 currQual.printQScores(output);
476 if(pDataArray->nameFile != ""){
477 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
478 if (itName != pDataArray->nameMap.end()) {
479 pDataArray->m->openOutputFileAppend(pDataArray->nameFileNames[barcodeIndex][primerIndex], output);
480 output << itName->first << '\t' << itName->second << endl;
482 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
488 if(pDataArray->nameFile != ""){ //needs to be before the currSeq name is changed
489 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
490 if (itName != pDataArray->nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
491 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
493 if (pDataArray->countfile != "") {
494 map<string, int>::iterator itCount = pDataArray->nameCount.find(currSeq.getName());
495 if (itCount != pDataArray->nameCount.end()) {
496 trimCountFile << itCount->first << '\t' << itCount->second << endl;
497 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
499 currSeq.setName(currSeq.getName() + '|' + trashCode);
500 currSeq.setUnaligned(origSeq);
501 currSeq.setAligned(origSeq);
502 currSeq.printSequence(scrapFASTAFile);
503 if(pDataArray->qFileName != ""){
504 currQual.printQScores(scrapQualFile);
511 if((i) % 1000 == 0){ pDataArray->m->mothurOut(toString(i)); pDataArray->m->mothurOutEndLine(); }
515 if((pDataArray->count) % 1000 != 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
519 trimFASTAFile.close();
520 scrapFASTAFile.close();
521 if (pDataArray->createGroup) { outGroupsFile.close(); }
522 if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
523 if(pDataArray->nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
528 catch(exception& e) {
529 pDataArray->m->errorOut(e, "TrimSeqsCommand", "MyTrimThreadFunction");
536 /**************************************************************************************************/