1 #ifndef TRIMSEQSCOMMAND_H
2 #define TRIMSEQSCOMMAND_H
8 * Created by Pat Schloss on 6/6/09.
9 * Copyright 2009 Patrick D. Schloss. All rights reserved.
14 #include "command.hpp"
15 #include "sequence.hpp"
16 #include "qualityscores.h"
17 #include "trimoligos.h"
18 #include "counttable.h"
21 class TrimSeqsCommand : public Command {
23 TrimSeqsCommand(string);
27 vector<string> setParameters();
28 string getCommandName() { return "trim.seqs"; }
29 string getCommandCategory() { return "Sequence Processing"; }
31 string getHelpString();
32 string getOutputPattern(string);
33 string getCitation() { return "http://www.mothur.org/wiki/Trim.seqs"; }
34 string getDescription() { return "provides the preprocessing features needed to screen and sort pyrosequences"; }
37 void help() { m->mothurOut(getHelpString()); }
41 unsigned long long start;
42 unsigned long long end;
43 linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
47 bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
48 bool keepFirstTrim(Sequence&, QualityScores&);
49 bool removeLastTrim(Sequence&, QualityScores&);
50 bool cullLength(Sequence&);
51 bool cullHomoP(Sequence&);
52 bool cullAmbigs(Sequence&);
53 string reverseOligo(string);
55 bool abort, createGroup;
56 string fastaFile, oligoFile, qFileName, groupfile, nameFile, countfile, outputDir;
58 bool flip, allFiles, qtrim, keepforward;
59 int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts;
60 int qWindowSize, qWindowStep, keepFirst, removeLast;
61 double qRollAverage, qThreshold, qWindowAverage, qAverage;
62 vector<string> revPrimer, outputNames;
63 set<string> filesToRemove;
64 map<string, int> barcodes;
65 vector<string> groupVector;
66 map<string, int> primers;
67 vector<string> linker;
68 vector<string> spacer;
69 map<string, int> combos;
70 map<string, int> groupToIndex;
71 vector<string> primerNameVector; //needed here?
72 vector<string> barcodeNameVector; //needed here?
73 map<string, int> groupCounts;
74 map<string, string> nameMap;
75 map<string, int> nameCount; //for countfile name -> repCount
76 map<string, string> groupMap; //for countfile name -> group
78 vector<int> processIDS; //processid
79 vector<linePair> lines;
80 vector<linePair> qLines;
82 int driverCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair, linePair);
83 int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >);
84 int setLines(string, string);
87 /**************************************************************************************************/
88 //custom data structure for threads to use.
89 // This is passed by void pointer so it can be any data type
90 // that can be passed using a single void pointer (LPVOID).
92 unsigned long long start, end;
94 string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, trimCFileName, scrapCFileName, groupFileName, nameFile, countfile;
95 vector<vector<string> > fastaFileNames;
96 vector<vector<string> > qualFileNames;
97 vector<vector<string> > nameFileNames;
98 unsigned long long lineStart, lineEnd, qlineStart, qlineEnd;
99 bool flip, allFiles, qtrim, keepforward, createGroup;
100 int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
101 int qWindowSize, qWindowStep, keepFirst, removeLast, count;
102 double qRollAverage, qThreshold, qWindowAverage, qAverage;
103 vector<string> revPrimer;
104 map<string, int> barcodes;
105 map<string, int> primers;
106 map<string, int> nameCount;
107 vector<string> linker;
108 vector<string> spacer;
109 map<string, int> combos;
110 vector<string> primerNameVector;
111 vector<string> barcodeNameVector;
112 map<string, int> groupCounts;
113 map<string, string> nameMap;
114 map<string, string> groupMap;
117 trimData(string fn, string qn, string nf, string cf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string tcn, string scn,string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout,
118 int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, vector<string> revP, vector<string> li, vector<string> spa,
119 vector<string> priNameVector, vector<string> barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
120 int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage,
121 int minL, int maxA, int maxH, int maxL, bool fli, map<string, string> nm, map<string, int> ncount) {
129 scrapQFileName = sqn;
131 scrapNFileName = snn;
133 scrapCFileName = scn;
135 fastaFileNames = ffn;
151 primers = pri; numFPrimers = primers.size();
152 revPrimer = revP; numRPrimers = revPrimer.size();
153 linker = li; numLinkers = linker.size();
154 spacer = spa; numSpacers = spacer.size();
155 primerNameVector = priNameVector;
156 barcodeNameVector = barNameVector;
158 createGroup = cGroup;
162 removeLast = removeL;
163 qWindowStep = WindowStep;
164 qWindowSize = WindowSize;
165 qWindowAverage = WindowAverage;
167 qThreshold = Threshold;
169 qRollAverage = RollAverage;
179 /**************************************************************************************************/
180 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
182 static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
183 trimData* pDataArray;
184 pDataArray = (trimData*)lpParam;
187 ofstream trimFASTAFile;
188 pDataArray->m->openOutputFile(pDataArray->trimFileName, trimFASTAFile);
190 ofstream scrapFASTAFile;
191 pDataArray->m->openOutputFile(pDataArray->scrapFileName, scrapFASTAFile);
193 ofstream trimQualFile;
194 ofstream scrapQualFile;
195 if(pDataArray->qFileName != ""){
196 pDataArray->m->openOutputFile(pDataArray->trimQFileName, trimQualFile);
197 pDataArray->m->openOutputFile(pDataArray->scrapQFileName, scrapQualFile);
200 ofstream trimNameFile;
201 ofstream scrapNameFile;
202 if(pDataArray->nameFile != ""){
203 pDataArray->m->openOutputFile(pDataArray->trimNFileName, trimNameFile);
204 pDataArray->m->openOutputFile(pDataArray->scrapNFileName, scrapNameFile);
208 ofstream outGroupsFile;
209 if ((pDataArray->createGroup) && (pDataArray->countfile == "")){ pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile); }
210 if(pDataArray->allFiles){
211 for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
212 for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
213 if (pDataArray->fastaFileNames[i][j] != "") {
215 pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp); temp.close();
216 if(pDataArray->qFileName != ""){
217 pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp); temp.close();
220 if(pDataArray->nameFile != ""){
221 pDataArray->m->openOutputFile(pDataArray->nameFileNames[i][j], temp); temp.close();
228 ofstream trimCountFile;
229 ofstream scrapCountFile;
230 if(pDataArray->countfile != ""){
231 pDataArray->m->openOutputFile(pDataArray->trimCFileName, trimCountFile);
232 pDataArray->m->openOutputFile(pDataArray->scrapCFileName, scrapCountFile);
233 if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) { trimCountFile << "Representative_Sequence\ttotal" << endl; scrapCountFile << "Representative_Sequence\ttotal" << endl; }
237 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
238 if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) {
240 }else { //this accounts for the difference in line endings.
241 inFASTA.seekg(pDataArray->lineStart-1); pDataArray->m->gobble(inFASTA);
245 if(pDataArray->qFileName != "") {
246 pDataArray->m->openInputFile(pDataArray->qFileName, qFile);
247 if ((pDataArray->qlineStart == 0) || (pDataArray->qlineStart == 1)) {
249 }else { //this accounts for the difference in line endings.
250 qFile.seekg(pDataArray->qlineStart-1); pDataArray->m->gobble(qFile);
255 TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
257 pDataArray->count = 0;
258 for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process
260 if (pDataArray->m->control_pressed) {
261 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
262 if ((pDataArray->createGroup) && (pDataArray->countfile == "")) { outGroupsFile.close(); }
263 if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
264 if(pDataArray->nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
265 if(pDataArray->countfile != "") { scrapCountFile.close(); trimCountFile.close(); }
267 if(pDataArray->qFileName != ""){ qFile.close(); }
272 string trashCode = "";
273 int currentSeqsDiffs = 0;
275 Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
277 QualityScores currQual;
278 if(pDataArray->qFileName != ""){
279 currQual = QualityScores(qFile); pDataArray->m->gobble(qFile);
282 string origSeq = currSeq.getUnaligned();
286 int barcodeIndex = 0;
289 if(pDataArray->numLinkers != 0){
290 success = trimOligos.stripLinker(currSeq, currQual);
291 if(success > pDataArray->ldiffs) { trashCode += 'k'; }
292 else{ currentSeqsDiffs += success; }
295 if(pDataArray->barcodes.size() != 0){
296 success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
297 if(success > pDataArray->bdiffs) { trashCode += 'b'; }
298 else{ currentSeqsDiffs += success; }
301 if(pDataArray->numSpacers != 0){
302 success = trimOligos.stripSpacer(currSeq, currQual);
303 if(success > pDataArray->sdiffs) { trashCode += 's'; }
304 else{ currentSeqsDiffs += success; }
308 if(pDataArray->numFPrimers != 0){
309 success = trimOligos.stripForward(currSeq, currQual, primerIndex, pDataArray->keepforward);
310 if(success > pDataArray->pdiffs) { trashCode += 'f'; }
311 else{ currentSeqsDiffs += success; }
314 if (currentSeqsDiffs > pDataArray->tdiffs) { trashCode += 't'; }
316 if(pDataArray->numRPrimers != 0){
317 success = trimOligos.stripReverse(currSeq, currQual);
318 if(!success) { trashCode += 'r'; }
321 if(pDataArray->keepFirst != 0){
322 //success = keepFirstTrim(currSeq, currQual);
324 if(currQual.getName() != ""){
325 currQual.trimQScores(-1, pDataArray->keepFirst);
327 currSeq.trim(pDataArray->keepFirst);
330 if(pDataArray->removeLast != 0){
331 //success = removeLastTrim(currSeq, currQual);
333 int length = currSeq.getNumBases() - pDataArray->removeLast;
336 if(currQual.getName() != ""){
337 currQual.trimQScores(-1, length);
339 currSeq.trim(length);
344 if(!success) { trashCode += 'l'; }
348 if(pDataArray->qFileName != ""){
349 int origLength = currSeq.getNumBases();
351 if(pDataArray->qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, pDataArray->qThreshold); }
352 else if(pDataArray->qAverage != 0) { success = currQual.cullQualAverage(currSeq, pDataArray->qAverage); }
353 else if(pDataArray->qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, pDataArray->qRollAverage); }
354 else if(pDataArray->qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, pDataArray->qWindowStep, pDataArray->qWindowSize, pDataArray->qWindowAverage); }
355 else { success = 1; }
357 //you don't want to trim, if it fails above then scrap it
358 if ((!pDataArray->qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
360 if(!success) { trashCode += 'q'; }
363 if(pDataArray->minLength > 0 || pDataArray->maxLength > 0){
364 //success = cullLength(currSeq);
365 int length = currSeq.getNumBases();
366 success = 0; //guilty until proven innocent
367 if(length >= pDataArray->minLength && pDataArray->maxLength == 0) { success = 1; }
368 else if(length >= pDataArray->minLength && length <= pDataArray->maxLength) { success = 1; }
369 else { success = 0; }
371 if(!success) { trashCode += 'l'; }
373 if(pDataArray->maxHomoP > 0){
374 //success = cullHomoP(currSeq);
375 int longHomoP = currSeq.getLongHomoPolymer();
376 success = 0; //guilty until proven innocent
377 if(longHomoP <= pDataArray->maxHomoP){ success = 1; }
378 else { success = 0; }
380 if(!success) { trashCode += 'h'; }
382 if(pDataArray->maxAmbig != -1){
383 //success = cullAmbigs(currSeq);
384 int numNs = currSeq.getAmbigBases();
385 success = 0; //guilty until proven innocent
386 if(numNs <= pDataArray->maxAmbig) { success = 1; }
387 else { success = 0; }
388 if(!success) { trashCode += 'n'; }
391 if(pDataArray->flip){ // should go last
392 currSeq.reverseComplement();
393 if(pDataArray->qFileName != ""){
394 currQual.flipQScores();
398 if(trashCode.length() == 0){
399 string thisGroup = "";
400 if (pDataArray->createGroup) {
401 if(pDataArray->barcodes.size() != 0){
402 thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
403 if (pDataArray->primers.size() != 0) {
404 if (pDataArray->primerNameVector[primerIndex] != "") {
405 if(thisGroup != "") {
406 thisGroup += "." + pDataArray->primerNameVector[primerIndex];
408 thisGroup = pDataArray->primerNameVector[primerIndex];
415 int pos = thisGroup.find("ignore");
416 if (pos == string::npos) {
418 currSeq.setAligned(currSeq.getUnaligned());
419 currSeq.printSequence(trimFASTAFile);
421 if(pDataArray->qFileName != ""){
422 currQual.printQScores(trimQualFile);
425 if(pDataArray->nameFile != ""){
426 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
427 if (itName != pDataArray->nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
428 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
431 int numRedundants = 0;
432 if (pDataArray->countfile != "") {
433 map<string, int>::iterator itCount = pDataArray->nameCount.find(currSeq.getName());
434 if (itCount != pDataArray->nameCount.end()) {
435 trimCountFile << itCount->first << '\t' << itCount->second << endl;
436 numRedundants = itCount->second-1;
437 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
440 if (pDataArray->createGroup) {
441 if(pDataArray->barcodes.size() != 0){
443 if (pDataArray->countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; }
444 else { pDataArray->groupMap[currSeq.getName()] = thisGroup; }
446 if (pDataArray->nameFile != "") {
447 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
448 if (itName != pDataArray->nameMap.end()) {
449 vector<string> thisSeqsNames;
450 pDataArray->m->splitAtChar(itName->second, thisSeqsNames, ',');
451 numRedundants = thisSeqsNames.size()-1; //we already include ourselves below
452 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
453 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
455 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
458 map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
459 if (it == pDataArray->groupCounts.end()) { pDataArray->groupCounts[thisGroup] = 1 + numRedundants; }
460 else { pDataArray->groupCounts[it->first] += (1 + numRedundants); }
465 if(pDataArray->allFiles){
467 pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
468 currSeq.printSequence(output);
471 if(pDataArray->qFileName != ""){
472 pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
473 currQual.printQScores(output);
477 if(pDataArray->nameFile != ""){
478 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
479 if (itName != pDataArray->nameMap.end()) {
480 pDataArray->m->openOutputFileAppend(pDataArray->nameFileNames[barcodeIndex][primerIndex], output);
481 output << itName->first << '\t' << itName->second << endl;
483 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
489 if(pDataArray->nameFile != ""){ //needs to be before the currSeq name is changed
490 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
491 if (itName != pDataArray->nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
492 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
494 if (pDataArray->countfile != "") {
495 map<string, int>::iterator itCount = pDataArray->nameCount.find(currSeq.getName());
496 if (itCount != pDataArray->nameCount.end()) {
497 trimCountFile << itCount->first << '\t' << itCount->second << endl;
498 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
500 currSeq.setName(currSeq.getName() + '|' + trashCode);
501 currSeq.setUnaligned(origSeq);
502 currSeq.setAligned(origSeq);
503 currSeq.printSequence(scrapFASTAFile);
504 if(pDataArray->qFileName != ""){
505 currQual.printQScores(scrapQualFile);
512 if((i) % 1000 == 0){ pDataArray->m->mothurOut(toString(i)); pDataArray->m->mothurOutEndLine(); }
516 if((pDataArray->count) % 1000 != 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
520 trimFASTAFile.close();
521 scrapFASTAFile.close();
522 if (pDataArray->createGroup) { outGroupsFile.close(); }
523 if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
524 if(pDataArray->nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
529 catch(exception& e) {
530 pDataArray->m->errorOut(e, "TrimSeqsCommand", "MyTrimThreadFunction");
537 /**************************************************************************************************/