1 #ifndef TRIMSEQSCOMMAND_H
2 #define TRIMSEQSCOMMAND_H
8 * Created by Pat Schloss on 6/6/09.
9 * Copyright 2009 Patrick D. Schloss. All rights reserved.
14 #include "command.hpp"
15 #include "sequence.hpp"
16 #include "qualityscores.h"
17 #include "trimoligos.h"
18 #include "counttable.h"
21 class TrimSeqsCommand : public Command {
23 TrimSeqsCommand(string);
27 vector<string> setParameters();
28 string getCommandName() { return "trim.seqs"; }
29 string getCommandCategory() { return "Sequence Processing"; }
30 string getOutputFileNameTag(string, string);
31 string getHelpString();
32 string getCitation() { return "http://www.mothur.org/wiki/Trim.seqs"; }
33 string getDescription() { return "provides the preprocessing features needed to screen and sort pyrosequences"; }
36 void help() { m->mothurOut(getHelpString()); }
40 unsigned long long start;
41 unsigned long long end;
42 linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
46 bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
47 bool keepFirstTrim(Sequence&, QualityScores&);
48 bool removeLastTrim(Sequence&, QualityScores&);
49 bool cullLength(Sequence&);
50 bool cullHomoP(Sequence&);
51 bool cullAmbigs(Sequence&);
52 string reverseOligo(string);
54 bool abort, createGroup;
55 string fastaFile, oligoFile, qFileName, groupfile, nameFile, countfile, outputDir;
57 bool flip, allFiles, qtrim, keepforward;
58 int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts;
59 int qWindowSize, qWindowStep, keepFirst, removeLast;
60 double qRollAverage, qThreshold, qWindowAverage, qAverage;
61 vector<string> revPrimer, outputNames;
62 set<string> filesToRemove;
63 map<string, int> barcodes;
64 map<string, int> rbarcodes;
65 vector<string> groupVector;
66 map<string, int> primers;
67 vector<string> linker;
68 vector<string> spacer;
69 map<string, int> combos;
70 map<string, int> groupToIndex;
71 vector<string> primerNameVector; //needed here?
72 vector<string> barcodeNameVector; //needed here?
73 map<string, int> groupCounts;
74 map<string, string> nameMap;
75 map<string, int> nameCount; //for countfile name -> repCount
76 map<string, string> groupMap; //for countfile name -> group
78 vector<int> processIDS; //processid
79 vector<linePair> lines;
80 vector<linePair> qLines;
82 int driverCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair, linePair);
83 int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >);
84 int setLines(string, string);
87 /**************************************************************************************************/
88 //custom data structure for threads to use.
89 // This is passed by void pointer so it can be any data type
90 // that can be passed using a single void pointer (LPVOID).
92 unsigned long long start, end;
94 string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, trimCFileName, scrapCFileName, groupFileName, nameFile, countfile;
95 vector<vector<string> > fastaFileNames;
96 vector<vector<string> > qualFileNames;
97 vector<vector<string> > nameFileNames;
98 unsigned long long lineStart, lineEnd, qlineStart, qlineEnd;
99 bool flip, allFiles, qtrim, keepforward, createGroup;
100 int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
101 int qWindowSize, qWindowStep, keepFirst, removeLast, count;
102 double qRollAverage, qThreshold, qWindowAverage, qAverage;
103 vector<string> revPrimer;
104 map<string, int> barcodes;
105 map<string, int> rbarcodes;
106 map<string, int> primers;
107 map<string, int> nameCount;
108 vector<string> linker;
109 vector<string> spacer;
110 map<string, int> combos;
111 vector<string> primerNameVector;
112 vector<string> barcodeNameVector;
113 map<string, int> groupCounts;
114 map<string, string> nameMap;
115 map<string, string> groupMap;
118 trimData(string fn, string qn, string nf, string cf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string tcn, string scn,string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout,
119 int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, map<string, int> rbar, vector<string> revP, vector<string> li, vector<string> spa,
120 vector<string> priNameVector, vector<string> barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
121 int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage,
122 int minL, int maxA, int maxH, int maxL, bool fli, map<string, string> nm, map<string, int> ncount) {
130 scrapQFileName = sqn;
132 scrapNFileName = snn;
134 scrapCFileName = scn;
136 fastaFileNames = ffn;
153 primers = pri; numFPrimers = primers.size();
154 revPrimer = revP; numRPrimers = revPrimer.size();
155 linker = li; numLinkers = linker.size();
156 spacer = spa; numSpacers = spacer.size();
157 primerNameVector = priNameVector;
158 barcodeNameVector = barNameVector;
160 createGroup = cGroup;
164 removeLast = removeL;
165 qWindowStep = WindowStep;
166 qWindowSize = WindowSize;
167 qWindowAverage = WindowAverage;
169 qThreshold = Threshold;
171 qRollAverage = RollAverage;
181 /**************************************************************************************************/
182 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
184 static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
185 trimData* pDataArray;
186 pDataArray = (trimData*)lpParam;
189 ofstream trimFASTAFile;
190 pDataArray->m->openOutputFile(pDataArray->trimFileName, trimFASTAFile);
192 ofstream scrapFASTAFile;
193 pDataArray->m->openOutputFile(pDataArray->scrapFileName, scrapFASTAFile);
195 ofstream trimQualFile;
196 ofstream scrapQualFile;
197 if(pDataArray->qFileName != ""){
198 pDataArray->m->openOutputFile(pDataArray->trimQFileName, trimQualFile);
199 pDataArray->m->openOutputFile(pDataArray->scrapQFileName, scrapQualFile);
202 ofstream trimNameFile;
203 ofstream scrapNameFile;
204 if(pDataArray->nameFile != ""){
205 pDataArray->m->openOutputFile(pDataArray->trimNFileName, trimNameFile);
206 pDataArray->m->openOutputFile(pDataArray->scrapNFileName, scrapNameFile);
210 ofstream outGroupsFile;
211 if ((pDataArray->createGroup) && (pDataArray->countfile == "")){ pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile); }
212 if(pDataArray->allFiles){
213 for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
214 for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
215 if (pDataArray->fastaFileNames[i][j] != "") {
217 pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp); temp.close();
218 if(pDataArray->qFileName != ""){
219 pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp); temp.close();
222 if(pDataArray->nameFile != ""){
223 pDataArray->m->openOutputFile(pDataArray->nameFileNames[i][j], temp); temp.close();
230 ofstream trimCountFile;
231 ofstream scrapCountFile;
232 if(pDataArray->countfile != ""){
233 pDataArray->m->openOutputFile(pDataArray->trimCFileName, trimCountFile);
234 pDataArray->m->openOutputFile(pDataArray->scrapCFileName, scrapCountFile);
235 if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) { trimCountFile << "Representative_Sequence\ttotal" << endl; scrapCountFile << "Representative_Sequence\ttotal" << endl; }
239 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
240 if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) {
242 }else { //this accounts for the difference in line endings.
243 inFASTA.seekg(pDataArray->lineStart-1); pDataArray->m->gobble(inFASTA);
247 if(pDataArray->qFileName != "") {
248 pDataArray->m->openInputFile(pDataArray->qFileName, qFile);
249 if ((pDataArray->qlineStart == 0) || (pDataArray->qlineStart == 1)) {
251 }else { //this accounts for the difference in line endings.
252 qFile.seekg(pDataArray->qlineStart-1); pDataArray->m->gobble(qFile);
257 TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->rbarcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
259 pDataArray->count = pDataArray->lineEnd;
260 for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process
262 if (pDataArray->m->control_pressed) {
263 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
264 if ((pDataArray->createGroup) && (pDataArray->countfile == "")) { outGroupsFile.close(); }
265 if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
266 if(pDataArray->nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
267 if(pDataArray->countfile != "") { scrapCountFile.close(); trimCountFile.close(); }
269 if(pDataArray->qFileName != ""){ qFile.close(); }
274 string trashCode = "";
275 int currentSeqsDiffs = 0;
277 Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
279 QualityScores currQual;
280 if(pDataArray->qFileName != ""){
281 currQual = QualityScores(qFile); pDataArray->m->gobble(qFile);
284 string origSeq = currSeq.getUnaligned();
287 int barcodeIndex = 0;
290 if(pDataArray->numLinkers != 0){
291 success = trimOligos.stripLinker(currSeq, currQual);
292 if(success > pDataArray->ldiffs) { trashCode += 'k'; }
293 else{ currentSeqsDiffs += success; }
296 if(pDataArray->barcodes.size() != 0){
297 success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
298 if(success > pDataArray->bdiffs) { trashCode += 'b'; }
299 else{ currentSeqsDiffs += success; }
302 if(pDataArray->rbarcodes.size() != 0){
303 success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex);
304 if(success > pDataArray->bdiffs) { trashCode += 'b'; }
305 else{ currentSeqsDiffs += success; }
308 if(pDataArray->numSpacers != 0){
309 success = trimOligos.stripSpacer(currSeq, currQual);
310 if(success > pDataArray->sdiffs) { trashCode += 's'; }
311 else{ currentSeqsDiffs += success; }
315 if(pDataArray->numFPrimers != 0){
316 success = trimOligos.stripForward(currSeq, currQual, primerIndex, pDataArray->keepforward);
317 if(success > pDataArray->pdiffs) { trashCode += 'f'; }
318 else{ currentSeqsDiffs += success; }
321 if (currentSeqsDiffs > pDataArray->tdiffs) { trashCode += 't'; }
323 if(pDataArray->numRPrimers != 0){
324 success = trimOligos.stripReverse(currSeq, currQual);
325 if(!success) { trashCode += 'r'; }
328 if(pDataArray->keepFirst != 0){
329 //success = keepFirstTrim(currSeq, currQual);
331 if(currQual.getName() != ""){
332 currQual.trimQScores(-1, pDataArray->keepFirst);
334 currSeq.trim(pDataArray->keepFirst);
337 if(pDataArray->removeLast != 0){
338 //success = removeLastTrim(currSeq, currQual);
340 int length = currSeq.getNumBases() - pDataArray->removeLast;
343 if(currQual.getName() != ""){
344 currQual.trimQScores(-1, length);
346 currSeq.trim(length);
351 if(!success) { trashCode += 'l'; }
355 if(pDataArray->qFileName != ""){
356 int origLength = currSeq.getNumBases();
358 if(pDataArray->qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, pDataArray->qThreshold); }
359 else if(pDataArray->qAverage != 0) { success = currQual.cullQualAverage(currSeq, pDataArray->qAverage); }
360 else if(pDataArray->qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, pDataArray->qRollAverage); }
361 else if(pDataArray->qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, pDataArray->qWindowStep, pDataArray->qWindowSize, pDataArray->qWindowAverage); }
362 else { success = 1; }
364 //you don't want to trim, if it fails above then scrap it
365 if ((!pDataArray->qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
367 if(!success) { trashCode += 'q'; }
370 if(pDataArray->minLength > 0 || pDataArray->maxLength > 0){
371 //success = cullLength(currSeq);
372 int length = currSeq.getNumBases();
373 success = 0; //guilty until proven innocent
374 if(length >= pDataArray->minLength && pDataArray->maxLength == 0) { success = 1; }
375 else if(length >= pDataArray->minLength && length <= pDataArray->maxLength) { success = 1; }
376 else { success = 0; }
378 if(!success) { trashCode += 'l'; }
380 if(pDataArray->maxHomoP > 0){
381 //success = cullHomoP(currSeq);
382 int longHomoP = currSeq.getLongHomoPolymer();
383 success = 0; //guilty until proven innocent
384 if(longHomoP <= pDataArray->maxHomoP){ success = 1; }
385 else { success = 0; }
387 if(!success) { trashCode += 'h'; }
389 if(pDataArray->maxAmbig != -1){
390 //success = cullAmbigs(currSeq);
391 int numNs = currSeq.getAmbigBases();
392 success = 0; //guilty until proven innocent
393 if(numNs <= pDataArray->maxAmbig) { success = 1; }
394 else { success = 0; }
395 if(!success) { trashCode += 'n'; }
398 if(pDataArray->flip){ // should go last
399 currSeq.reverseComplement();
400 if(pDataArray->qFileName != ""){
401 currQual.flipQScores();
405 if(trashCode.length() == 0){
406 currSeq.setAligned(currSeq.getUnaligned());
407 currSeq.printSequence(trimFASTAFile);
409 if(pDataArray->qFileName != ""){
410 currQual.printQScores(trimQualFile);
413 if(pDataArray->nameFile != ""){
414 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
415 if (itName != pDataArray->nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
416 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
419 int numRedundants = 0;
420 if (pDataArray->countfile != "") {
421 map<string, int>::iterator itCount = pDataArray->nameCount.find(currSeq.getName());
422 if (itCount != pDataArray->nameCount.end()) {
423 trimCountFile << itCount->first << '\t' << itCount->second << endl;
424 numRedundants = itCount->second-1;
425 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
428 if (pDataArray->createGroup) {
429 if(pDataArray->barcodes.size() != 0){
430 string thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
431 if (pDataArray->primers.size() != 0) {
432 if (pDataArray->primerNameVector[primerIndex] != "") {
433 if(thisGroup != "") {
434 thisGroup += "." + pDataArray->primerNameVector[primerIndex];
436 thisGroup = pDataArray->primerNameVector[primerIndex];
441 if (pDataArray->countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; }
442 else { pDataArray->groupMap[currSeq.getName()] = thisGroup; }
444 if (pDataArray->nameFile != "") {
445 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
446 if (itName != pDataArray->nameMap.end()) {
447 vector<string> thisSeqsNames;
448 pDataArray->m->splitAtChar(itName->second, thisSeqsNames, ',');
449 numRedundants = thisSeqsNames.size()-1; //we already include ourselves below
450 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
451 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
453 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
456 map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
457 if (it == pDataArray->groupCounts.end()) { pDataArray->groupCounts[thisGroup] = 1 + numRedundants; }
458 else { pDataArray->groupCounts[it->first] += (1 + numRedundants); }
463 if(pDataArray->allFiles){
465 pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
466 currSeq.printSequence(output);
469 if(pDataArray->qFileName != ""){
470 pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
471 currQual.printQScores(output);
475 if(pDataArray->nameFile != ""){
476 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
477 if (itName != pDataArray->nameMap.end()) {
478 pDataArray->m->openOutputFileAppend(pDataArray->nameFileNames[barcodeIndex][primerIndex], output);
479 output << itName->first << '\t' << itName->second << endl;
481 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
486 if(pDataArray->nameFile != ""){ //needs to be before the currSeq name is changed
487 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
488 if (itName != pDataArray->nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
489 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
491 if (pDataArray->countfile != "") {
492 map<string, int>::iterator itCount = pDataArray->nameCount.find(currSeq.getName());
493 if (itCount != pDataArray->nameCount.end()) {
494 trimCountFile << itCount->first << '\t' << itCount->second << endl;
495 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
497 currSeq.setName(currSeq.getName() + '|' + trashCode);
498 currSeq.setUnaligned(origSeq);
499 currSeq.setAligned(origSeq);
500 currSeq.printSequence(scrapFASTAFile);
501 if(pDataArray->qFileName != ""){
502 currQual.printQScores(scrapQualFile);
509 if((i) % 1000 == 0){ pDataArray->m->mothurOut(toString(i)); pDataArray->m->mothurOutEndLine(); }
513 if((pDataArray->count) % 1000 != 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
517 trimFASTAFile.close();
518 scrapFASTAFile.close();
519 if (pDataArray->createGroup) { outGroupsFile.close(); }
520 if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
521 if(pDataArray->nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
526 catch(exception& e) {
527 pDataArray->m->errorOut(e, "TrimSeqsCommand", "MyTrimThreadFunction");
534 /**************************************************************************************************/