1 #ifndef TRIMSEQSCOMMAND_H
2 #define TRIMSEQSCOMMAND_H
8 * Created by Pat Schloss on 6/6/09.
9 * Copyright 2009 Patrick D. Schloss. All rights reserved.
14 #include "command.hpp"
15 #include "sequence.hpp"
16 #include "qualityscores.h"
18 #include "trimoligos.h"
21 class TrimSeqsCommand : public Command {
23 TrimSeqsCommand(string);
27 vector<string> setParameters();
28 string getCommandName() { return "trim.seqs"; }
29 string getCommandCategory() { return "Sequence Processing"; }
30 string getHelpString();
31 string getCitation() { return "http://www.mothur.org/wiki/Trim.seqs"; }
32 string getDescription() { return "provides the preprocessing features needed to screen and sort pyrosequences"; }
35 void help() { m->mothurOut(getHelpString()); }
42 unsigned long long start;
43 unsigned long long end;
44 linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
48 bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
49 bool keepFirstTrim(Sequence&, QualityScores&);
50 bool removeLastTrim(Sequence&, QualityScores&);
51 bool cullLength(Sequence&);
52 bool cullHomoP(Sequence&);
53 bool cullAmbigs(Sequence&);
54 string reverseOligo(string);
56 bool abort, createGroup;
57 string fastaFile, oligoFile, qFileName, groupfile, nameFile, outputDir;
59 bool flip, allFiles, qtrim, keepforward;
60 int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts;
61 int qWindowSize, qWindowStep, keepFirst, removeLast;
62 double qRollAverage, qThreshold, qWindowAverage, qAverage;
63 vector<string> revPrimer, outputNames;
64 set<string> filesToRemove;
65 map<string, int> barcodes;
66 vector<string> groupVector;
67 map<string, int> primers;
68 vector<string> linker;
69 vector<string> spacer;
70 map<string, int> combos;
71 map<string, int> groupToIndex;
72 vector<string> primerNameVector; //needed here?
73 vector<string> barcodeNameVector; //needed here?
74 map<string, int> groupCounts;
75 map<string, string> nameMap;
77 vector<int> processIDS; //processid
78 vector<linePair> lines;
79 vector<linePair> qLines;
81 int driverCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair, linePair);
82 int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >);
83 int setLines(string, string);
86 /**************************************************************************************************/
87 //custom data structure for threads to use.
88 // This is passed by void pointer so it can be any data type
89 // that can be passed using a single void pointer (LPVOID).
91 unsigned long long start, end;
93 string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, groupFileName, nameFile;
94 vector<vector<string> > fastaFileNames;
95 vector<vector<string> > qualFileNames;
96 vector<vector<string> > nameFileNames;
97 unsigned long long lineStart, lineEnd, qlineStart, qlineEnd;
98 bool flip, allFiles, qtrim, keepforward, createGroup;
99 int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
100 int qWindowSize, qWindowStep, keepFirst, removeLast, count;
101 double qRollAverage, qThreshold, qWindowAverage, qAverage;
102 vector<string> revPrimer;
103 map<string, int> barcodes;
104 map<string, int> primers;
105 vector<string> linker;
106 vector<string> spacer;
107 map<string, int> combos;
108 vector<string> primerNameVector;
109 vector<string> barcodeNameVector;
110 map<string, int> groupCounts;
111 map<string, string> nameMap;
114 trimData(string fn, string qn, string nf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout,
115 int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, vector<string> revP, vector<string> li, vector<string> spa,
116 vector<string> priNameVector, vector<string> barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
117 int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage,
118 int minL, int maxA, int maxH, int maxL, bool fli, map<string, string> nm) {
125 scrapQFileName = sqn;
127 scrapNFileName = snn;
129 fastaFileNames = ffn;
144 primers = pri; numFPrimers = primers.size();
145 revPrimer = revP; numRPrimers = revPrimer.size();
146 linker = li; numLinkers = linker.size();
147 spacer = spa; numSpacers = spacer.size();
148 primerNameVector = priNameVector;
149 barcodeNameVector = barNameVector;
151 createGroup = cGroup;
155 removeLast = removeL;
156 qWindowStep = WindowStep;
157 qWindowSize = WindowSize;
158 qWindowAverage = WindowAverage;
160 qThreshold = Threshold;
162 qRollAverage = RollAverage;
172 /**************************************************************************************************/
173 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
175 static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
176 trimData* pDataArray;
177 pDataArray = (trimData*)lpParam;
180 ofstream trimFASTAFile;
181 pDataArray->m->openOutputFile(pDataArray->trimFileName, trimFASTAFile);
183 ofstream scrapFASTAFile;
184 pDataArray->m->openOutputFile(pDataArray->scrapFileName, scrapFASTAFile);
186 ofstream trimQualFile;
187 ofstream scrapQualFile;
188 if(pDataArray->qFileName != ""){
189 pDataArray->m->openOutputFile(pDataArray->trimQFileName, trimQualFile);
190 pDataArray->m->openOutputFile(pDataArray->scrapQFileName, scrapQualFile);
193 ofstream trimNameFile;
194 ofstream scrapNameFile;
195 if(pDataArray->nameFile != ""){
196 pDataArray->m->openOutputFile(pDataArray->trimNFileName, trimNameFile);
197 pDataArray->m->openOutputFile(pDataArray->scrapNFileName, scrapNameFile);
201 ofstream outGroupsFile;
202 if (pDataArray->createGroup){ pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile); }
203 if(pDataArray->allFiles){
204 for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
205 for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
206 if (pDataArray->fastaFileNames[i][j] != "") {
208 pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp); temp.close();
209 if(pDataArray->qFileName != ""){
210 pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp); temp.close();
213 if(pDataArray->nameFile != ""){
214 pDataArray->m->openOutputFile(pDataArray->nameFileNames[i][j], temp); temp.close();
222 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
223 if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) {
225 }else { //this accounts for the difference in line endings.
226 inFASTA.seekg(pDataArray->lineStart-1); pDataArray->m->gobble(inFASTA);
230 if(pDataArray->qFileName != "") {
231 pDataArray->m->openInputFile(pDataArray->qFileName, qFile);
232 if ((pDataArray->qlineStart == 0) || (pDataArray->qlineStart == 1)) {
234 }else { //this accounts for the difference in line endings.
235 qFile.seekg(pDataArray->qlineStart-1); pDataArray->m->gobble(qFile);
240 TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
242 pDataArray->count = pDataArray->lineEnd;
243 for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process
245 if (pDataArray->m->control_pressed) {
246 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
247 if (pDataArray->createGroup) { outGroupsFile.close(); }
248 if(pDataArray->qFileName != ""){ qFile.close(); }
253 string trashCode = "";
254 int currentSeqsDiffs = 0;
256 Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
258 QualityScores currQual;
259 if(pDataArray->qFileName != ""){
260 currQual = QualityScores(qFile); pDataArray->m->gobble(qFile);
263 string origSeq = currSeq.getUnaligned();
266 int barcodeIndex = 0;
269 if(pDataArray->numLinkers != 0){
270 success = trimOligos.stripLinker(currSeq, currQual);
271 if(success > pDataArray->ldiffs) { trashCode += 'k'; }
272 else{ currentSeqsDiffs += success; }
275 if(pDataArray->barcodes.size() != 0){
276 success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
277 if(success > pDataArray->bdiffs) { trashCode += 'b'; }
278 else{ currentSeqsDiffs += success; }
281 if(pDataArray->numSpacers != 0){
282 success = trimOligos.stripSpacer(currSeq, currQual);
283 if(success > pDataArray->sdiffs) { trashCode += 's'; }
284 else{ currentSeqsDiffs += success; }
288 if(pDataArray->numFPrimers != 0){
289 success = trimOligos.stripForward(currSeq, currQual, primerIndex, pDataArray->keepforward);
290 if(success > pDataArray->pdiffs) { trashCode += 'f'; }
291 else{ currentSeqsDiffs += success; }
294 if (currentSeqsDiffs > pDataArray->tdiffs) { trashCode += 't'; }
296 if(pDataArray->numRPrimers != 0){
297 success = trimOligos.stripReverse(currSeq, currQual);
298 if(!success) { trashCode += 'r'; }
301 if(pDataArray->keepFirst != 0){
302 //success = keepFirstTrim(currSeq, currQual);
304 if(currQual.getName() != ""){
305 currQual.trimQScores(-1, pDataArray->keepFirst);
307 currSeq.trim(pDataArray->keepFirst);
310 if(pDataArray->removeLast != 0){
311 //success = removeLastTrim(currSeq, currQual);
313 int length = currSeq.getNumBases() - pDataArray->removeLast;
316 if(currQual.getName() != ""){
317 currQual.trimQScores(-1, length);
319 currSeq.trim(length);
324 if(!success) { trashCode += 'l'; }
328 if(pDataArray->qFileName != ""){
329 int origLength = currSeq.getNumBases();
331 if(pDataArray->qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, pDataArray->qThreshold); }
332 else if(pDataArray->qAverage != 0) { success = currQual.cullQualAverage(currSeq, pDataArray->qAverage); }
333 else if(pDataArray->qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, pDataArray->qRollAverage); }
334 else if(pDataArray->qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, pDataArray->qWindowStep, pDataArray->qWindowSize, pDataArray->qWindowAverage); }
335 else { success = 1; }
337 //you don't want to trim, if it fails above then scrap it
338 if ((!pDataArray->qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
340 if(!success) { trashCode += 'q'; }
343 if(pDataArray->minLength > 0 || pDataArray->maxLength > 0){
344 //success = cullLength(currSeq);
345 int length = currSeq.getNumBases();
346 success = 0; //guilty until proven innocent
347 if(length >= pDataArray->minLength && pDataArray->maxLength == 0) { success = 1; }
348 else if(length >= pDataArray->minLength && length <= pDataArray->maxLength) { success = 1; }
349 else { success = 0; }
351 if(!success) { trashCode += 'l'; }
353 if(pDataArray->maxHomoP > 0){
354 //success = cullHomoP(currSeq);
355 int longHomoP = currSeq.getLongHomoPolymer();
356 success = 0; //guilty until proven innocent
357 if(longHomoP <= pDataArray->maxHomoP){ success = 1; }
358 else { success = 0; }
360 if(!success) { trashCode += 'h'; }
362 if(pDataArray->maxAmbig != -1){
363 //success = cullAmbigs(currSeq);
364 int numNs = currSeq.getAmbigBases();
365 success = 0; //guilty until proven innocent
366 if(numNs <= pDataArray->maxAmbig) { success = 1; }
367 else { success = 0; }
368 if(!success) { trashCode += 'n'; }
371 if(pDataArray->flip){ // should go last
372 currSeq.reverseComplement();
373 if(pDataArray->qFileName != ""){
374 currQual.flipQScores();
378 if(trashCode.length() == 0){
379 currSeq.setAligned(currSeq.getUnaligned());
380 currSeq.printSequence(trimFASTAFile);
382 if(pDataArray->qFileName != ""){
383 currQual.printQScores(trimQualFile);
386 if(pDataArray->nameFile != ""){
387 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
388 if (itName != pDataArray->nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
389 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
392 if (pDataArray->createGroup) {
393 if(pDataArray->barcodes.size() != 0){
394 string thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
395 if (pDataArray->primers.size() != 0) {
396 if (pDataArray->primerNameVector[primerIndex] != "") {
397 if(thisGroup != "") {
398 thisGroup += "." + pDataArray->primerNameVector[primerIndex];
400 thisGroup = pDataArray->primerNameVector[primerIndex];
405 outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
407 if (pDataArray->nameFile != "") {
408 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
409 if (itName != pDataArray->nameMap.end()) {
410 vector<string> thisSeqsNames;
411 pDataArray->m->splitAtChar(itName->second, thisSeqsNames, ',');
412 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
413 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
415 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
418 map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
419 if (it == pDataArray->groupCounts.end()) { pDataArray->groupCounts[thisGroup] = 1; }
420 else { pDataArray->groupCounts[it->first]++; }
425 if(pDataArray->allFiles){
427 pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
428 currSeq.printSequence(output);
431 if(pDataArray->qFileName != ""){
432 pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
433 currQual.printQScores(output);
437 if(pDataArray->nameFile != ""){
438 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
439 if (itName != pDataArray->nameMap.end()) {
440 pDataArray->m->openOutputFileAppend(pDataArray->nameFileNames[barcodeIndex][primerIndex], output);
441 output << itName->first << '\t' << itName->second << endl;
443 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
448 if(pDataArray->nameFile != ""){ //needs to be before the currSeq name is changed
449 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
450 if (itName != pDataArray->nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
451 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
453 currSeq.setName(currSeq.getName() + '|' + trashCode);
454 currSeq.setUnaligned(origSeq);
455 currSeq.setAligned(origSeq);
456 currSeq.printSequence(scrapFASTAFile);
457 if(pDataArray->qFileName != ""){
458 currQual.printQScores(scrapQualFile);
465 if((i) % 1000 == 0){ pDataArray->m->mothurOut(toString(i)); pDataArray->m->mothurOutEndLine(); }
469 if((pDataArray->count) % 1000 != 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
473 trimFASTAFile.close();
474 scrapFASTAFile.close();
475 if (pDataArray->createGroup) { outGroupsFile.close(); }
476 if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
477 if(pDataArray->nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
482 catch(exception& e) {
483 pDataArray->m->errorOut(e, "TrimSeqsCommand", "MyTrimThreadFunction");
490 /**************************************************************************************************/