1 #ifndef TRIMSEQSCOMMAND_H
2 #define TRIMSEQSCOMMAND_H
8 * Created by Pat Schloss on 6/6/09.
9 * Copyright 2009 Patrick D. Schloss. All rights reserved.
14 #include "command.hpp"
15 #include "sequence.hpp"
16 #include "qualityscores.h"
18 #include "trimoligos.h"
21 class TrimSeqsCommand : public Command {
23 TrimSeqsCommand(string);
27 vector<string> setParameters();
28 string getCommandName() { return "trim.seqs"; }
29 string getCommandCategory() { return "Sequence Processing"; }
30 string getHelpString();
31 string getCitation() { return "http://www.mothur.org/wiki/Trim.seqs"; }
32 string getDescription() { return "provides the preprocessing features needed to screen and sort pyrosequences"; }
35 void help() { m->mothurOut(getHelpString()); }
42 unsigned long long start;
43 unsigned long long end;
44 linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
48 bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
49 bool keepFirstTrim(Sequence&, QualityScores&);
50 bool removeLastTrim(Sequence&, QualityScores&);
51 bool cullLength(Sequence&);
52 bool cullHomoP(Sequence&);
53 bool cullAmbigs(Sequence&);
55 bool abort, createGroup;
56 string fastaFile, oligoFile, qFileName, groupfile, nameFile, outputDir;
58 bool flip, allFiles, qtrim, keepforward;
59 int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts;
60 int qWindowSize, qWindowStep, keepFirst, removeLast;
61 double qRollAverage, qThreshold, qWindowAverage, qAverage;
62 vector<string> revPrimer, outputNames;
63 set<string> filesToRemove;
64 map<string, int> barcodes;
65 vector<string> groupVector;
66 map<string, int> primers;
67 vector<string> linker;
68 vector<string> spacer;
69 map<string, int> combos;
70 map<string, int> groupToIndex;
71 vector<string> primerNameVector; //needed here?
72 vector<string> barcodeNameVector; //needed here?
73 map<string, int> groupCounts;
74 map<string, string> nameMap;
76 vector<int> processIDS; //processid
77 vector<linePair> lines;
78 vector<linePair> qLines;
80 int driverCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair, linePair);
81 int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >);
82 int setLines(string, string);
85 /**************************************************************************************************/
86 //custom data structure for threads to use.
87 // This is passed by void pointer so it can be any data type
88 // that can be passed using a single void pointer (LPVOID).
90 unsigned long long start, end;
92 string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, groupFileName, nameFile;
93 vector<vector<string> > fastaFileNames;
94 vector<vector<string> > qualFileNames;
95 vector<vector<string> > nameFileNames;
96 unsigned long long lineStart, lineEnd, qlineStart, qlineEnd;
97 bool flip, allFiles, qtrim, keepforward, createGroup;
98 int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
99 int qWindowSize, qWindowStep, keepFirst, removeLast, count;
100 double qRollAverage, qThreshold, qWindowAverage, qAverage;
101 vector<string> revPrimer;
102 map<string, int> barcodes;
103 map<string, int> primers;
104 vector<string> linker;
105 vector<string> spacer;
106 map<string, int> combos;
107 vector<string> primerNameVector;
108 vector<string> barcodeNameVector;
109 map<string, int> groupCounts;
110 map<string, string> nameMap;
113 trimData(string fn, string qn, string nf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout,
114 int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, vector<string> revP, vector<string> li, vector<string> spa,
115 vector<string> priNameVector, vector<string> barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
116 int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage,
117 int minL, int maxA, int maxH, int maxL, bool fli, map<string, string> nm) {
124 scrapQFileName = sqn;
126 scrapNFileName = snn;
128 fastaFileNames = ffn;
143 primers = pri; numFPrimers = primers.size();
144 revPrimer = revP; numRPrimers = revPrimer.size();
145 linker = li; numLinkers = linker.size();
146 spacer = spa; numSpacers = spacer.size();
147 primerNameVector = priNameVector;
148 barcodeNameVector = barNameVector;
150 createGroup = cGroup;
154 removeLast = removeL;
155 qWindowStep = WindowStep;
156 qWindowSize = WindowSize;
157 qWindowAverage = WindowAverage;
159 qThreshold = Threshold;
161 qRollAverage = RollAverage;
171 /**************************************************************************************************/
172 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
174 static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
175 trimData* pDataArray;
176 pDataArray = (trimData*)lpParam;
179 ofstream trimFASTAFile;
180 pDataArray->m->openOutputFile(pDataArray->trimFileName, trimFASTAFile);
182 ofstream scrapFASTAFile;
183 pDataArray->m->openOutputFile(pDataArray->scrapFileName, scrapFASTAFile);
185 ofstream trimQualFile;
186 ofstream scrapQualFile;
187 if(pDataArray->qFileName != ""){
188 pDataArray->m->openOutputFile(pDataArray->trimQFileName, trimQualFile);
189 pDataArray->m->openOutputFile(pDataArray->scrapQFileName, scrapQualFile);
192 ofstream trimNameFile;
193 ofstream scrapNameFile;
194 if(pDataArray->nameFile != ""){
195 pDataArray->m->openOutputFile(pDataArray->trimNFileName, trimNameFile);
196 pDataArray->m->openOutputFile(pDataArray->scrapNFileName, scrapNameFile);
200 ofstream outGroupsFile;
201 if (pDataArray->createGroup){ pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile); }
202 if(pDataArray->allFiles){
203 for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
204 for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
205 if (pDataArray->fastaFileNames[i][j] != "") {
207 pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp); temp.close();
208 if(pDataArray->qFileName != ""){
209 pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp); temp.close();
212 if(pDataArray->nameFile != ""){
213 pDataArray->m->openOutputFile(pDataArray->nameFileNames[i][j], temp); temp.close();
221 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
222 if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) {
224 }else { //this accounts for the difference in line endings.
225 inFASTA.seekg(pDataArray->lineStart-1); pDataArray->m->gobble(inFASTA);
229 if(pDataArray->qFileName != "") {
230 pDataArray->m->openInputFile(pDataArray->qFileName, qFile);
231 if ((pDataArray->qlineStart == 0) || (pDataArray->qlineStart == 1)) {
233 }else { //this accounts for the difference in line endings.
234 qFile.seekg(pDataArray->qlineStart-1); pDataArray->m->gobble(qFile);
239 TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
241 pDataArray->count = pDataArray->lineEnd;
242 for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process
244 if (pDataArray->m->control_pressed) {
245 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
246 if (pDataArray->createGroup) { outGroupsFile.close(); }
247 if(pDataArray->qFileName != ""){ qFile.close(); }
252 string trashCode = "";
253 int currentSeqsDiffs = 0;
255 Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
257 QualityScores currQual;
258 if(pDataArray->qFileName != ""){
259 currQual = QualityScores(qFile); pDataArray->m->gobble(qFile);
262 string origSeq = currSeq.getUnaligned();
265 int barcodeIndex = 0;
268 if(pDataArray->numLinkers != 0){
269 success = trimOligos.stripLinker(currSeq, currQual);
270 if(success > pDataArray->ldiffs) { trashCode += 'k'; }
271 else{ currentSeqsDiffs += success; }
274 if(pDataArray->barcodes.size() != 0){
275 success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
276 if(success > pDataArray->bdiffs) { trashCode += 'b'; }
277 else{ currentSeqsDiffs += success; }
280 if(pDataArray->numSpacers != 0){
281 success = trimOligos.stripSpacer(currSeq, currQual);
282 if(success > pDataArray->sdiffs) { trashCode += 's'; }
283 else{ currentSeqsDiffs += success; }
287 if(pDataArray->numFPrimers != 0){
288 success = trimOligos.stripForward(currSeq, currQual, primerIndex, pDataArray->keepforward);
289 if(success > pDataArray->pdiffs) { trashCode += 'f'; }
290 else{ currentSeqsDiffs += success; }
293 if (currentSeqsDiffs > pDataArray->tdiffs) { trashCode += 't'; }
295 if(pDataArray->numRPrimers != 0){
296 success = trimOligos.stripReverse(currSeq, currQual);
297 if(!success) { trashCode += 'r'; }
300 if(pDataArray->keepFirst != 0){
301 //success = keepFirstTrim(currSeq, currQual);
303 if(currQual.getName() != ""){
304 currQual.trimQScores(-1, pDataArray->keepFirst);
306 currSeq.trim(pDataArray->keepFirst);
309 if(pDataArray->removeLast != 0){
310 //success = removeLastTrim(currSeq, currQual);
312 int length = currSeq.getNumBases() - pDataArray->removeLast;
315 if(currQual.getName() != ""){
316 currQual.trimQScores(-1, length);
318 currSeq.trim(length);
323 if(!success) { trashCode += 'l'; }
327 if(pDataArray->qFileName != ""){
328 int origLength = currSeq.getNumBases();
330 if(pDataArray->qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, pDataArray->qThreshold); }
331 else if(pDataArray->qAverage != 0) { success = currQual.cullQualAverage(currSeq, pDataArray->qAverage); }
332 else if(pDataArray->qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, pDataArray->qRollAverage); }
333 else if(pDataArray->qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, pDataArray->qWindowStep, pDataArray->qWindowSize, pDataArray->qWindowAverage); }
334 else { success = 1; }
336 //you don't want to trim, if it fails above then scrap it
337 if ((!pDataArray->qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
339 if(!success) { trashCode += 'q'; }
342 if(pDataArray->minLength > 0 || pDataArray->maxLength > 0){
343 //success = cullLength(currSeq);
344 int length = currSeq.getNumBases();
345 success = 0; //guilty until proven innocent
346 if(length >= pDataArray->minLength && pDataArray->maxLength == 0) { success = 1; }
347 else if(length >= pDataArray->minLength && length <= pDataArray->maxLength) { success = 1; }
348 else { success = 0; }
350 if(!success) { trashCode += 'l'; }
352 if(pDataArray->maxHomoP > 0){
353 //success = cullHomoP(currSeq);
354 int longHomoP = currSeq.getLongHomoPolymer();
355 success = 0; //guilty until proven innocent
356 if(longHomoP <= pDataArray->maxHomoP){ success = 1; }
357 else { success = 0; }
359 if(!success) { trashCode += 'h'; }
361 if(pDataArray->maxAmbig != -1){
362 //success = cullAmbigs(currSeq);
363 int numNs = currSeq.getAmbigBases();
364 success = 0; //guilty until proven innocent
365 if(numNs <= pDataArray->maxAmbig) { success = 1; }
366 else { success = 0; }
367 if(!success) { trashCode += 'n'; }
370 if(pDataArray->flip){ // should go last
371 currSeq.reverseComplement();
372 if(pDataArray->qFileName != ""){
373 currQual.flipQScores();
377 if(trashCode.length() == 0){
378 currSeq.setAligned(currSeq.getUnaligned());
379 currSeq.printSequence(trimFASTAFile);
381 if(pDataArray->qFileName != ""){
382 currQual.printQScores(trimQualFile);
385 if(pDataArray->nameFile != ""){
386 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
387 if (itName != pDataArray->nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
388 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
391 if (pDataArray->createGroup) {
392 if(pDataArray->barcodes.size() != 0){
393 string thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
394 if (pDataArray->primers.size() != 0) {
395 if (pDataArray->primerNameVector[primerIndex] != "") {
396 if(thisGroup != "") {
397 thisGroup += "." + pDataArray->primerNameVector[primerIndex];
399 thisGroup = pDataArray->primerNameVector[primerIndex];
404 outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
406 if (pDataArray->nameFile != "") {
407 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
408 if (itName != pDataArray->nameMap.end()) {
409 vector<string> thisSeqsNames;
410 pDataArray->m->splitAtChar(itName->second, thisSeqsNames, ',');
411 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
412 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
414 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
417 map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
418 if (it == pDataArray->groupCounts.end()) { pDataArray->groupCounts[thisGroup] = 1; }
419 else { pDataArray->groupCounts[it->first]++; }
424 if(pDataArray->allFiles){
426 pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
427 currSeq.printSequence(output);
430 if(pDataArray->qFileName != ""){
431 pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
432 currQual.printQScores(output);
436 if(pDataArray->nameFile != ""){
437 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
438 if (itName != pDataArray->nameMap.end()) {
439 pDataArray->m->openOutputFileAppend(pDataArray->nameFileNames[barcodeIndex][primerIndex], output);
440 output << itName->first << '\t' << itName->second << endl;
442 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
447 if(pDataArray->nameFile != ""){ //needs to be before the currSeq name is changed
448 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
449 if (itName != pDataArray->nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
450 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
452 currSeq.setName(currSeq.getName() + '|' + trashCode);
453 currSeq.setUnaligned(origSeq);
454 currSeq.setAligned(origSeq);
455 currSeq.printSequence(scrapFASTAFile);
456 if(pDataArray->qFileName != ""){
457 currQual.printQScores(scrapQualFile);
464 if((i) % 1000 == 0){ pDataArray->m->mothurOut(toString(i)); pDataArray->m->mothurOutEndLine(); }
468 if((pDataArray->count) % 1000 != 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
472 trimFASTAFile.close();
473 scrapFASTAFile.close();
474 if (pDataArray->createGroup) { outGroupsFile.close(); }
475 if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
476 if(pDataArray->nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
481 catch(exception& e) {
482 pDataArray->m->errorOut(e, "TrimSeqsCommand", "MyTrimThreadFunction");
489 /**************************************************************************************************/