1 #ifndef TRIMSEQSCOMMAND_H
2 #define TRIMSEQSCOMMAND_H
8 * Created by Pat Schloss on 6/6/09.
9 * Copyright 2009 Patrick D. Schloss. All rights reserved.
14 #include "command.hpp"
15 #include "sequence.hpp"
16 #include "qualityscores.h"
17 #include "trimoligos.h"
18 #include "counttable.h"
21 class TrimSeqsCommand : public Command {
23 TrimSeqsCommand(string);
27 vector<string> setParameters();
28 string getCommandName() { return "trim.seqs"; }
29 string getCommandCategory() { return "Sequence Processing"; }
30 string getOutputFileNameTag(string, string);
31 string getHelpString();
32 string getCitation() { return "http://www.mothur.org/wiki/Trim.seqs"; }
33 string getDescription() { return "provides the preprocessing features needed to screen and sort pyrosequences"; }
36 void help() { m->mothurOut(getHelpString()); }
40 unsigned long long start;
41 unsigned long long end;
42 linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
46 bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
47 bool keepFirstTrim(Sequence&, QualityScores&);
48 bool removeLastTrim(Sequence&, QualityScores&);
49 bool cullLength(Sequence&);
50 bool cullHomoP(Sequence&);
51 bool cullAmbigs(Sequence&);
52 string reverseOligo(string);
54 bool abort, createGroup;
55 string fastaFile, oligoFile, qFileName, groupfile, nameFile, countfile, outputDir;
57 bool flip, allFiles, qtrim, keepforward;
58 int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts;
59 int qWindowSize, qWindowStep, keepFirst, removeLast;
60 double qRollAverage, qThreshold, qWindowAverage, qAverage;
61 vector<string> revPrimer, outputNames;
62 set<string> filesToRemove;
63 map<string, int> barcodes;
64 vector<string> groupVector;
65 map<string, int> primers;
66 vector<string> linker;
67 vector<string> spacer;
68 map<string, int> combos;
69 map<string, int> groupToIndex;
70 vector<string> primerNameVector; //needed here?
71 vector<string> barcodeNameVector; //needed here?
72 map<string, int> groupCounts;
73 map<string, string> nameMap;
74 map<string, int> nameCount; //for countfile name -> repCount
75 map<string, string> groupMap; //for countfile name -> group
77 vector<int> processIDS; //processid
78 vector<linePair> lines;
79 vector<linePair> qLines;
81 int driverCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair, linePair);
82 int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >);
83 int setLines(string, string);
86 /**************************************************************************************************/
87 //custom data structure for threads to use.
88 // This is passed by void pointer so it can be any data type
89 // that can be passed using a single void pointer (LPVOID).
91 unsigned long long start, end;
93 string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, trimCFileName, scrapCFileName, groupFileName, nameFile, countfile;
94 vector<vector<string> > fastaFileNames;
95 vector<vector<string> > qualFileNames;
96 vector<vector<string> > nameFileNames;
97 unsigned long long lineStart, lineEnd, qlineStart, qlineEnd;
98 bool flip, allFiles, qtrim, keepforward, createGroup;
99 int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
100 int qWindowSize, qWindowStep, keepFirst, removeLast, count;
101 double qRollAverage, qThreshold, qWindowAverage, qAverage;
102 vector<string> revPrimer;
103 map<string, int> barcodes;
104 map<string, int> primers;
105 map<string, int> nameCount;
106 vector<string> linker;
107 vector<string> spacer;
108 map<string, int> combos;
109 vector<string> primerNameVector;
110 vector<string> barcodeNameVector;
111 map<string, int> groupCounts;
112 map<string, string> nameMap;
113 map<string, string> groupMap;
116 trimData(string fn, string qn, string nf, string cf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string tcn, string scn,string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout,
117 int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, vector<string> revP, vector<string> li, vector<string> spa,
118 vector<string> priNameVector, vector<string> barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
119 int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage,
120 int minL, int maxA, int maxH, int maxL, bool fli, map<string, string> nm, map<string, int> ncount) {
128 scrapQFileName = sqn;
130 scrapNFileName = snn;
132 scrapCFileName = scn;
134 fastaFileNames = ffn;
150 primers = pri; numFPrimers = primers.size();
151 revPrimer = revP; numRPrimers = revPrimer.size();
152 linker = li; numLinkers = linker.size();
153 spacer = spa; numSpacers = spacer.size();
154 primerNameVector = priNameVector;
155 barcodeNameVector = barNameVector;
157 createGroup = cGroup;
161 removeLast = removeL;
162 qWindowStep = WindowStep;
163 qWindowSize = WindowSize;
164 qWindowAverage = WindowAverage;
166 qThreshold = Threshold;
168 qRollAverage = RollAverage;
178 /**************************************************************************************************/
179 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
181 static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
182 trimData* pDataArray;
183 pDataArray = (trimData*)lpParam;
186 ofstream trimFASTAFile;
187 pDataArray->m->openOutputFile(pDataArray->trimFileName, trimFASTAFile);
189 ofstream scrapFASTAFile;
190 pDataArray->m->openOutputFile(pDataArray->scrapFileName, scrapFASTAFile);
192 ofstream trimQualFile;
193 ofstream scrapQualFile;
194 if(pDataArray->qFileName != ""){
195 pDataArray->m->openOutputFile(pDataArray->trimQFileName, trimQualFile);
196 pDataArray->m->openOutputFile(pDataArray->scrapQFileName, scrapQualFile);
199 ofstream trimNameFile;
200 ofstream scrapNameFile;
201 if(pDataArray->nameFile != ""){
202 pDataArray->m->openOutputFile(pDataArray->trimNFileName, trimNameFile);
203 pDataArray->m->openOutputFile(pDataArray->scrapNFileName, scrapNameFile);
207 ofstream outGroupsFile;
208 if ((pDataArray->createGroup) && (pDataArray->countfile == "")){ pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile); }
209 if(pDataArray->allFiles){
210 for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
211 for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
212 if (pDataArray->fastaFileNames[i][j] != "") {
214 pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp); temp.close();
215 if(pDataArray->qFileName != ""){
216 pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp); temp.close();
219 if(pDataArray->nameFile != ""){
220 pDataArray->m->openOutputFile(pDataArray->nameFileNames[i][j], temp); temp.close();
227 ofstream trimCountFile;
228 ofstream scrapCountFile;
229 if(pDataArray->countfile != ""){
230 pDataArray->m->openOutputFile(pDataArray->trimCFileName, trimCountFile);
231 pDataArray->m->openOutputFile(pDataArray->scrapCFileName, scrapCountFile);
232 if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) { trimCountFile << "Representative_Sequence\ttotal" << endl; scrapCountFile << "Representative_Sequence\ttotal" << endl; }
236 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
237 if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) {
239 }else { //this accounts for the difference in line endings.
240 inFASTA.seekg(pDataArray->lineStart-1); pDataArray->m->gobble(inFASTA);
244 if(pDataArray->qFileName != "") {
245 pDataArray->m->openInputFile(pDataArray->qFileName, qFile);
246 if ((pDataArray->qlineStart == 0) || (pDataArray->qlineStart == 1)) {
248 }else { //this accounts for the difference in line endings.
249 qFile.seekg(pDataArray->qlineStart-1); pDataArray->m->gobble(qFile);
254 TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
256 pDataArray->count = pDataArray->lineEnd;
257 for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process
259 if (pDataArray->m->control_pressed) {
260 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
261 if ((pDataArray->createGroup) && (pDataArray->countfile == "")) { outGroupsFile.close(); }
262 if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
263 if(pDataArray->nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
264 if(pDataArray->countfile != "") { scrapCountFile.close(); trimCountFile.close(); }
266 if(pDataArray->qFileName != ""){ qFile.close(); }
271 string trashCode = "";
272 int currentSeqsDiffs = 0;
274 Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
276 QualityScores currQual;
277 if(pDataArray->qFileName != ""){
278 currQual = QualityScores(qFile); pDataArray->m->gobble(qFile);
281 string origSeq = currSeq.getUnaligned();
284 int barcodeIndex = 0;
287 if(pDataArray->numLinkers != 0){
288 success = trimOligos.stripLinker(currSeq, currQual);
289 if(success > pDataArray->ldiffs) { trashCode += 'k'; }
290 else{ currentSeqsDiffs += success; }
293 if(pDataArray->barcodes.size() != 0){
294 success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
295 if(success > pDataArray->bdiffs) { trashCode += 'b'; }
296 else{ currentSeqsDiffs += success; }
299 if(pDataArray->numSpacers != 0){
300 success = trimOligos.stripSpacer(currSeq, currQual);
301 if(success > pDataArray->sdiffs) { trashCode += 's'; }
302 else{ currentSeqsDiffs += success; }
306 if(pDataArray->numFPrimers != 0){
307 success = trimOligos.stripForward(currSeq, currQual, primerIndex, pDataArray->keepforward);
308 if(success > pDataArray->pdiffs) { trashCode += 'f'; }
309 else{ currentSeqsDiffs += success; }
312 if (currentSeqsDiffs > pDataArray->tdiffs) { trashCode += 't'; }
314 if(pDataArray->numRPrimers != 0){
315 success = trimOligos.stripReverse(currSeq, currQual);
316 if(!success) { trashCode += 'r'; }
319 if(pDataArray->keepFirst != 0){
320 //success = keepFirstTrim(currSeq, currQual);
322 if(currQual.getName() != ""){
323 currQual.trimQScores(-1, pDataArray->keepFirst);
325 currSeq.trim(pDataArray->keepFirst);
328 if(pDataArray->removeLast != 0){
329 //success = removeLastTrim(currSeq, currQual);
331 int length = currSeq.getNumBases() - pDataArray->removeLast;
334 if(currQual.getName() != ""){
335 currQual.trimQScores(-1, length);
337 currSeq.trim(length);
342 if(!success) { trashCode += 'l'; }
346 if(pDataArray->qFileName != ""){
347 int origLength = currSeq.getNumBases();
349 if(pDataArray->qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, pDataArray->qThreshold); }
350 else if(pDataArray->qAverage != 0) { success = currQual.cullQualAverage(currSeq, pDataArray->qAverage); }
351 else if(pDataArray->qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, pDataArray->qRollAverage); }
352 else if(pDataArray->qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, pDataArray->qWindowStep, pDataArray->qWindowSize, pDataArray->qWindowAverage); }
353 else { success = 1; }
355 //you don't want to trim, if it fails above then scrap it
356 if ((!pDataArray->qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
358 if(!success) { trashCode += 'q'; }
361 if(pDataArray->minLength > 0 || pDataArray->maxLength > 0){
362 //success = cullLength(currSeq);
363 int length = currSeq.getNumBases();
364 success = 0; //guilty until proven innocent
365 if(length >= pDataArray->minLength && pDataArray->maxLength == 0) { success = 1; }
366 else if(length >= pDataArray->minLength && length <= pDataArray->maxLength) { success = 1; }
367 else { success = 0; }
369 if(!success) { trashCode += 'l'; }
371 if(pDataArray->maxHomoP > 0){
372 //success = cullHomoP(currSeq);
373 int longHomoP = currSeq.getLongHomoPolymer();
374 success = 0; //guilty until proven innocent
375 if(longHomoP <= pDataArray->maxHomoP){ success = 1; }
376 else { success = 0; }
378 if(!success) { trashCode += 'h'; }
380 if(pDataArray->maxAmbig != -1){
381 //success = cullAmbigs(currSeq);
382 int numNs = currSeq.getAmbigBases();
383 success = 0; //guilty until proven innocent
384 if(numNs <= pDataArray->maxAmbig) { success = 1; }
385 else { success = 0; }
386 if(!success) { trashCode += 'n'; }
389 if(pDataArray->flip){ // should go last
390 currSeq.reverseComplement();
391 if(pDataArray->qFileName != ""){
392 currQual.flipQScores();
396 if(trashCode.length() == 0){
397 currSeq.setAligned(currSeq.getUnaligned());
398 currSeq.printSequence(trimFASTAFile);
400 if(pDataArray->qFileName != ""){
401 currQual.printQScores(trimQualFile);
404 if(pDataArray->nameFile != ""){
405 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
406 if (itName != pDataArray->nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
407 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
410 int numRedundants = 0;
411 if (pDataArray->countfile != "") {
412 map<string, int>::iterator itCount = pDataArray->nameCount.find(currSeq.getName());
413 if (itCount != pDataArray->nameCount.end()) {
414 trimCountFile << itCount->first << '\t' << itCount->second << endl;
415 numRedundants = itCount->second-1;
416 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
419 if (pDataArray->createGroup) {
420 if(pDataArray->barcodes.size() != 0){
421 string thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
422 if (pDataArray->primers.size() != 0) {
423 if (pDataArray->primerNameVector[primerIndex] != "") {
424 if(thisGroup != "") {
425 thisGroup += "." + pDataArray->primerNameVector[primerIndex];
427 thisGroup = pDataArray->primerNameVector[primerIndex];
432 if (pDataArray->countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; }
433 else { pDataArray->groupMap[currSeq.getName()] = thisGroup; }
435 if (pDataArray->nameFile != "") {
436 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
437 if (itName != pDataArray->nameMap.end()) {
438 vector<string> thisSeqsNames;
439 pDataArray->m->splitAtChar(itName->second, thisSeqsNames, ',');
440 numRedundants = thisSeqsNames.size()-1; //we already include ourselves below
441 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
442 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
444 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
447 map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
448 if (it == pDataArray->groupCounts.end()) { pDataArray->groupCounts[thisGroup] = 1 + numRedundants; }
449 else { pDataArray->groupCounts[it->first] += (1 + numRedundants); }
454 if(pDataArray->allFiles){
456 pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
457 currSeq.printSequence(output);
460 if(pDataArray->qFileName != ""){
461 pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
462 currQual.printQScores(output);
466 if(pDataArray->nameFile != ""){
467 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
468 if (itName != pDataArray->nameMap.end()) {
469 pDataArray->m->openOutputFileAppend(pDataArray->nameFileNames[barcodeIndex][primerIndex], output);
470 output << itName->first << '\t' << itName->second << endl;
472 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
477 if(pDataArray->nameFile != ""){ //needs to be before the currSeq name is changed
478 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
479 if (itName != pDataArray->nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
480 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
482 if (pDataArray->countfile != "") {
483 map<string, int>::iterator itCount = pDataArray->nameCount.find(currSeq.getName());
484 if (itCount != pDataArray->nameCount.end()) {
485 trimCountFile << itCount->first << '\t' << itCount->second << endl;
486 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
488 currSeq.setName(currSeq.getName() + '|' + trashCode);
489 currSeq.setUnaligned(origSeq);
490 currSeq.setAligned(origSeq);
491 currSeq.printSequence(scrapFASTAFile);
492 if(pDataArray->qFileName != ""){
493 currQual.printQScores(scrapQualFile);
500 if((i) % 1000 == 0){ pDataArray->m->mothurOut(toString(i)); pDataArray->m->mothurOutEndLine(); }
504 if((pDataArray->count) % 1000 != 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
508 trimFASTAFile.close();
509 scrapFASTAFile.close();
510 if (pDataArray->createGroup) { outGroupsFile.close(); }
511 if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
512 if(pDataArray->nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
517 catch(exception& e) {
518 pDataArray->m->errorOut(e, "TrimSeqsCommand", "MyTrimThreadFunction");
525 /**************************************************************************************************/