1 #ifndef TRIMSEQSCOMMAND_H
2 #define TRIMSEQSCOMMAND_H
8 * Created by Pat Schloss on 6/6/09.
9 * Copyright 2009 Patrick D. Schloss. All rights reserved.
14 #include "command.hpp"
15 #include "sequence.hpp"
16 #include "qualityscores.h"
17 #include "trimoligos.h"
18 #include "counttable.h"
22 class TrimSeqsCommand : public Command {
24 TrimSeqsCommand(string);
28 vector<string> setParameters();
29 string getCommandName() { return "trim.seqs"; }
30 string getCommandCategory() { return "Sequence Processing"; }
32 string getHelpString();
33 string getOutputPattern(string);
34 string getCitation() { return "http://www.mothur.org/wiki/Trim.seqs"; }
35 string getDescription() { return "provides the preprocessing features needed to screen and sort pyrosequences"; }
38 void help() { m->mothurOut(getHelpString()); }
42 unsigned long long start;
43 unsigned long long end;
44 linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
49 bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&, map<string, string>&);
50 bool keepFirstTrim(Sequence&, QualityScores&);
51 bool removeLastTrim(Sequence&, QualityScores&);
52 bool cullLength(Sequence&);
53 bool cullHomoP(Sequence&);
54 bool cullAmbigs(Sequence&);
55 bool abort, createGroup;
56 string fastaFile, oligoFile, qFileName, groupfile, nameFile, countfile, outputDir;
58 bool flip, allFiles, qtrim, keepforward, pairedOligos, reorient, logtransform;
59 int numBarcodes, numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts;
60 int qWindowSize, qWindowStep, keepFirst, removeLast;
61 double qRollAverage, qThreshold, qWindowAverage, qAverage;
62 set<string> filesToRemove;
63 vector<string> groupVector,outputNames;
64 map<string, int> groupCounts;
65 map<string, string> nameMap;
66 map<string, int> nameCount; //for countfile name -> repCount
67 map<string, string> groupMap; //for countfile name -> group
69 vector<int> processIDS; //processid
70 vector<linePair> lines;
71 vector<linePair> qLines;
73 int driverCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair, linePair);
74 int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >);
75 int setLines(string, string);
78 /**************************************************************************************************/
79 //custom data structure for threads to use.
80 // This is passed by void pointer so it can be any data type
81 // that can be passed using a single void pointer (LPVOID).
83 unsigned long long start, end;
85 string filename, qFileName, oligosfile, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, trimCFileName, scrapCFileName, groupFileName, nameFile, countfile;
86 vector<vector<string> > fastaFileNames;
87 vector<vector<string> > qualFileNames;
88 vector<vector<string> > nameFileNames;
89 unsigned long long lineStart, lineEnd, qlineStart, qlineEnd;
90 bool flip, allFiles, qtrim, keepforward, createGroup, reorient, logtransform;
91 int maxAmbig, maxHomoP, minLength, maxLength, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
92 int qWindowSize, qWindowStep, keepFirst, removeLast, count;
93 double qRollAverage, qThreshold, qWindowAverage, qAverage;
94 map<string, int> nameCount;
95 map<string, int> groupCounts;
96 map<string, string> nameMap;
97 map<string, string> groupMap;
100 trimData(string fn, string qn, string nf, string cf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string tcn, string scn,string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout,
101 int pd, int bd, int ld, int sd, int td, string o, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
102 int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage, bool lt,
103 int minL, int maxA, int maxH, int maxL, bool fli, bool reo, map<string, string> nm, map<string, int> ncount) {
111 scrapQFileName = sqn;
113 scrapNFileName = snn;
115 scrapCFileName = scn;
117 fastaFileNames = ffn;
134 createGroup = cGroup;
138 removeLast = removeL;
139 qWindowStep = WindowStep;
140 qWindowSize = WindowSize;
141 qWindowAverage = WindowAverage;
143 qThreshold = Threshold;
145 qRollAverage = RollAverage;
157 /**************************************************************************************************/
158 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
160 static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
161 trimData* pDataArray;
162 pDataArray = (trimData*)lpParam;
165 ofstream trimFASTAFile;
166 pDataArray->m->openOutputFile(pDataArray->trimFileName, trimFASTAFile);
168 ofstream scrapFASTAFile;
169 pDataArray->m->openOutputFile(pDataArray->scrapFileName, scrapFASTAFile);
171 ofstream trimQualFile;
172 ofstream scrapQualFile;
173 if(pDataArray->qFileName != ""){
174 pDataArray->m->openOutputFile(pDataArray->trimQFileName, trimQualFile);
175 pDataArray->m->openOutputFile(pDataArray->scrapQFileName, scrapQualFile);
178 ofstream trimNameFile;
179 ofstream scrapNameFile;
180 if(pDataArray->nameFile != ""){
181 pDataArray->m->openOutputFile(pDataArray->trimNFileName, trimNameFile);
182 pDataArray->m->openOutputFile(pDataArray->scrapNFileName, scrapNameFile);
186 ofstream outGroupsFile;
187 if ((pDataArray->createGroup) && (pDataArray->countfile == "")){ pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile); }
188 if(pDataArray->allFiles){
189 for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
190 for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
191 if (pDataArray->fastaFileNames[i][j] != "") {
193 pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp); temp.close();
194 if(pDataArray->qFileName != ""){
195 pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp); temp.close();
198 if(pDataArray->nameFile != ""){
199 pDataArray->m->openOutputFile(pDataArray->nameFileNames[i][j], temp); temp.close();
206 ofstream trimCountFile;
207 ofstream scrapCountFile;
208 if(pDataArray->countfile != ""){
209 pDataArray->m->openOutputFile(pDataArray->trimCFileName, trimCountFile);
210 pDataArray->m->openOutputFile(pDataArray->scrapCFileName, scrapCountFile);
211 if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) { trimCountFile << "Representative_Sequence\ttotal" << endl; scrapCountFile << "Representative_Sequence\ttotal" << endl; }
215 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
216 if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) {
218 }else { //this accounts for the difference in line endings.
219 inFASTA.seekg(pDataArray->lineStart-1); pDataArray->m->gobble(inFASTA);
223 if(pDataArray->qFileName != "") {
224 pDataArray->m->openInputFile(pDataArray->qFileName, qFile);
225 if ((pDataArray->qlineStart == 0) || (pDataArray->qlineStart == 1)) {
227 }else { //this accounts for the difference in line endings.
228 qFile.seekg(pDataArray->qlineStart-1); pDataArray->m->gobble(qFile);
232 int numBarcodes, numLinkers, numSpacers, numRPrimers, numFPrimers;
234 Oligos oligos(pDataArray->oligosfile);
235 if (oligos.hasPairedBarcodes()) {
237 numFPrimers = oligos.getPairedPrimers().size();
238 numBarcodes = oligos.getPairedBarcodes().size();
240 pairedOligos = false;
241 numFPrimers = oligos.getPrimers().size();
242 numBarcodes = oligos.getBarcodes().size();
245 numLinkers = oligos.getLinkers().size();
246 numSpacers = oligos.getSpacers().size();
247 numRPrimers = oligos.getReversePrimers().size();
249 TrimOligos* trimOligos = NULL;
250 if (pairedOligos) { trimOligos = new TrimOligos(pDataArray->pdiffs, pDataArray->bdiffs, 0, 0, oligos.getPairedPrimers(), oligos.getPairedBarcodes()); }
251 else { trimOligos = new TrimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, oligos.getPrimers(), oligos.getBarcodes(), oligos.getReversePrimers(), oligos.getLinkers(), oligos.getSpacers()); }
253 TrimOligos* rtrimOligos = NULL;
254 if (pDataArray->reorient) {
255 rtrimOligos = new TrimOligos(pDataArray->pdiffs, pDataArray->bdiffs, 0, 0, oligos.getReorientedPairedPrimers(), oligos.getReorientedPairedBarcodes()); numBarcodes = oligos.getReorientedPairedBarcodes().size();
258 pDataArray->count = 0;
259 for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process
261 if (pDataArray->m->control_pressed) {
262 delete trimOligos; if (pDataArray->reorient) { delete rtrimOligos; }
263 inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
264 if ((pDataArray->createGroup) && (pDataArray->countfile == "")) { outGroupsFile.close(); }
265 if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
266 if(pDataArray->nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
267 if(pDataArray->countfile != "") { scrapCountFile.close(); trimCountFile.close(); }
269 if(pDataArray->qFileName != ""){ qFile.close(); }
274 string trashCode = "";
275 int currentSeqsDiffs = 0;
277 Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
278 Sequence savedSeq(currSeq.getName(), currSeq.getAligned());
280 QualityScores currQual; QualityScores savedQual;
281 if(pDataArray->qFileName != ""){
282 currQual = QualityScores(qFile); pDataArray->m->gobble(qFile);
283 savedQual.setName(currQual.getName()); savedQual.setScores(currQual.getScores());
287 string origSeq = currSeq.getUnaligned();
291 int barcodeIndex = 0;
295 success = trimOligos->stripLinker(currSeq, currQual);
296 if(success > pDataArray->ldiffs) { trashCode += 'k'; }
297 else{ currentSeqsDiffs += success; }
300 if(numBarcodes != 0){
301 success = trimOligos->stripBarcode(currSeq, currQual, barcodeIndex);
302 if(success > pDataArray->bdiffs) { trashCode += 'b'; }
303 else{ currentSeqsDiffs += success; }
307 success = trimOligos->stripSpacer(currSeq, currQual);
308 if(success > pDataArray->sdiffs) { trashCode += 's'; }
309 else{ currentSeqsDiffs += success; }
313 if(numFPrimers != 0){
314 success = trimOligos->stripForward(currSeq, currQual, primerIndex, pDataArray->keepforward);
315 if(success > pDataArray->pdiffs) { trashCode += 'f'; }
316 else{ currentSeqsDiffs += success; }
319 if (currentSeqsDiffs > pDataArray->tdiffs) { trashCode += 't'; }
321 if(numRPrimers != 0){
322 success = trimOligos->stripReverse(currSeq, currQual);
323 if(!success) { trashCode += 'r'; }
326 if (pDataArray->reorient && (trashCode != "")) { //if you failed and want to check the reverse
328 string thisTrashCode = "";
329 int thisCurrentSeqsDiffs = 0;
331 int thisBarcodeIndex = 0;
332 int thisPrimerIndex = 0;
334 if(numBarcodes != 0){
335 thisSuccess = rtrimOligos->stripBarcode(savedSeq, savedQual, thisBarcodeIndex);
336 if(thisSuccess > pDataArray->bdiffs) { thisTrashCode += 'b'; }
337 else{ thisCurrentSeqsDiffs += thisSuccess; }
340 if(numFPrimers != 0){
341 thisSuccess = rtrimOligos->stripForward(savedSeq, savedQual, thisPrimerIndex, pDataArray->keepforward);
342 if(thisSuccess > pDataArray->pdiffs) { thisTrashCode += 'f'; }
343 else{ thisCurrentSeqsDiffs += thisSuccess; }
346 if (thisCurrentSeqsDiffs > pDataArray->tdiffs) { thisTrashCode += 't'; }
348 if (thisTrashCode == "") {
349 trashCode = thisTrashCode;
350 success = thisSuccess;
351 currentSeqsDiffs = thisCurrentSeqsDiffs;
352 barcodeIndex = thisBarcodeIndex;
353 primerIndex = thisPrimerIndex;
354 savedSeq.reverseComplement();
355 currSeq.setAligned(savedSeq.getAligned());
356 if(pDataArray->qFileName != ""){
357 savedQual.flipQScores();
358 currQual.setScores(savedQual.getScores());
360 }else { trashCode += "(" + thisTrashCode + ")"; }
364 if(pDataArray->keepFirst != 0){
365 //success = keepFirstTrim(currSeq, currQual);
367 if(currQual.getName() != ""){
368 currQual.trimQScores(-1, pDataArray->keepFirst);
370 currSeq.trim(pDataArray->keepFirst);
373 if(pDataArray->removeLast != 0){
374 //success = removeLastTrim(currSeq, currQual);
376 int length = currSeq.getNumBases() - pDataArray->removeLast;
379 if(currQual.getName() != ""){
380 currQual.trimQScores(-1, length);
382 currSeq.trim(length);
387 if(!success) { trashCode += 'l'; }
391 if(pDataArray->qFileName != ""){
392 int origLength = currSeq.getNumBases();
394 if(pDataArray->qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, pDataArray->qThreshold); }
395 else if(pDataArray->qAverage != 0) { success = currQual.cullQualAverage(currSeq, pDataArray->qAverage, pDataArray->logtransform); }
396 else if(pDataArray->qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, pDataArray->qRollAverage, pDataArray->logtransform); }
397 else if(pDataArray->qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, pDataArray->qWindowStep, pDataArray->qWindowSize, pDataArray->qWindowAverage, pDataArray->logtransform); }
398 else { success = 1; }
400 //you don't want to trim, if it fails above then scrap it
401 if ((!pDataArray->qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
403 if(!success) { trashCode += 'q'; }
406 if(pDataArray->minLength > 0 || pDataArray->maxLength > 0){
407 //success = cullLength(currSeq);
408 int length = currSeq.getNumBases();
409 success = 0; //guilty until proven innocent
410 if(length >= pDataArray->minLength && pDataArray->maxLength == 0) { success = 1; }
411 else if(length >= pDataArray->minLength && length <= pDataArray->maxLength) { success = 1; }
412 else { success = 0; }
414 if(!success) { trashCode += 'l'; }
416 if(pDataArray->maxHomoP > 0){
417 //success = cullHomoP(currSeq);
418 int longHomoP = currSeq.getLongHomoPolymer();
419 success = 0; //guilty until proven innocent
420 if(longHomoP <= pDataArray->maxHomoP){ success = 1; }
421 else { success = 0; }
423 if(!success) { trashCode += 'h'; }
425 if(pDataArray->maxAmbig != -1){
426 //success = cullAmbigs(currSeq);
427 int numNs = currSeq.getAmbigBases();
428 success = 0; //guilty until proven innocent
429 if(numNs <= pDataArray->maxAmbig) { success = 1; }
430 else { success = 0; }
431 if(!success) { trashCode += 'n'; }
434 if(pDataArray->flip){ // should go last
435 currSeq.reverseComplement();
436 if(pDataArray->qFileName != ""){
437 currQual.flipQScores();
441 if(trashCode.length() == 0){
442 string thisGroup = "";
443 if (pDataArray->createGroup) { thisGroup = oligos.getGroupName(barcodeIndex, primerIndex); }
445 int pos = thisGroup.find("ignore");
446 if (pos == string::npos) {
448 currSeq.setAligned(currSeq.getUnaligned());
449 currSeq.printSequence(trimFASTAFile);
451 if(pDataArray->qFileName != ""){
452 currQual.printQScores(trimQualFile);
455 if(pDataArray->nameFile != ""){
456 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
457 if (itName != pDataArray->nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
458 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
461 int numRedundants = 0;
462 if (pDataArray->countfile != "") {
463 map<string, int>::iterator itCount = pDataArray->nameCount.find(currSeq.getName());
464 if (itCount != pDataArray->nameCount.end()) {
465 trimCountFile << itCount->first << '\t' << itCount->second << endl;
466 numRedundants = itCount->second-1;
467 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
470 if (pDataArray->createGroup) {
471 if(numBarcodes != 0){
473 if (pDataArray->countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; }
474 else { pDataArray->groupMap[currSeq.getName()] = thisGroup; }
476 if (pDataArray->nameFile != "") {
477 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
478 if (itName != pDataArray->nameMap.end()) {
479 vector<string> thisSeqsNames;
480 pDataArray->m->splitAtChar(itName->second, thisSeqsNames, ',');
481 numRedundants = thisSeqsNames.size()-1; //we already include ourselves below
482 for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
483 outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
485 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
488 map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
489 if (it == pDataArray->groupCounts.end()) { pDataArray->groupCounts[thisGroup] = 1 + numRedundants; }
490 else { pDataArray->groupCounts[it->first] += (1 + numRedundants); }
495 if(pDataArray->allFiles){
497 pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
498 currSeq.printSequence(output);
501 if(pDataArray->qFileName != ""){
502 pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
503 currQual.printQScores(output);
507 if(pDataArray->nameFile != ""){
508 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
509 if (itName != pDataArray->nameMap.end()) {
510 pDataArray->m->openOutputFileAppend(pDataArray->nameFileNames[barcodeIndex][primerIndex], output);
511 output << itName->first << '\t' << itName->second << endl;
513 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
519 if(pDataArray->nameFile != ""){ //needs to be before the currSeq name is changed
520 map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
521 if (itName != pDataArray->nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
522 else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
524 if (pDataArray->countfile != "") {
525 map<string, int>::iterator itCount = pDataArray->nameCount.find(currSeq.getName());
526 if (itCount != pDataArray->nameCount.end()) {
527 trimCountFile << itCount->first << '\t' << itCount->second << endl;
528 }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
530 currSeq.setName(currSeq.getName() + '|' + trashCode);
531 currSeq.setUnaligned(origSeq);
532 currSeq.setAligned(origSeq);
533 currSeq.printSequence(scrapFASTAFile);
534 if(pDataArray->qFileName != ""){
535 currQual.printQScores(scrapQualFile);
542 if((pDataArray->count) % 1000 == 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
546 if((pDataArray->count) % 1000 != 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
548 if (pDataArray->reorient) { delete rtrimOligos; }
551 trimFASTAFile.close();
552 scrapFASTAFile.close();
553 if (pDataArray->createGroup) { outGroupsFile.close(); }
554 if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
555 if(pDataArray->nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
560 catch(exception& e) {
561 pDataArray->m->errorOut(e, "TrimSeqsCommand", "MyTrimThreadFunction");
568 /**************************************************************************************************/