1 #ifndef CHIMERASLAYERCOMMAND_H
2 #define CHIMERASLAYERCOMMAND_H
5 * chimeraslayercommand.h
8 * Created by westcott on 3/31/10.
9 * Copyright 2010 Schloss Lab. All rights reserved.
14 #include "command.hpp"
16 #include "chimeraslayer.h"
17 #include "sequenceparser.h"
18 #include "sequencecountparser.h"
20 /***********************************************************/
22 class ChimeraSlayerCommand : public Command {
24 ChimeraSlayerCommand(string);
25 ChimeraSlayerCommand();
26 ~ChimeraSlayerCommand() {}
28 vector<string> setParameters();
29 string getCommandName() { return "chimera.slayer"; }
30 string getCommandCategory() { return "Sequence Processing"; }
32 string getHelpString();
33 string getOutputPattern(string);
34 string getCitation() { return "Haas BJ, Gevers D, Earl A, Feldgarden M, Ward DV, Giannokous G, Ciulla D, Tabbaa D, Highlander SK, Sodergren E, Methe B, Desantis TZ, Petrosino JF, Knight R, Birren BW (2011). Chimeric 16S rRNA sequence formation and detection in Sanger and 454-pyrosequenced PCR amplicons. Genome Res 21:494.\nhttp://www.mothur.org/wiki/Chimera.slayer"; }
35 string getDescription() { return "detect chimeric sequences"; }
38 void help() { m->mothurOut(getHelpString()); }
43 unsigned long long start;
44 unsigned long long end;
45 linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
48 vector<int> processIDS; //processid
49 vector<linePair> lines;
51 int driver(linePair, string, string, string, string, map<string, int>&);
52 int createProcesses(string, string, string, string, map<string, int>&);
53 int divideInHalf(Sequence, string&, string&);
54 map<string, int> sortFastaFile(string, string);
55 map<string, int> sortFastaFile(vector<Sequence>&, map<string, string>&, string newFile);
56 int sortFastaFile(vector<Sequence>&, map<string, int>&, string newFile);
57 string getNamesFile(string&);
58 //int setupChimera(string,);
59 int MPIExecute(string, string, string, string, map<string, int>&);
60 int deconvoluteResults(map<string, string>&, string, string, string);
61 map<string, int> priority;
62 int setUpForSelfReference(SequenceParser*&, map<string, string>&, map<string, map<string, int> >&, int);
63 int setUpForSelfReference(SequenceCountParser*&, map<string, string>&, map<string, map<string, int> >&, int);
64 int driverGroups(string, string, string, map<string, map<string, int> >&, map<string, string>&, string);
65 int createProcessesGroups(string, string, string, map<string, map<string, int> >&, map<string, string>&, string, string);
66 int MPIExecuteGroups(string, string, string, map<string, map<string, int> >&, map<string, string>&, string, string);
70 int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, set<string>&, vector<unsigned long long>&, string, map<string, int>&, bool);
73 bool abort, realign, trim, trimera, save, hasName, hasCount, dups;
74 string fastafile, groupfile, templatefile, outputDir, search, namefile, countfile, blastlocation;
75 int processors, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity, minCoverage, minBS, minSNP, numSeqs, templateSeqsLength;
78 map<string, map<string, string> > group2NameMap;
79 vector<string> outputNames;
80 vector<string> fastaFileNames;
81 vector<string> nameFileNames;
82 vector<string> groupFileNames;
86 /***********************************************************/
88 //custom data structure for threads to use.
89 // This is passed by void pointer so it can be any data type
90 // that can be passed using a single void pointer (LPVOID).
95 string filename, countlist;
100 bool trim, realign, dups, hasCount;
101 unsigned long long start;
102 unsigned long long end;
103 int ksize, match, mismatch, window, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted;
106 map<string, int> priority;
110 map<string, map<string, int> > fileToPriority;
111 map<string, string> fileGroup;
112 map<string, map<string, string> > group2NameMap;
115 slayerData(string o, string fa, string ac, string f, string te, string se, string bl, bool tri, bool trm, bool re, MothurOut* mout, unsigned long long st, unsigned long long en, int ks, int ma, int mis, int win, int minS, int minC, int miBS, int minSN, int par, int it, int inc, int numw, float div, map<string, int> prior, int tid) {
133 minSimilarity = minS;
147 slayerData(map<string, map<string, string> > g2n, bool hc, bool dps, string cl, string o, string fa, string ac, string te, string se, string bl, bool tri, bool trm, bool re, MothurOut* mout, map<string, map<string, int> >& fPriority, map<string, string>& fileG, int ks, int ma, int mis, int win, int minS, int minC, int miBS, int minSN, int par, int it, int inc, int numw, float div, map<string, int> prior, int tid) {
163 fileToPriority = fPriority;
168 minSimilarity = minS;
185 /**************************************************************************************************/
186 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
188 static DWORD WINAPI MySlayerThreadFunction(LPVOID lpParam){
189 slayerData* pDataArray;
190 pDataArray = (slayerData*)lpParam;
194 pDataArray->m->openOutputFile(pDataArray->outputFName, out);
197 pDataArray->m->openOutputFile(pDataArray->accnos, out2);
200 if (pDataArray->trim) { pDataArray->m->openOutputFile(pDataArray->fasta, out3); }
203 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
208 if (pDataArray->templatefile != "self") { //you want to run slayer with a reference template
209 chimera = new ChimeraSlayer(pDataArray->filename, pDataArray->templatefile, pDataArray->trim, pDataArray->search, pDataArray->ksize, pDataArray->match, pDataArray->mismatch, pDataArray->window, pDataArray->divR, pDataArray->minSimilarity, pDataArray->minCoverage, pDataArray->minBS, pDataArray->minSNP, pDataArray->parents, pDataArray->iters, pDataArray->increment, pDataArray->numwanted, pDataArray->realign, pDataArray->blastlocation, pDataArray->threadId);
211 chimera = new ChimeraSlayer(pDataArray->filename, pDataArray->templatefile, pDataArray->trim, pDataArray->priority, pDataArray->search, pDataArray->ksize, pDataArray->match, pDataArray->mismatch, pDataArray->window, pDataArray->divR, pDataArray->minSimilarity, pDataArray->minCoverage, pDataArray->minBS, pDataArray->minSNP, pDataArray->parents, pDataArray->iters, pDataArray->increment, pDataArray->numwanted, pDataArray->realign, pDataArray->blastlocation, pDataArray->threadId);
214 //print header if you are process 0
215 if ((pDataArray->start == 0) || (pDataArray->start == 1)) {
216 chimera->printHeader(out);
218 }else { //this accounts for the difference in line endings.
219 inFASTA.seekg(pDataArray->start-1); pDataArray->m->gobble(inFASTA);
222 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete chimera; return 0; }
224 if (chimera->getUnaligned()) {
225 pDataArray->m->mothurOut("Your template sequences are different lengths, please correct."); pDataArray->m->mothurOutEndLine();
226 out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close();
230 int templateSeqsLength = chimera->getLength();
232 if (pDataArray->start == 0) { chimera->printHeader(out); }
234 pDataArray->count = 0;
235 for(int i = 0; i < pDataArray->end; i++){
237 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete chimera; return 1; }
239 Sequence* candidateSeq = new Sequence(inFASTA); pDataArray->m->gobble(inFASTA);
240 string candidateAligned = candidateSeq->getAligned();
242 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
243 if (candidateSeq->getAligned().length() != templateSeqsLength) {
244 pDataArray->m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); pDataArray->m->mothurOutEndLine();
247 chimera->getChimeras(candidateSeq);
249 if (pDataArray->m->control_pressed) { delete candidateSeq; delete chimera; return 1; }
251 //if you are not chimeric, then check each half
252 data_results wholeResults = chimera->getResults();
254 //determine if we need to split
255 bool isChimeric = false;
257 if (wholeResults.flag == "yes") {
258 string chimeraFlag = "no";
259 if( (wholeResults.results[0].bsa >= pDataArray->minBS && wholeResults.results[0].divr_qla_qrb >= pDataArray->divR)
261 (wholeResults.results[0].bsb >= pDataArray->minBS && wholeResults.results[0].divr_qlb_qra >= pDataArray->divR) ) { chimeraFlag = "yes"; }
264 if (chimeraFlag == "yes") {
265 if ((wholeResults.results[0].bsa >= pDataArray->minBS) || (wholeResults.results[0].bsb >= pDataArray->minBS)) { isChimeric = true; }
269 if ((!isChimeric) && pDataArray->trimera) {
271 //split sequence in half by bases
272 string leftQuery, rightQuery;
273 Sequence tempSeq(candidateSeq->getName(), candidateAligned);
274 //divideInHalf(tempSeq, leftQuery, rightQuery);
275 string queryUnAligned = tempSeq.getUnaligned();
276 int numBases = int(queryUnAligned.length() * 0.5);
278 string queryAligned = tempSeq.getAligned();
279 leftQuery = tempSeq.getAligned();
280 rightQuery = tempSeq.getAligned();
284 for (int i = 0; i < queryAligned.length(); i++) {
286 if (isalpha(queryAligned[i])) {
291 if (baseCount >= numBases) { leftSpot = i; break; } //first half
294 //blank out right side
295 for (int i = leftSpot; i < leftQuery.length(); i++) { leftQuery[i] = '.'; }
297 //blank out left side
298 for (int i = 0; i < leftSpot; i++) { rightQuery[i] = '.'; }
300 //run chimeraSlayer on each piece
301 Sequence* left = new Sequence(candidateSeq->getName(), leftQuery);
302 Sequence* right = new Sequence(candidateSeq->getName(), rightQuery);
305 chimera->getChimeras(left);
306 data_results leftResults = chimera->getResults();
308 chimera->getChimeras(right);
309 data_results rightResults = chimera->getResults();
311 //if either piece is chimeric then report
312 Sequence trimmed = chimera->print(out, out2, leftResults, rightResults);
313 if (pDataArray->trim) { trimmed.printSequence(out3); }
315 delete left; delete right;
317 }else { //already chimeric
319 Sequence trimmed = chimera->print(out, out2);
320 if (pDataArray->trim) { trimmed.printSequence(out3); }
330 if((pDataArray->count) % 100 == 0){ pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(pDataArray->count) +"\n"); }
333 if((pDataArray->count) % 100 != 0){ pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(pDataArray->count)+"\n"); }
335 pDataArray->numNoParents = chimera->getNumNoParents();
336 if (pDataArray->numNoParents == pDataArray->count) { pDataArray->m->mothurOut("[WARNING]: megablast returned 0 potential parents for all your sequences. This could be due to formatdb.exe not being setup properly, please check formatdb.log for errors.\n"); }
340 if (pDataArray->trim) { out3.close(); }
347 catch(exception& e) {
348 pDataArray->m->errorOut(e, "ChimeraSlayerCommand", "MySlayerThreadFunction");
353 /**************************************************************************************************/
355 static DWORD WINAPI MySlayerGroupThreadFunction(LPVOID lpParam){
356 slayerData* pDataArray;
357 pDataArray = (slayerData*)lpParam;
360 ofstream outCountList;
361 if (pDataArray->hasCount && pDataArray->dups) { pDataArray->m->openOutputFile(pDataArray->countlist, outCountList); }
366 for (map<string, map<string, int> >::iterator itFile = pDataArray->fileToPriority.begin(); itFile != pDataArray->fileToPriority.end(); itFile++) {
368 if (pDataArray->m->control_pressed) { return 0; }
370 int start = time(NULL);
371 string thisFastaName = itFile->first;
372 map<string, int> thisPriority = itFile->second;
373 string thisoutputFileName = pDataArray->m->getRootName(pDataArray->m->getSimpleName(thisFastaName)) + pDataArray->fileGroup[thisFastaName] + "slayer.chimera";
374 string thisaccnosFileName = pDataArray->m->getRootName(pDataArray->m->getSimpleName(thisFastaName)) + pDataArray->fileGroup[thisFastaName] + "slayer.accnos";
375 string thistrimFastaFileName = pDataArray->m->getRootName(pDataArray->m->getSimpleName(thisFastaName)) + pDataArray->fileGroup[thisFastaName] + "slayer.fasta";
377 pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Checking sequences from group: " + pDataArray->fileGroup[thisFastaName] + "."); pDataArray->m->mothurOutEndLine();
379 //int numSeqs = driver(lines[0], thisoutputFileName, thisFastaName, thisaccnosFileName, thistrimFastaFileName, thisPriority);
380 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
383 pDataArray->m->openOutputFile(thisoutputFileName, out);
386 pDataArray->m->openOutputFile(thisaccnosFileName, out2);
389 if (pDataArray->trim) { pDataArray->m->openOutputFile(thistrimFastaFileName, out3); }
392 pDataArray->m->openInputFile(thisFastaName, inFASTA);
395 chimera = new ChimeraSlayer(thisFastaName, pDataArray->templatefile, pDataArray->trim, thisPriority, pDataArray->search, pDataArray->ksize, pDataArray->match, pDataArray->mismatch, pDataArray->window, pDataArray->divR, pDataArray->minSimilarity, pDataArray->minCoverage, pDataArray->minBS, pDataArray->minSNP, pDataArray->parents, pDataArray->iters, pDataArray->increment, pDataArray->numwanted, pDataArray->realign, pDataArray->blastlocation, pDataArray->threadId);
396 chimera->printHeader(out);
400 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete chimera; return 0; }
402 if (chimera->getUnaligned()) {
403 pDataArray->m->mothurOut("Your template sequences are different lengths, please correct."); pDataArray->m->mothurOutEndLine();
404 out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close();
408 int templateSeqsLength = chimera->getLength();
413 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete chimera; return 1; }
415 Sequence* candidateSeq = new Sequence(inFASTA); pDataArray->m->gobble(inFASTA);
416 string candidateAligned = candidateSeq->getAligned();
418 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
419 if (candidateSeq->getAligned().length() != templateSeqsLength) {
420 pDataArray->m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); pDataArray->m->mothurOutEndLine();
423 chimera->getChimeras(candidateSeq);
425 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete candidateSeq; delete chimera; return 1; }
427 //if you are not chimeric, then check each half
428 data_results wholeResults = chimera->getResults();
430 //determine if we need to split
431 bool isChimeric = false;
433 if (wholeResults.flag == "yes") {
434 string chimeraFlag = "no";
435 if( (wholeResults.results[0].bsa >= pDataArray->minBS && wholeResults.results[0].divr_qla_qrb >= pDataArray->divR)
437 (wholeResults.results[0].bsb >= pDataArray->minBS && wholeResults.results[0].divr_qlb_qra >= pDataArray->divR) ) { chimeraFlag = "yes"; }
440 if (chimeraFlag == "yes") {
441 if ((wholeResults.results[0].bsa >= pDataArray->minBS) || (wholeResults.results[0].bsb >= pDataArray->minBS)) { isChimeric = true; }
445 if ((!isChimeric) && pDataArray->trimera) {
447 //split sequence in half by bases
448 string leftQuery, rightQuery;
449 Sequence tempSeq(candidateSeq->getName(), candidateAligned);
450 //divideInHalf(tempSeq, leftQuery, rightQuery);
451 string queryUnAligned = tempSeq.getUnaligned();
452 int numBases = int(queryUnAligned.length() * 0.5);
454 string queryAligned = tempSeq.getAligned();
455 leftQuery = tempSeq.getAligned();
456 rightQuery = tempSeq.getAligned();
460 for (int i = 0; i < queryAligned.length(); i++) {
462 if (isalpha(queryAligned[i])) {
467 if (baseCount >= numBases) { leftSpot = i; break; } //first half
470 //blank out right side
471 for (int i = leftSpot; i < leftQuery.length(); i++) { leftQuery[i] = '.'; }
473 //blank out left side
474 for (int i = 0; i < leftSpot; i++) { rightQuery[i] = '.'; }
476 //run chimeraSlayer on each piece
477 Sequence* left = new Sequence(candidateSeq->getName(), leftQuery);
478 Sequence* right = new Sequence(candidateSeq->getName(), rightQuery);
481 chimera->getChimeras(left);
482 data_results leftResults = chimera->getResults();
484 chimera->getChimeras(right);
485 data_results rightResults = chimera->getResults();
487 //if either piece is chimeric then report
488 Sequence trimmed = chimera->print(out, out2, leftResults, rightResults);
489 if (pDataArray->trim) { trimmed.printSequence(out3); }
491 delete left; delete right;
493 }else { //already chimeric
495 Sequence trimmed = chimera->print(out, out2);
496 if (pDataArray->trim) { trimmed.printSequence(out3); }
506 if (inFASTA.eof()) { break; }
509 if((numSeqs) % 100 == 0){ pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(numSeqs)+"\n"); pDataArray->m->mothurOutEndLine(); }
512 if((numSeqs) % 100 != 0){ pDataArray->m->mothurOutJustToScreen("Processing sequence: " + toString(numSeqs)+"\n"); }
514 pDataArray->numNoParents = chimera->getNumNoParents();
515 if (pDataArray->numNoParents == numSeqs) { pDataArray->m->mothurOut("[WARNING]: megablast returned 0 potential parents for all your sequences. This could be due to formatdb.exe not being setup properly, please check formatdb.log for errors.\n"); }
519 if (pDataArray->trim) { out3.close(); }
524 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
526 //if we provided a count file with group info and set dereplicate=t, then we want to create a *.pick.count_table
527 //This table will zero out group counts for seqs determined to be chimeric by that group.
528 if (pDataArray->dups) {
529 if (!pDataArray->m->isBlank(thisaccnosFileName)) {
531 pDataArray->m->openInputFile(thisaccnosFileName, in);
533 if (pDataArray->hasCount) {
535 in >> name; pDataArray->m->gobble(in);
536 outCountList << name << '\t' << pDataArray->fileGroup[thisFastaName] << endl;
540 map<string, map<string, string> >::iterator itGroupNameMap = pDataArray->group2NameMap.find(pDataArray->fileGroup[thisFastaName]);
541 if (itGroupNameMap != pDataArray->group2NameMap.end()) {
542 map<string, string> thisnamemap = itGroupNameMap->second;
543 map<string, string>::iterator itN;
545 pDataArray->m->openOutputFile(thisaccnosFileName+".temp", out);
547 in >> name; pDataArray->m->gobble(in);
548 //pDataArray->m->mothurOut("here = " + name + '\t');
549 itN = thisnamemap.find(name);
550 if (itN != thisnamemap.end()) {
551 vector<string> tempNames; pDataArray->m->splitAtComma(itN->second, tempNames);
552 for (int j = 0; j < tempNames.size(); j++) { out << tempNames[j] << endl; }
553 //pDataArray->m->mothurOut(itN->second + '\n');
555 }else { pDataArray->m->mothurOut("[ERROR]: parsing cannot find " + name + ".\n"); pDataArray->m->control_pressed = true; }
559 pDataArray->m->renameFile(thisaccnosFileName+".temp", thisaccnosFileName);
560 }else { pDataArray->m->mothurOut("[ERROR]: parsing cannot find " + pDataArray->fileGroup[thisFastaName] + ".\n"); pDataArray->m->control_pressed = true; }
568 pDataArray->m->appendFiles(thisoutputFileName, pDataArray->outputFName); pDataArray->m->mothurRemove(thisoutputFileName);
569 pDataArray->m->appendFiles(thisaccnosFileName, pDataArray->accnos); pDataArray->m->mothurRemove(thisaccnosFileName);
570 if (pDataArray->trim) { pDataArray->m->appendFiles(thistrimFastaFileName, pDataArray->fasta); pDataArray->m->mothurRemove(thistrimFastaFileName); }
571 pDataArray->m->mothurRemove(thisFastaName);
573 totalSeqs += numSeqs;
575 pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + pDataArray->fileGroup[thisFastaName] + "."); pDataArray->m->mothurOutEndLine();
578 pDataArray->count = totalSeqs;
579 if (pDataArray->hasCount && pDataArray->dups) { outCountList.close(); }
584 catch(exception& e) {
585 pDataArray->m->errorOut(e, "ChimeraSlayerCommand", "MySlayerGroupThreadFunction");
592 /**************************************************************************************************/