1 #ifndef CHIMERASLAYERCOMMAND_H
2 #define CHIMERASLAYERCOMMAND_H
5 * chimeraslayercommand.h
8 * Created by westcott on 3/31/10.
9 * Copyright 2010 Schloss Lab. All rights reserved.
14 #include "command.hpp"
16 #include "chimeraslayer.h"
17 #include "sequenceparser.h"
18 #include "sequencecountparser.h"
20 /***********************************************************/
22 class ChimeraSlayerCommand : public Command {
24 ChimeraSlayerCommand(string);
25 ChimeraSlayerCommand();
26 ~ChimeraSlayerCommand() {}
28 vector<string> setParameters();
29 string getCommandName() { return "chimera.slayer"; }
30 string getCommandCategory() { return "Sequence Processing"; }
31 string getOutputFileNameTag(string, string);
32 string getHelpString();
33 string getCitation() { return "Haas BJ, Gevers D, Earl A, Feldgarden M, Ward DV, Giannokous G, Ciulla D, Tabbaa D, Highlander SK, Sodergren E, Methe B, Desantis TZ, Petrosino JF, Knight R, Birren BW (2011). Chimeric 16S rRNA sequence formation and detection in Sanger and 454-pyrosequenced PCR amplicons. Genome Res 21:494.\nhttp://www.mothur.org/wiki/Chimera.slayer"; }
34 string getDescription() { return "detect chimeric sequences"; }
37 void help() { m->mothurOut(getHelpString()); }
42 unsigned long long start;
43 unsigned long long end;
44 linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
47 vector<int> processIDS; //processid
48 vector<linePair> lines;
50 int driver(linePair, string, string, string, string, map<string, int>&);
51 int createProcesses(string, string, string, string, map<string, int>&);
52 int divideInHalf(Sequence, string&, string&);
53 map<string, int> sortFastaFile(string, string);
54 map<string, int> sortFastaFile(vector<Sequence>&, map<string, string>&, string newFile);
55 int sortFastaFile(vector<Sequence>&, map<string, int>&, string newFile);
56 string getNamesFile(string&);
57 //int setupChimera(string,);
58 int MPIExecute(string, string, string, string, map<string, int>&);
59 int deconvoluteResults(map<string, string>&, string, string, string);
60 map<string, int> priority;
61 int setUpForSelfReference(SequenceParser*&, map<string, string>&, map<string, map<string, int> >&, int);
62 int setUpForSelfReference(SequenceCountParser*&, map<string, string>&, map<string, map<string, int> >&, int);
63 int driverGroups(string, string, string, map<string, map<string, int> >&, map<string, string>&);
64 int createProcessesGroups(string, string, string, map<string, map<string, int> >&, map<string, string>&);
65 int MPIExecuteGroups(string, string, string, map<string, map<string, int> >&, map<string, string>&);
69 int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&, string, map<string, int>&, bool);
72 bool abort, realign, trim, trimera, save, hasName, hasCount;
73 string fastafile, groupfile, templatefile, outputDir, search, namefile, countfile, blastlocation;
74 int processors, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity, minCoverage, minBS, minSNP, numSeqs, templateSeqsLength;
77 vector<string> outputNames;
78 vector<string> fastaFileNames;
79 vector<string> nameFileNames;
80 vector<string> groupFileNames;
84 /***********************************************************/
86 //custom data structure for threads to use.
87 // This is passed by void pointer so it can be any data type
88 // that can be passed using a single void pointer (LPVOID).
99 unsigned long long start;
100 unsigned long long end;
101 int ksize, match, mismatch, window, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted;
104 map<string, int> priority;
108 map<string, map<string, int> > fileToPriority;
109 map<string, string> fileGroup;
112 slayerData(string o, string fa, string ac, string f, string te, string se, string bl, bool tri, bool trm, bool re, MothurOut* mout, unsigned long long st, unsigned long long en, int ks, int ma, int mis, int win, int minS, int minC, int miBS, int minSN, int par, int it, int inc, int numw, float div, map<string, int> prior, int tid) {
130 minSimilarity = minS;
144 slayerData(string o, string fa, string ac, string te, string se, string bl, bool tri, bool trm, bool re, MothurOut* mout, map<string, map<string, int> >& fPriority, map<string, string>& fileG, int ks, int ma, int mis, int win, int minS, int minC, int miBS, int minSN, int par, int it, int inc, int numw, float div, map<string, int> prior, int tid) {
156 fileToPriority = fPriority;
161 minSimilarity = minS;
178 /**************************************************************************************************/
179 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
181 static DWORD WINAPI MySlayerThreadFunction(LPVOID lpParam){
182 slayerData* pDataArray;
183 pDataArray = (slayerData*)lpParam;
187 pDataArray->m->openOutputFile(pDataArray->outputFName, out);
190 pDataArray->m->openOutputFile(pDataArray->accnos, out2);
193 if (pDataArray->trim) { pDataArray->m->openOutputFile(pDataArray->fasta, out3); }
196 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
201 if (pDataArray->templatefile != "self") { //you want to run slayer with a reference template
202 chimera = new ChimeraSlayer(pDataArray->filename, pDataArray->templatefile, pDataArray->trim, pDataArray->search, pDataArray->ksize, pDataArray->match, pDataArray->mismatch, pDataArray->window, pDataArray->divR, pDataArray->minSimilarity, pDataArray->minCoverage, pDataArray->minBS, pDataArray->minSNP, pDataArray->parents, pDataArray->iters, pDataArray->increment, pDataArray->numwanted, pDataArray->realign, pDataArray->blastlocation, pDataArray->threadId);
204 chimera = new ChimeraSlayer(pDataArray->filename, pDataArray->templatefile, pDataArray->trim, pDataArray->priority, pDataArray->search, pDataArray->ksize, pDataArray->match, pDataArray->mismatch, pDataArray->window, pDataArray->divR, pDataArray->minSimilarity, pDataArray->minCoverage, pDataArray->minBS, pDataArray->minSNP, pDataArray->parents, pDataArray->iters, pDataArray->increment, pDataArray->numwanted, pDataArray->realign, pDataArray->blastlocation, pDataArray->threadId);
207 //print header if you are process 0
208 if ((pDataArray->start == 0) || (pDataArray->start == 1)) {
209 chimera->printHeader(out);
211 }else { //this accounts for the difference in line endings.
212 inFASTA.seekg(pDataArray->start-1); pDataArray->m->gobble(inFASTA);
215 pDataArray->count = pDataArray->end;
217 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete chimera; return 0; }
219 if (chimera->getUnaligned()) {
220 pDataArray->m->mothurOut("Your template sequences are different lengths, please correct."); pDataArray->m->mothurOutEndLine();
221 out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close();
225 int templateSeqsLength = chimera->getLength();
227 if (pDataArray->start == 0) { chimera->printHeader(out); }
230 for(int i = 0; i < pDataArray->end; i++){
232 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete chimera; return 1; }
234 Sequence* candidateSeq = new Sequence(inFASTA); pDataArray->m->gobble(inFASTA);
235 string candidateAligned = candidateSeq->getAligned();
237 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
238 if (candidateSeq->getAligned().length() != templateSeqsLength) {
239 pDataArray->m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); pDataArray->m->mothurOutEndLine();
242 chimera->getChimeras(candidateSeq);
244 if (pDataArray->m->control_pressed) { delete candidateSeq; delete chimera; return 1; }
246 //if you are not chimeric, then check each half
247 data_results wholeResults = chimera->getResults();
249 //determine if we need to split
250 bool isChimeric = false;
252 if (wholeResults.flag == "yes") {
253 string chimeraFlag = "no";
254 if( (wholeResults.results[0].bsa >= pDataArray->minBS && wholeResults.results[0].divr_qla_qrb >= pDataArray->divR)
256 (wholeResults.results[0].bsb >= pDataArray->minBS && wholeResults.results[0].divr_qlb_qra >= pDataArray->divR) ) { chimeraFlag = "yes"; }
259 if (chimeraFlag == "yes") {
260 if ((wholeResults.results[0].bsa >= pDataArray->minBS) || (wholeResults.results[0].bsb >= pDataArray->minBS)) { isChimeric = true; }
264 if ((!isChimeric) && pDataArray->trimera) {
266 //split sequence in half by bases
267 string leftQuery, rightQuery;
268 Sequence tempSeq(candidateSeq->getName(), candidateAligned);
269 //divideInHalf(tempSeq, leftQuery, rightQuery);
270 string queryUnAligned = tempSeq.getUnaligned();
271 int numBases = int(queryUnAligned.length() * 0.5);
273 string queryAligned = tempSeq.getAligned();
274 leftQuery = tempSeq.getAligned();
275 rightQuery = tempSeq.getAligned();
279 for (int i = 0; i < queryAligned.length(); i++) {
281 if (isalpha(queryAligned[i])) {
286 if (baseCount >= numBases) { leftSpot = i; break; } //first half
289 //blank out right side
290 for (int i = leftSpot; i < leftQuery.length(); i++) { leftQuery[i] = '.'; }
292 //blank out left side
293 for (int i = 0; i < leftSpot; i++) { rightQuery[i] = '.'; }
295 //run chimeraSlayer on each piece
296 Sequence* left = new Sequence(candidateSeq->getName(), leftQuery);
297 Sequence* right = new Sequence(candidateSeq->getName(), rightQuery);
300 chimera->getChimeras(left);
301 data_results leftResults = chimera->getResults();
303 chimera->getChimeras(right);
304 data_results rightResults = chimera->getResults();
306 //if either piece is chimeric then report
307 Sequence trimmed = chimera->print(out, out2, leftResults, rightResults);
308 if (pDataArray->trim) { trimmed.printSequence(out3); }
310 delete left; delete right;
312 }else { //already chimeric
314 Sequence trimmed = chimera->print(out, out2);
315 if (pDataArray->trim) { trimmed.printSequence(out3); }
325 if((count) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(count)); pDataArray->m->mothurOutEndLine(); }
328 if((count) % 100 != 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(count)); pDataArray->m->mothurOutEndLine(); }
330 pDataArray->numNoParents = chimera->getNumNoParents();
331 if (pDataArray->numNoParents == count) { pDataArray->m->mothurOut("[WARNING]: megablast returned 0 potential parents for all your sequences. This could be due to formatdb.exe not being setup properly, please check formatdb.log for errors.\n"); }
335 if (pDataArray->trim) { out3.close(); }
342 catch(exception& e) {
343 pDataArray->m->errorOut(e, "ChimeraSlayerCommand", "MySlayerThreadFunction");
348 /**************************************************************************************************/
350 static DWORD WINAPI MySlayerGroupThreadFunction(LPVOID lpParam){
351 slayerData* pDataArray;
352 pDataArray = (slayerData*)lpParam;
358 for (map<string, map<string, int> >::iterator itFile = pDataArray->fileToPriority.begin(); itFile != pDataArray->fileToPriority.end(); itFile++) {
360 if (pDataArray->m->control_pressed) { return 0; }
362 int start = time(NULL);
363 string thisFastaName = itFile->first;
364 map<string, int> thisPriority = itFile->second;
365 string thisoutputFileName = pDataArray->m->getRootName(pDataArray->m->getSimpleName(thisFastaName)) + pDataArray->fileGroup[thisFastaName] + "slayer.chimera";
366 string thisaccnosFileName = pDataArray->m->getRootName(pDataArray->m->getSimpleName(thisFastaName)) + pDataArray->fileGroup[thisFastaName] + "slayer.accnos";
367 string thistrimFastaFileName = pDataArray->m->getRootName(pDataArray->m->getSimpleName(thisFastaName)) + pDataArray->fileGroup[thisFastaName] + "slayer.fasta";
369 pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Checking sequences from group: " + pDataArray->fileGroup[thisFastaName] + "."); pDataArray->m->mothurOutEndLine();
371 //int numSeqs = driver(lines[0], thisoutputFileName, thisFastaName, thisaccnosFileName, thistrimFastaFileName, thisPriority);
372 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
375 pDataArray->m->openOutputFile(thisoutputFileName, out);
378 pDataArray->m->openOutputFile(thisaccnosFileName, out2);
381 if (pDataArray->trim) { pDataArray->m->openOutputFile(thistrimFastaFileName, out3); }
384 pDataArray->m->openInputFile(thisFastaName, inFASTA);
387 chimera = new ChimeraSlayer(thisFastaName, pDataArray->templatefile, pDataArray->trim, thisPriority, pDataArray->search, pDataArray->ksize, pDataArray->match, pDataArray->mismatch, pDataArray->window, pDataArray->divR, pDataArray->minSimilarity, pDataArray->minCoverage, pDataArray->minBS, pDataArray->minSNP, pDataArray->parents, pDataArray->iters, pDataArray->increment, pDataArray->numwanted, pDataArray->realign, pDataArray->blastlocation, pDataArray->threadId);
388 chimera->printHeader(out);
392 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete chimera; return 0; }
394 if (chimera->getUnaligned()) {
395 pDataArray->m->mothurOut("Your template sequences are different lengths, please correct."); pDataArray->m->mothurOutEndLine();
396 out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close();
400 int templateSeqsLength = chimera->getLength();
405 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete chimera; return 1; }
407 Sequence* candidateSeq = new Sequence(inFASTA); pDataArray->m->gobble(inFASTA);
408 string candidateAligned = candidateSeq->getAligned();
410 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
411 if (candidateSeq->getAligned().length() != templateSeqsLength) {
412 pDataArray->m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); pDataArray->m->mothurOutEndLine();
415 chimera->getChimeras(candidateSeq);
417 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete candidateSeq; delete chimera; return 1; }
419 //if you are not chimeric, then check each half
420 data_results wholeResults = chimera->getResults();
422 //determine if we need to split
423 bool isChimeric = false;
425 if (wholeResults.flag == "yes") {
426 string chimeraFlag = "no";
427 if( (wholeResults.results[0].bsa >= pDataArray->minBS && wholeResults.results[0].divr_qla_qrb >= pDataArray->divR)
429 (wholeResults.results[0].bsb >= pDataArray->minBS && wholeResults.results[0].divr_qlb_qra >= pDataArray->divR) ) { chimeraFlag = "yes"; }
432 if (chimeraFlag == "yes") {
433 if ((wholeResults.results[0].bsa >= pDataArray->minBS) || (wholeResults.results[0].bsb >= pDataArray->minBS)) { isChimeric = true; }
437 if ((!isChimeric) && pDataArray->trimera) {
439 //split sequence in half by bases
440 string leftQuery, rightQuery;
441 Sequence tempSeq(candidateSeq->getName(), candidateAligned);
442 //divideInHalf(tempSeq, leftQuery, rightQuery);
443 string queryUnAligned = tempSeq.getUnaligned();
444 int numBases = int(queryUnAligned.length() * 0.5);
446 string queryAligned = tempSeq.getAligned();
447 leftQuery = tempSeq.getAligned();
448 rightQuery = tempSeq.getAligned();
452 for (int i = 0; i < queryAligned.length(); i++) {
454 if (isalpha(queryAligned[i])) {
459 if (baseCount >= numBases) { leftSpot = i; break; } //first half
462 //blank out right side
463 for (int i = leftSpot; i < leftQuery.length(); i++) { leftQuery[i] = '.'; }
465 //blank out left side
466 for (int i = 0; i < leftSpot; i++) { rightQuery[i] = '.'; }
468 //run chimeraSlayer on each piece
469 Sequence* left = new Sequence(candidateSeq->getName(), leftQuery);
470 Sequence* right = new Sequence(candidateSeq->getName(), rightQuery);
473 chimera->getChimeras(left);
474 data_results leftResults = chimera->getResults();
476 chimera->getChimeras(right);
477 data_results rightResults = chimera->getResults();
479 //if either piece is chimeric then report
480 Sequence trimmed = chimera->print(out, out2, leftResults, rightResults);
481 if (pDataArray->trim) { trimmed.printSequence(out3); }
483 delete left; delete right;
485 }else { //already chimeric
487 Sequence trimmed = chimera->print(out, out2);
488 if (pDataArray->trim) { trimmed.printSequence(out3); }
498 if (inFASTA.eof()) { break; }
501 if((numSeqs) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(numSeqs)); pDataArray->m->mothurOutEndLine(); }
504 if((numSeqs) % 100 != 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(numSeqs)); pDataArray->m->mothurOutEndLine(); }
506 pDataArray->numNoParents = chimera->getNumNoParents();
507 if (pDataArray->numNoParents == numSeqs) { pDataArray->m->mothurOut("[WARNING]: megablast returned 0 potential parents for all your sequences. This could be due to formatdb.exe not being setup properly, please check formatdb.log for errors.\n"); }
511 if (pDataArray->trim) { out3.close(); }
516 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
519 pDataArray->m->appendFiles(thisoutputFileName, pDataArray->outputFName); pDataArray->m->mothurRemove(thisoutputFileName);
520 pDataArray->m->appendFiles(thisaccnosFileName, pDataArray->accnos); pDataArray->m->mothurRemove(thisaccnosFileName);
521 if (pDataArray->trim) { pDataArray->m->appendFiles(thistrimFastaFileName, pDataArray->fasta); pDataArray->m->mothurRemove(thistrimFastaFileName); }
522 pDataArray->m->mothurRemove(thisFastaName);
524 totalSeqs += numSeqs;
526 pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + pDataArray->fileGroup[thisFastaName] + "."); pDataArray->m->mothurOutEndLine();
529 pDataArray->count = totalSeqs;
534 catch(exception& e) {
535 pDataArray->m->errorOut(e, "ChimeraSlayerCommand", "MySlayerGroupThreadFunction");
542 /**************************************************************************************************/