1 #ifndef CHIMERASLAYERCOMMAND_H
2 #define CHIMERASLAYERCOMMAND_H
5 * chimeraslayercommand.h
8 * Created by westcott on 3/31/10.
9 * Copyright 2010 Schloss Lab. All rights reserved.
14 #include "command.hpp"
16 #include "chimeraslayer.h"
17 #include "sequenceparser.h"
19 /***********************************************************/
21 class ChimeraSlayerCommand : public Command {
23 ChimeraSlayerCommand(string);
24 ChimeraSlayerCommand();
25 ~ChimeraSlayerCommand() {}
27 vector<string> setParameters();
28 string getCommandName() { return "chimera.slayer"; }
29 string getCommandCategory() { return "Sequence Processing"; }
30 string getOutputFileNameTag(string, string);
31 string getHelpString();
32 string getCitation() { return "Haas BJ, Gevers D, Earl A, Feldgarden M, Ward DV, Giannokous G, Ciulla D, Tabbaa D, Highlander SK, Sodergren E, Methe B, Desantis TZ, Petrosino JF, Knight R, Birren BW (2011). Chimeric 16S rRNA sequence formation and detection in Sanger and 454-pyrosequenced PCR amplicons. Genome Res. \nhttp://www.mothur.org/wiki/Chimera.slayer"; }
33 string getDescription() { return "detect chimeric sequences"; }
36 void help() { m->mothurOut(getHelpString()); }
41 unsigned long long start;
42 unsigned long long end;
43 linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
46 vector<int> processIDS; //processid
47 vector<linePair> lines;
49 int driver(linePair, string, string, string, string, map<string, int>&);
50 int createProcesses(string, string, string, string, map<string, int>&);
51 int divideInHalf(Sequence, string&, string&);
52 map<string, int> sortFastaFile(string, string);
53 map<string, int> sortFastaFile(vector<Sequence>&, map<string, string>&, string newFile);
54 string getNamesFile(string&);
55 //int setupChimera(string,);
56 int MPIExecute(string, string, string, string, map<string, int>&);
57 int deconvoluteResults(SequenceParser*, string, string, string);
58 map<string, int> priority;
59 int setUpForSelfReference(SequenceParser*&, map<string, string>&, map<string, map<string, int> >&, int);
60 int driverGroups(string, string, string, map<string, map<string, int> >&, map<string, string>&);
61 int createProcessesGroups(string, string, string, map<string, map<string, int> >&, map<string, string>&);
62 int MPIExecuteGroups(string, string, string, map<string, map<string, int> >&, map<string, string>&);
66 int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&, string, map<string, int>&, bool);
69 bool abort, realign, trim, trimera, save;
70 string fastafile, groupfile, templatefile, outputDir, search, namefile, blastlocation;
71 int processors, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity, minCoverage, minBS, minSNP, numSeqs, templateSeqsLength;
74 vector<string> outputNames;
75 vector<string> fastaFileNames;
76 vector<string> nameFileNames;
77 vector<string> groupFileNames;
81 /***********************************************************/
83 //custom data structure for threads to use.
84 // This is passed by void pointer so it can be any data type
85 // that can be passed using a single void pointer (LPVOID).
96 unsigned long long start;
97 unsigned long long end;
98 int ksize, match, mismatch, window, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted;
101 map<string, int> priority;
105 map<string, map<string, int> > fileToPriority;
106 map<string, string> fileGroup;
109 slayerData(string o, string fa, string ac, string f, string te, string se, string bl, bool tri, bool trm, bool re, MothurOut* mout, unsigned long long st, unsigned long long en, int ks, int ma, int mis, int win, int minS, int minC, int miBS, int minSN, int par, int it, int inc, int numw, float div, map<string, int> prior, int tid) {
127 minSimilarity = minS;
141 slayerData(string o, string fa, string ac, string te, string se, string bl, bool tri, bool trm, bool re, MothurOut* mout, map<string, map<string, int> >& fPriority, map<string, string>& fileG, int ks, int ma, int mis, int win, int minS, int minC, int miBS, int minSN, int par, int it, int inc, int numw, float div, map<string, int> prior, int tid) {
153 fileToPriority = fPriority;
158 minSimilarity = minS;
175 /**************************************************************************************************/
176 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
178 static DWORD WINAPI MySlayerThreadFunction(LPVOID lpParam){
179 slayerData* pDataArray;
180 pDataArray = (slayerData*)lpParam;
184 pDataArray->m->openOutputFile(pDataArray->outputFName, out);
187 pDataArray->m->openOutputFile(pDataArray->accnos, out2);
190 if (pDataArray->trim) { pDataArray->m->openOutputFile(pDataArray->fasta, out3); }
193 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
198 if (pDataArray->templatefile != "self") { //you want to run slayer with a reference template
199 chimera = new ChimeraSlayer(pDataArray->filename, pDataArray->templatefile, pDataArray->trim, pDataArray->search, pDataArray->ksize, pDataArray->match, pDataArray->mismatch, pDataArray->window, pDataArray->divR, pDataArray->minSimilarity, pDataArray->minCoverage, pDataArray->minBS, pDataArray->minSNP, pDataArray->parents, pDataArray->iters, pDataArray->increment, pDataArray->numwanted, pDataArray->realign, pDataArray->blastlocation, pDataArray->threadId);
201 chimera = new ChimeraSlayer(pDataArray->filename, pDataArray->templatefile, pDataArray->trim, pDataArray->priority, pDataArray->search, pDataArray->ksize, pDataArray->match, pDataArray->mismatch, pDataArray->window, pDataArray->divR, pDataArray->minSimilarity, pDataArray->minCoverage, pDataArray->minBS, pDataArray->minSNP, pDataArray->parents, pDataArray->iters, pDataArray->increment, pDataArray->numwanted, pDataArray->realign, pDataArray->blastlocation, pDataArray->threadId);
204 //print header if you are process 0
205 if ((pDataArray->start == 0) || (pDataArray->start == 1)) {
206 chimera->printHeader(out);
208 }else { //this accounts for the difference in line endings.
209 inFASTA.seekg(pDataArray->start-1); pDataArray->m->gobble(inFASTA);
212 pDataArray->count = pDataArray->end;
214 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete chimera; return 0; }
216 if (chimera->getUnaligned()) {
217 pDataArray->m->mothurOut("Your template sequences are different lengths, please correct."); pDataArray->m->mothurOutEndLine();
218 out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close();
222 int templateSeqsLength = chimera->getLength();
224 if (pDataArray->start == 0) { chimera->printHeader(out); }
227 for(int i = 0; i < pDataArray->end; i++){
229 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete chimera; return 1; }
231 Sequence* candidateSeq = new Sequence(inFASTA); pDataArray->m->gobble(inFASTA);
232 string candidateAligned = candidateSeq->getAligned();
234 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
235 if (candidateSeq->getAligned().length() != templateSeqsLength) {
236 pDataArray->m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); pDataArray->m->mothurOutEndLine();
239 chimera->getChimeras(candidateSeq);
241 if (pDataArray->m->control_pressed) { delete candidateSeq; delete chimera; return 1; }
243 //if you are not chimeric, then check each half
244 data_results wholeResults = chimera->getResults();
246 //determine if we need to split
247 bool isChimeric = false;
249 if (wholeResults.flag == "yes") {
250 string chimeraFlag = "no";
251 if( (wholeResults.results[0].bsa >= pDataArray->minBS && wholeResults.results[0].divr_qla_qrb >= pDataArray->divR)
253 (wholeResults.results[0].bsb >= pDataArray->minBS && wholeResults.results[0].divr_qlb_qra >= pDataArray->divR) ) { chimeraFlag = "yes"; }
256 if (chimeraFlag == "yes") {
257 if ((wholeResults.results[0].bsa >= pDataArray->minBS) || (wholeResults.results[0].bsb >= pDataArray->minBS)) { isChimeric = true; }
261 if ((!isChimeric) && pDataArray->trimera) {
263 //split sequence in half by bases
264 string leftQuery, rightQuery;
265 Sequence tempSeq(candidateSeq->getName(), candidateAligned);
266 //divideInHalf(tempSeq, leftQuery, rightQuery);
267 string queryUnAligned = tempSeq.getUnaligned();
268 int numBases = int(queryUnAligned.length() * 0.5);
270 string queryAligned = tempSeq.getAligned();
271 leftQuery = tempSeq.getAligned();
272 rightQuery = tempSeq.getAligned();
276 for (int i = 0; i < queryAligned.length(); i++) {
278 if (isalpha(queryAligned[i])) {
283 if (baseCount >= numBases) { leftSpot = i; break; } //first half
286 //blank out right side
287 for (int i = leftSpot; i < leftQuery.length(); i++) { leftQuery[i] = '.'; }
289 //blank out left side
290 for (int i = 0; i < leftSpot; i++) { rightQuery[i] = '.'; }
292 //run chimeraSlayer on each piece
293 Sequence* left = new Sequence(candidateSeq->getName(), leftQuery);
294 Sequence* right = new Sequence(candidateSeq->getName(), rightQuery);
297 chimera->getChimeras(left);
298 data_results leftResults = chimera->getResults();
300 chimera->getChimeras(right);
301 data_results rightResults = chimera->getResults();
303 //if either piece is chimeric then report
304 Sequence trimmed = chimera->print(out, out2, leftResults, rightResults);
305 if (pDataArray->trim) { trimmed.printSequence(out3); }
307 delete left; delete right;
309 }else { //already chimeric
311 Sequence trimmed = chimera->print(out, out2);
312 if (pDataArray->trim) { trimmed.printSequence(out3); }
322 if((count) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(count)); pDataArray->m->mothurOutEndLine(); }
325 if((count) % 100 != 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(count)); pDataArray->m->mothurOutEndLine(); }
327 pDataArray->numNoParents = chimera->getNumNoParents();
328 if (pDataArray->numNoParents == count) { pDataArray->m->mothurOut("[WARNING]: megablast returned 0 potential parents for all your sequences. This could be due to formatdb.exe not being setup properly, please check formatdb.log for errors.\n"); }
332 if (pDataArray->trim) { out3.close(); }
339 catch(exception& e) {
340 pDataArray->m->errorOut(e, "ChimeraSlayerCommand", "MySlayerThreadFunction");
345 /**************************************************************************************************/
347 static DWORD WINAPI MySlayerGroupThreadFunction(LPVOID lpParam){
348 slayerData* pDataArray;
349 pDataArray = (slayerData*)lpParam;
355 for (map<string, map<string, int> >::iterator itFile = pDataArray->fileToPriority.begin(); itFile != pDataArray->fileToPriority.end(); itFile++) {
357 if (pDataArray->m->control_pressed) { return 0; }
359 int start = time(NULL);
360 string thisFastaName = itFile->first;
361 map<string, int> thisPriority = itFile->second;
362 string thisoutputFileName = pDataArray->m->getRootName(pDataArray->m->getSimpleName(thisFastaName)) + pDataArray->fileGroup[thisFastaName] + "slayer.chimera";
363 string thisaccnosFileName = pDataArray->m->getRootName(pDataArray->m->getSimpleName(thisFastaName)) + pDataArray->fileGroup[thisFastaName] + "slayer.accnos";
364 string thistrimFastaFileName = pDataArray->m->getRootName(pDataArray->m->getSimpleName(thisFastaName)) + pDataArray->fileGroup[thisFastaName] + "slayer.fasta";
366 pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Checking sequences from group: " + pDataArray->fileGroup[thisFastaName] + "."); pDataArray->m->mothurOutEndLine();
368 //int numSeqs = driver(lines[0], thisoutputFileName, thisFastaName, thisaccnosFileName, thistrimFastaFileName, thisPriority);
369 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
372 pDataArray->m->openOutputFile(thisoutputFileName, out);
375 pDataArray->m->openOutputFile(thisaccnosFileName, out2);
378 if (pDataArray->trim) { pDataArray->m->openOutputFile(thistrimFastaFileName, out3); }
381 pDataArray->m->openInputFile(thisFastaName, inFASTA);
384 chimera = new ChimeraSlayer(thisFastaName, pDataArray->templatefile, pDataArray->trim, thisPriority, pDataArray->search, pDataArray->ksize, pDataArray->match, pDataArray->mismatch, pDataArray->window, pDataArray->divR, pDataArray->minSimilarity, pDataArray->minCoverage, pDataArray->minBS, pDataArray->minSNP, pDataArray->parents, pDataArray->iters, pDataArray->increment, pDataArray->numwanted, pDataArray->realign, pDataArray->blastlocation, pDataArray->threadId);
385 chimera->printHeader(out);
389 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete chimera; return 0; }
391 if (chimera->getUnaligned()) {
392 pDataArray->m->mothurOut("Your template sequences are different lengths, please correct."); pDataArray->m->mothurOutEndLine();
393 out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close();
397 int templateSeqsLength = chimera->getLength();
402 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete chimera; return 1; }
404 Sequence* candidateSeq = new Sequence(inFASTA); pDataArray->m->gobble(inFASTA);
405 string candidateAligned = candidateSeq->getAligned();
407 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
408 if (candidateSeq->getAligned().length() != templateSeqsLength) {
409 pDataArray->m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); pDataArray->m->mothurOutEndLine();
412 chimera->getChimeras(candidateSeq);
414 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete candidateSeq; delete chimera; return 1; }
416 //if you are not chimeric, then check each half
417 data_results wholeResults = chimera->getResults();
419 //determine if we need to split
420 bool isChimeric = false;
422 if (wholeResults.flag == "yes") {
423 string chimeraFlag = "no";
424 if( (wholeResults.results[0].bsa >= pDataArray->minBS && wholeResults.results[0].divr_qla_qrb >= pDataArray->divR)
426 (wholeResults.results[0].bsb >= pDataArray->minBS && wholeResults.results[0].divr_qlb_qra >= pDataArray->divR) ) { chimeraFlag = "yes"; }
429 if (chimeraFlag == "yes") {
430 if ((wholeResults.results[0].bsa >= pDataArray->minBS) || (wholeResults.results[0].bsb >= pDataArray->minBS)) { isChimeric = true; }
434 if ((!isChimeric) && pDataArray->trimera) {
436 //split sequence in half by bases
437 string leftQuery, rightQuery;
438 Sequence tempSeq(candidateSeq->getName(), candidateAligned);
439 //divideInHalf(tempSeq, leftQuery, rightQuery);
440 string queryUnAligned = tempSeq.getUnaligned();
441 int numBases = int(queryUnAligned.length() * 0.5);
443 string queryAligned = tempSeq.getAligned();
444 leftQuery = tempSeq.getAligned();
445 rightQuery = tempSeq.getAligned();
449 for (int i = 0; i < queryAligned.length(); i++) {
451 if (isalpha(queryAligned[i])) {
456 if (baseCount >= numBases) { leftSpot = i; break; } //first half
459 //blank out right side
460 for (int i = leftSpot; i < leftQuery.length(); i++) { leftQuery[i] = '.'; }
462 //blank out left side
463 for (int i = 0; i < leftSpot; i++) { rightQuery[i] = '.'; }
465 //run chimeraSlayer on each piece
466 Sequence* left = new Sequence(candidateSeq->getName(), leftQuery);
467 Sequence* right = new Sequence(candidateSeq->getName(), rightQuery);
470 chimera->getChimeras(left);
471 data_results leftResults = chimera->getResults();
473 chimera->getChimeras(right);
474 data_results rightResults = chimera->getResults();
476 //if either piece is chimeric then report
477 Sequence trimmed = chimera->print(out, out2, leftResults, rightResults);
478 if (pDataArray->trim) { trimmed.printSequence(out3); }
480 delete left; delete right;
482 }else { //already chimeric
484 Sequence trimmed = chimera->print(out, out2);
485 if (pDataArray->trim) { trimmed.printSequence(out3); }
495 if (inFASTA.eof()) { break; }
498 if((numSeqs) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(numSeqs)); pDataArray->m->mothurOutEndLine(); }
501 if((numSeqs) % 100 != 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(numSeqs)); pDataArray->m->mothurOutEndLine(); }
503 pDataArray->numNoParents = chimera->getNumNoParents();
504 if (pDataArray->numNoParents == numSeqs) { pDataArray->m->mothurOut("[WARNING]: megablast returned 0 potential parents for all your sequences. This could be due to formatdb.exe not being setup properly, please check formatdb.log for errors.\n"); }
508 if (pDataArray->trim) { out3.close(); }
513 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
516 pDataArray->m->appendFiles(thisoutputFileName, pDataArray->outputFName); pDataArray->m->mothurRemove(thisoutputFileName);
517 pDataArray->m->appendFiles(thisaccnosFileName, pDataArray->accnos); pDataArray->m->mothurRemove(thisaccnosFileName);
518 if (pDataArray->trim) { pDataArray->m->appendFiles(thistrimFastaFileName, pDataArray->fasta); pDataArray->m->mothurRemove(thistrimFastaFileName); }
519 pDataArray->m->mothurRemove(thisFastaName);
521 totalSeqs += numSeqs;
523 pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + pDataArray->fileGroup[thisFastaName] + "."); pDataArray->m->mothurOutEndLine();
526 pDataArray->count = totalSeqs;
531 catch(exception& e) {
532 pDataArray->m->errorOut(e, "ChimeraSlayerCommand", "MySlayerGroupThreadFunction");
539 /**************************************************************************************************/