1 #ifndef CHIMERASLAYERCOMMAND_H
2 #define CHIMERASLAYERCOMMAND_H
5 * chimeraslayercommand.h
8 * Created by westcott on 3/31/10.
9 * Copyright 2010 Schloss Lab. All rights reserved.
14 #include "command.hpp"
16 #include "chimeraslayer.h"
17 #include "sequenceparser.h"
19 /***********************************************************/
21 class ChimeraSlayerCommand : public Command {
23 ChimeraSlayerCommand(string);
24 ChimeraSlayerCommand();
25 ~ChimeraSlayerCommand() {}
27 vector<string> setParameters();
28 string getCommandName() { return "chimera.slayer"; }
29 string getCommandCategory() { return "Sequence Processing"; }
30 string getHelpString();
31 string getCitation() { return "Haas BJ, Gevers D, Earl A, Feldgarden M, Ward DV, Giannokous G, Ciulla D, Tabbaa D, Highlander SK, Sodergren E, Methe B, Desantis TZ, Petrosino JF, Knight R, Birren BW (2011). Chimeric 16S rRNA sequence formation and detection in Sanger and 454-pyrosequenced PCR amplicons. Genome Res. \nhttp://www.mothur.org/wiki/Chimera.slayer"; }
32 string getDescription() { return "detect chimeric sequences"; }
35 void help() { m->mothurOut(getHelpString()); }
40 unsigned long long start;
41 unsigned long long end;
42 linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
45 vector<int> processIDS; //processid
46 vector<linePair> lines;
48 int driver(linePair, string, string, string, string, map<string, int>&);
49 int createProcesses(string, string, string, string, map<string, int>&);
50 int divideInHalf(Sequence, string&, string&);
51 map<string, int> sortFastaFile(string, string);
52 map<string, int> sortFastaFile(vector<Sequence>&, map<string, string>&, string newFile);
53 string getNamesFile(string&);
54 //int setupChimera(string,);
55 int MPIExecute(string, string, string, string, map<string, int>&);
56 int deconvoluteResults(SequenceParser*, string, string, string);
57 map<string, int> priority;
58 int setUpForSelfReference(SequenceParser*&, map<string, string>&, map<string, map<string, int> >&, int);
59 int driverGroups(string, string, string, map<string, map<string, int> >&, map<string, string>&);
60 int createProcessesGroups(string, string, string, map<string, map<string, int> >&, map<string, string>&);
61 int MPIExecuteGroups(string, string, string, map<string, map<string, int> >&, map<string, string>&);
65 int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&, string, map<string, int>&, bool);
68 bool abort, realign, trim, trimera, save;
69 string fastafile, groupfile, templatefile, outputDir, search, namefile, blastlocation;
70 int processors, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity, minCoverage, minBS, minSNP, numSeqs, templateSeqsLength;
73 vector<string> outputNames;
74 vector<string> fastaFileNames;
75 vector<string> nameFileNames;
76 vector<string> groupFileNames;
80 /***********************************************************/
82 //custom data structure for threads to use.
83 // This is passed by void pointer so it can be any data type
84 // that can be passed using a single void pointer (LPVOID).
95 unsigned long long start;
96 unsigned long long end;
97 int ksize, match, mismatch, window, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted;
100 map<string, int> priority;
104 map<string, map<string, int> > fileToPriority;
105 map<string, string> fileGroup;
108 slayerData(string o, string fa, string ac, string f, string te, string se, string bl, bool tri, bool trm, bool re, MothurOut* mout, unsigned long long st, unsigned long long en, int ks, int ma, int mis, int win, int minS, int minC, int miBS, int minSN, int par, int it, int inc, int numw, float div, map<string, int> prior, int tid) {
126 minSimilarity = minS;
140 slayerData(string o, string fa, string ac, string te, string se, string bl, bool tri, bool trm, bool re, MothurOut* mout, map<string, map<string, int> >& fPriority, map<string, string>& fileG, int ks, int ma, int mis, int win, int minS, int minC, int miBS, int minSN, int par, int it, int inc, int numw, float div, map<string, int> prior, int tid) {
152 fileToPriority = fPriority;
157 minSimilarity = minS;
174 /**************************************************************************************************/
175 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
177 static DWORD WINAPI MySlayerThreadFunction(LPVOID lpParam){
178 slayerData* pDataArray;
179 pDataArray = (slayerData*)lpParam;
183 pDataArray->m->openOutputFile(pDataArray->outputFName, out);
186 pDataArray->m->openOutputFile(pDataArray->accnos, out2);
189 if (pDataArray->trim) { pDataArray->m->openOutputFile(pDataArray->fasta, out3); }
192 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
197 if (pDataArray->templatefile != "self") { //you want to run slayer with a reference template
198 chimera = new ChimeraSlayer(pDataArray->filename, pDataArray->templatefile, pDataArray->trim, pDataArray->search, pDataArray->ksize, pDataArray->match, pDataArray->mismatch, pDataArray->window, pDataArray->divR, pDataArray->minSimilarity, pDataArray->minCoverage, pDataArray->minBS, pDataArray->minSNP, pDataArray->parents, pDataArray->iters, pDataArray->increment, pDataArray->numwanted, pDataArray->realign, pDataArray->blastlocation, pDataArray->threadId);
200 chimera = new ChimeraSlayer(pDataArray->filename, pDataArray->templatefile, pDataArray->trim, pDataArray->priority, pDataArray->search, pDataArray->ksize, pDataArray->match, pDataArray->mismatch, pDataArray->window, pDataArray->divR, pDataArray->minSimilarity, pDataArray->minCoverage, pDataArray->minBS, pDataArray->minSNP, pDataArray->parents, pDataArray->iters, pDataArray->increment, pDataArray->numwanted, pDataArray->realign, pDataArray->blastlocation, pDataArray->threadId);
203 //print header if you are process 0
204 if ((pDataArray->start == 0) || (pDataArray->start == 1)) {
205 chimera->printHeader(out);
207 }else { //this accounts for the difference in line endings.
208 inFASTA.seekg(pDataArray->start-1); pDataArray->m->gobble(inFASTA);
211 pDataArray->count = pDataArray->end;
213 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete chimera; return 0; }
215 if (chimera->getUnaligned()) {
216 pDataArray->m->mothurOut("Your template sequences are different lengths, please correct."); pDataArray->m->mothurOutEndLine();
217 out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close();
221 int templateSeqsLength = chimera->getLength();
223 if (pDataArray->start == 0) { chimera->printHeader(out); }
226 for(int i = 0; i < pDataArray->end; i++){
228 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete chimera; return 1; }
230 Sequence* candidateSeq = new Sequence(inFASTA); pDataArray->m->gobble(inFASTA);
231 string candidateAligned = candidateSeq->getAligned();
233 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
234 if (candidateSeq->getAligned().length() != templateSeqsLength) {
235 pDataArray->m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); pDataArray->m->mothurOutEndLine();
238 chimera->getChimeras(candidateSeq);
240 if (pDataArray->m->control_pressed) { delete candidateSeq; delete chimera; return 1; }
242 //if you are not chimeric, then check each half
243 data_results wholeResults = chimera->getResults();
245 //determine if we need to split
246 bool isChimeric = false;
248 if (wholeResults.flag == "yes") {
249 string chimeraFlag = "no";
250 if( (wholeResults.results[0].bsa >= pDataArray->minBS && wholeResults.results[0].divr_qla_qrb >= pDataArray->divR)
252 (wholeResults.results[0].bsb >= pDataArray->minBS && wholeResults.results[0].divr_qlb_qra >= pDataArray->divR) ) { chimeraFlag = "yes"; }
255 if (chimeraFlag == "yes") {
256 if ((wholeResults.results[0].bsa >= pDataArray->minBS) || (wholeResults.results[0].bsb >= pDataArray->minBS)) { isChimeric = true; }
260 if ((!isChimeric) && pDataArray->trimera) {
262 //split sequence in half by bases
263 string leftQuery, rightQuery;
264 Sequence tempSeq(candidateSeq->getName(), candidateAligned);
265 //divideInHalf(tempSeq, leftQuery, rightQuery);
266 string queryUnAligned = tempSeq.getUnaligned();
267 int numBases = int(queryUnAligned.length() * 0.5);
269 string queryAligned = tempSeq.getAligned();
270 leftQuery = tempSeq.getAligned();
271 rightQuery = tempSeq.getAligned();
275 for (int i = 0; i < queryAligned.length(); i++) {
277 if (isalpha(queryAligned[i])) {
282 if (baseCount >= numBases) { leftSpot = i; break; } //first half
285 //blank out right side
286 for (int i = leftSpot; i < leftQuery.length(); i++) { leftQuery[i] = '.'; }
288 //blank out left side
289 for (int i = 0; i < leftSpot; i++) { rightQuery[i] = '.'; }
291 //run chimeraSlayer on each piece
292 Sequence* left = new Sequence(candidateSeq->getName(), leftQuery);
293 Sequence* right = new Sequence(candidateSeq->getName(), rightQuery);
296 chimera->getChimeras(left);
297 data_results leftResults = chimera->getResults();
299 chimera->getChimeras(right);
300 data_results rightResults = chimera->getResults();
302 //if either piece is chimeric then report
303 Sequence trimmed = chimera->print(out, out2, leftResults, rightResults);
304 if (pDataArray->trim) { trimmed.printSequence(out3); }
306 delete left; delete right;
308 }else { //already chimeric
310 Sequence trimmed = chimera->print(out, out2);
311 if (pDataArray->trim) { trimmed.printSequence(out3); }
321 if((count) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(count)); pDataArray->m->mothurOutEndLine(); }
324 if((count) % 100 != 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(count)); pDataArray->m->mothurOutEndLine(); }
326 pDataArray->numNoParents = chimera->getNumNoParents();
327 if (pDataArray->numNoParents == count) { pDataArray->m->mothurOut("[WARNING]: megablast returned 0 potential parents for all your sequences. This could be due to formatdb.exe not being setup properly, please check formatdb.log for errors.\n"); }
331 if (pDataArray->trim) { out3.close(); }
338 catch(exception& e) {
339 pDataArray->m->errorOut(e, "ChimeraSlayerCommand", "MySlayerThreadFunction");
344 /**************************************************************************************************/
346 static DWORD WINAPI MySlayerGroupThreadFunction(LPVOID lpParam){
347 slayerData* pDataArray;
348 pDataArray = (slayerData*)lpParam;
354 for (map<string, map<string, int> >::iterator itFile = pDataArray->fileToPriority.begin(); itFile != pDataArray->fileToPriority.end(); itFile++) {
356 if (pDataArray->m->control_pressed) { return 0; }
358 int start = time(NULL);
359 string thisFastaName = itFile->first;
360 map<string, int> thisPriority = itFile->second;
361 string thisoutputFileName = pDataArray->m->getRootName(pDataArray->m->getSimpleName(thisFastaName)) + pDataArray->fileGroup[thisFastaName] + "slayer.chimera";
362 string thisaccnosFileName = pDataArray->m->getRootName(pDataArray->m->getSimpleName(thisFastaName)) + pDataArray->fileGroup[thisFastaName] + "slayer.accnos";
363 string thistrimFastaFileName = pDataArray->m->getRootName(pDataArray->m->getSimpleName(thisFastaName)) + pDataArray->fileGroup[thisFastaName] + "slayer.fasta";
365 pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Checking sequences from group: " + pDataArray->fileGroup[thisFastaName] + "."); pDataArray->m->mothurOutEndLine();
367 //int numSeqs = driver(lines[0], thisoutputFileName, thisFastaName, thisaccnosFileName, thistrimFastaFileName, thisPriority);
368 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
371 pDataArray->m->openOutputFile(thisoutputFileName, out);
374 pDataArray->m->openOutputFile(thisaccnosFileName, out2);
377 if (pDataArray->trim) { pDataArray->m->openOutputFile(thistrimFastaFileName, out3); }
380 pDataArray->m->openInputFile(thisFastaName, inFASTA);
383 chimera = new ChimeraSlayer(thisFastaName, pDataArray->templatefile, pDataArray->trim, thisPriority, pDataArray->search, pDataArray->ksize, pDataArray->match, pDataArray->mismatch, pDataArray->window, pDataArray->divR, pDataArray->minSimilarity, pDataArray->minCoverage, pDataArray->minBS, pDataArray->minSNP, pDataArray->parents, pDataArray->iters, pDataArray->increment, pDataArray->numwanted, pDataArray->realign, pDataArray->blastlocation, pDataArray->threadId);
384 chimera->printHeader(out);
388 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete chimera; return 0; }
390 if (chimera->getUnaligned()) {
391 pDataArray->m->mothurOut("Your template sequences are different lengths, please correct."); pDataArray->m->mothurOutEndLine();
392 out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close();
396 int templateSeqsLength = chimera->getLength();
401 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete chimera; return 1; }
403 Sequence* candidateSeq = new Sequence(inFASTA); pDataArray->m->gobble(inFASTA);
404 string candidateAligned = candidateSeq->getAligned();
406 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
407 if (candidateSeq->getAligned().length() != templateSeqsLength) {
408 pDataArray->m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); pDataArray->m->mothurOutEndLine();
411 chimera->getChimeras(candidateSeq);
413 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete candidateSeq; delete chimera; return 1; }
415 //if you are not chimeric, then check each half
416 data_results wholeResults = chimera->getResults();
418 //determine if we need to split
419 bool isChimeric = false;
421 if (wholeResults.flag == "yes") {
422 string chimeraFlag = "no";
423 if( (wholeResults.results[0].bsa >= pDataArray->minBS && wholeResults.results[0].divr_qla_qrb >= pDataArray->divR)
425 (wholeResults.results[0].bsb >= pDataArray->minBS && wholeResults.results[0].divr_qlb_qra >= pDataArray->divR) ) { chimeraFlag = "yes"; }
428 if (chimeraFlag == "yes") {
429 if ((wholeResults.results[0].bsa >= pDataArray->minBS) || (wholeResults.results[0].bsb >= pDataArray->minBS)) { isChimeric = true; }
433 if ((!isChimeric) && pDataArray->trimera) {
435 //split sequence in half by bases
436 string leftQuery, rightQuery;
437 Sequence tempSeq(candidateSeq->getName(), candidateAligned);
438 //divideInHalf(tempSeq, leftQuery, rightQuery);
439 string queryUnAligned = tempSeq.getUnaligned();
440 int numBases = int(queryUnAligned.length() * 0.5);
442 string queryAligned = tempSeq.getAligned();
443 leftQuery = tempSeq.getAligned();
444 rightQuery = tempSeq.getAligned();
448 for (int i = 0; i < queryAligned.length(); i++) {
450 if (isalpha(queryAligned[i])) {
455 if (baseCount >= numBases) { leftSpot = i; break; } //first half
458 //blank out right side
459 for (int i = leftSpot; i < leftQuery.length(); i++) { leftQuery[i] = '.'; }
461 //blank out left side
462 for (int i = 0; i < leftSpot; i++) { rightQuery[i] = '.'; }
464 //run chimeraSlayer on each piece
465 Sequence* left = new Sequence(candidateSeq->getName(), leftQuery);
466 Sequence* right = new Sequence(candidateSeq->getName(), rightQuery);
469 chimera->getChimeras(left);
470 data_results leftResults = chimera->getResults();
472 chimera->getChimeras(right);
473 data_results rightResults = chimera->getResults();
475 //if either piece is chimeric then report
476 Sequence trimmed = chimera->print(out, out2, leftResults, rightResults);
477 if (pDataArray->trim) { trimmed.printSequence(out3); }
479 delete left; delete right;
481 }else { //already chimeric
483 Sequence trimmed = chimera->print(out, out2);
484 if (pDataArray->trim) { trimmed.printSequence(out3); }
494 if (inFASTA.eof()) { break; }
497 if((numSeqs) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(numSeqs)); pDataArray->m->mothurOutEndLine(); }
500 if((numSeqs) % 100 != 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(numSeqs)); pDataArray->m->mothurOutEndLine(); }
502 pDataArray->numNoParents = chimera->getNumNoParents();
503 if (pDataArray->numNoParents == numSeqs) { pDataArray->m->mothurOut("[WARNING]: megablast returned 0 potential parents for all your sequences. This could be due to formatdb.exe not being setup properly, please check formatdb.log for errors.\n"); }
507 if (pDataArray->trim) { out3.close(); }
512 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
515 pDataArray->m->appendFiles(thisoutputFileName, pDataArray->outputFName); pDataArray->m->mothurRemove(thisoutputFileName);
516 pDataArray->m->appendFiles(thisaccnosFileName, pDataArray->accnos); pDataArray->m->mothurRemove(thisaccnosFileName);
517 if (pDataArray->trim) { pDataArray->m->appendFiles(thistrimFastaFileName, pDataArray->fasta); pDataArray->m->mothurRemove(thistrimFastaFileName); }
518 pDataArray->m->mothurRemove(thisFastaName);
520 totalSeqs += numSeqs;
522 pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + pDataArray->fileGroup[thisFastaName] + "."); pDataArray->m->mothurOutEndLine();
525 pDataArray->count = totalSeqs;
530 catch(exception& e) {
531 pDataArray->m->errorOut(e, "ChimeraSlayerCommand", "MySlayerGroupThreadFunction");
538 /**************************************************************************************************/