1 #ifndef CHIMERASLAYERCOMMAND_H
2 #define CHIMERASLAYERCOMMAND_H
5 * chimeraslayercommand.h
8 * Created by westcott on 3/31/10.
9 * Copyright 2010 Schloss Lab. All rights reserved.
14 #include "command.hpp"
16 #include "chimeraslayer.h"
17 #include "sequenceparser.h"
18 #include "sequencecountparser.h"
20 /***********************************************************/
22 class ChimeraSlayerCommand : public Command {
24 ChimeraSlayerCommand(string);
25 ChimeraSlayerCommand();
26 ~ChimeraSlayerCommand() {}
28 vector<string> setParameters();
29 string getCommandName() { return "chimera.slayer"; }
30 string getCommandCategory() { return "Sequence Processing"; }
32 string getHelpString();
33 string getOutputPattern(string);
34 string getCitation() { return "Haas BJ, Gevers D, Earl A, Feldgarden M, Ward DV, Giannokous G, Ciulla D, Tabbaa D, Highlander SK, Sodergren E, Methe B, Desantis TZ, Petrosino JF, Knight R, Birren BW (2011). Chimeric 16S rRNA sequence formation and detection in Sanger and 454-pyrosequenced PCR amplicons. Genome Res 21:494.\nhttp://www.mothur.org/wiki/Chimera.slayer"; }
35 string getDescription() { return "detect chimeric sequences"; }
38 void help() { m->mothurOut(getHelpString()); }
43 unsigned long long start;
44 unsigned long long end;
45 linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
48 vector<int> processIDS; //processid
49 vector<linePair> lines;
51 int driver(linePair, string, string, string, string, map<string, int>&);
52 int createProcesses(string, string, string, string, map<string, int>&);
53 int divideInHalf(Sequence, string&, string&);
54 map<string, int> sortFastaFile(string, string);
55 map<string, int> sortFastaFile(vector<Sequence>&, map<string, string>&, string newFile);
56 int sortFastaFile(vector<Sequence>&, map<string, int>&, string newFile);
57 string getNamesFile(string&);
58 //int setupChimera(string,);
59 int MPIExecute(string, string, string, string, map<string, int>&);
60 int deconvoluteResults(map<string, string>&, string, string, string);
61 map<string, int> priority;
62 int setUpForSelfReference(SequenceParser*&, map<string, string>&, map<string, map<string, int> >&, int);
63 int setUpForSelfReference(SequenceCountParser*&, map<string, string>&, map<string, map<string, int> >&, int);
64 int driverGroups(string, string, string, map<string, map<string, int> >&, map<string, string>&);
65 int createProcessesGroups(string, string, string, map<string, map<string, int> >&, map<string, string>&);
66 int MPIExecuteGroups(string, string, string, map<string, map<string, int> >&, map<string, string>&);
70 int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&, string, map<string, int>&, bool);
73 bool abort, realign, trim, trimera, save, hasName, hasCount, dups;
74 string fastafile, groupfile, templatefile, outputDir, search, namefile, countfile, blastlocation;
75 int processors, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity, minCoverage, minBS, minSNP, numSeqs, templateSeqsLength;
78 vector<string> outputNames;
79 vector<string> fastaFileNames;
80 vector<string> nameFileNames;
81 vector<string> groupFileNames;
85 /***********************************************************/
87 //custom data structure for threads to use.
88 // This is passed by void pointer so it can be any data type
89 // that can be passed using a single void pointer (LPVOID).
100 unsigned long long start;
101 unsigned long long end;
102 int ksize, match, mismatch, window, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted;
105 map<string, int> priority;
109 map<string, map<string, int> > fileToPriority;
110 map<string, string> fileGroup;
113 slayerData(string o, string fa, string ac, string f, string te, string se, string bl, bool tri, bool trm, bool re, MothurOut* mout, unsigned long long st, unsigned long long en, int ks, int ma, int mis, int win, int minS, int minC, int miBS, int minSN, int par, int it, int inc, int numw, float div, map<string, int> prior, int tid) {
131 minSimilarity = minS;
145 slayerData(string o, string fa, string ac, string te, string se, string bl, bool tri, bool trm, bool re, MothurOut* mout, map<string, map<string, int> >& fPriority, map<string, string>& fileG, int ks, int ma, int mis, int win, int minS, int minC, int miBS, int minSN, int par, int it, int inc, int numw, float div, map<string, int> prior, int tid) {
157 fileToPriority = fPriority;
162 minSimilarity = minS;
179 /**************************************************************************************************/
180 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
182 static DWORD WINAPI MySlayerThreadFunction(LPVOID lpParam){
183 slayerData* pDataArray;
184 pDataArray = (slayerData*)lpParam;
188 pDataArray->m->openOutputFile(pDataArray->outputFName, out);
191 pDataArray->m->openOutputFile(pDataArray->accnos, out2);
194 if (pDataArray->trim) { pDataArray->m->openOutputFile(pDataArray->fasta, out3); }
197 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
202 if (pDataArray->templatefile != "self") { //you want to run slayer with a reference template
203 chimera = new ChimeraSlayer(pDataArray->filename, pDataArray->templatefile, pDataArray->trim, pDataArray->search, pDataArray->ksize, pDataArray->match, pDataArray->mismatch, pDataArray->window, pDataArray->divR, pDataArray->minSimilarity, pDataArray->minCoverage, pDataArray->minBS, pDataArray->minSNP, pDataArray->parents, pDataArray->iters, pDataArray->increment, pDataArray->numwanted, pDataArray->realign, pDataArray->blastlocation, pDataArray->threadId);
205 chimera = new ChimeraSlayer(pDataArray->filename, pDataArray->templatefile, pDataArray->trim, pDataArray->priority, pDataArray->search, pDataArray->ksize, pDataArray->match, pDataArray->mismatch, pDataArray->window, pDataArray->divR, pDataArray->minSimilarity, pDataArray->minCoverage, pDataArray->minBS, pDataArray->minSNP, pDataArray->parents, pDataArray->iters, pDataArray->increment, pDataArray->numwanted, pDataArray->realign, pDataArray->blastlocation, pDataArray->threadId);
208 //print header if you are process 0
209 if ((pDataArray->start == 0) || (pDataArray->start == 1)) {
210 chimera->printHeader(out);
212 }else { //this accounts for the difference in line endings.
213 inFASTA.seekg(pDataArray->start-1); pDataArray->m->gobble(inFASTA);
216 pDataArray->count = pDataArray->end;
218 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete chimera; return 0; }
220 if (chimera->getUnaligned()) {
221 pDataArray->m->mothurOut("Your template sequences are different lengths, please correct."); pDataArray->m->mothurOutEndLine();
222 out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close();
226 int templateSeqsLength = chimera->getLength();
228 if (pDataArray->start == 0) { chimera->printHeader(out); }
231 for(int i = 0; i < pDataArray->end; i++){
233 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete chimera; return 1; }
235 Sequence* candidateSeq = new Sequence(inFASTA); pDataArray->m->gobble(inFASTA);
236 string candidateAligned = candidateSeq->getAligned();
238 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
239 if (candidateSeq->getAligned().length() != templateSeqsLength) {
240 pDataArray->m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); pDataArray->m->mothurOutEndLine();
243 chimera->getChimeras(candidateSeq);
245 if (pDataArray->m->control_pressed) { delete candidateSeq; delete chimera; return 1; }
247 //if you are not chimeric, then check each half
248 data_results wholeResults = chimera->getResults();
250 //determine if we need to split
251 bool isChimeric = false;
253 if (wholeResults.flag == "yes") {
254 string chimeraFlag = "no";
255 if( (wholeResults.results[0].bsa >= pDataArray->minBS && wholeResults.results[0].divr_qla_qrb >= pDataArray->divR)
257 (wholeResults.results[0].bsb >= pDataArray->minBS && wholeResults.results[0].divr_qlb_qra >= pDataArray->divR) ) { chimeraFlag = "yes"; }
260 if (chimeraFlag == "yes") {
261 if ((wholeResults.results[0].bsa >= pDataArray->minBS) || (wholeResults.results[0].bsb >= pDataArray->minBS)) { isChimeric = true; }
265 if ((!isChimeric) && pDataArray->trimera) {
267 //split sequence in half by bases
268 string leftQuery, rightQuery;
269 Sequence tempSeq(candidateSeq->getName(), candidateAligned);
270 //divideInHalf(tempSeq, leftQuery, rightQuery);
271 string queryUnAligned = tempSeq.getUnaligned();
272 int numBases = int(queryUnAligned.length() * 0.5);
274 string queryAligned = tempSeq.getAligned();
275 leftQuery = tempSeq.getAligned();
276 rightQuery = tempSeq.getAligned();
280 for (int i = 0; i < queryAligned.length(); i++) {
282 if (isalpha(queryAligned[i])) {
287 if (baseCount >= numBases) { leftSpot = i; break; } //first half
290 //blank out right side
291 for (int i = leftSpot; i < leftQuery.length(); i++) { leftQuery[i] = '.'; }
293 //blank out left side
294 for (int i = 0; i < leftSpot; i++) { rightQuery[i] = '.'; }
296 //run chimeraSlayer on each piece
297 Sequence* left = new Sequence(candidateSeq->getName(), leftQuery);
298 Sequence* right = new Sequence(candidateSeq->getName(), rightQuery);
301 chimera->getChimeras(left);
302 data_results leftResults = chimera->getResults();
304 chimera->getChimeras(right);
305 data_results rightResults = chimera->getResults();
307 //if either piece is chimeric then report
308 Sequence trimmed = chimera->print(out, out2, leftResults, rightResults);
309 if (pDataArray->trim) { trimmed.printSequence(out3); }
311 delete left; delete right;
313 }else { //already chimeric
315 Sequence trimmed = chimera->print(out, out2);
316 if (pDataArray->trim) { trimmed.printSequence(out3); }
326 if((count) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(count)); pDataArray->m->mothurOutEndLine(); }
329 if((count) % 100 != 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(count)); pDataArray->m->mothurOutEndLine(); }
331 pDataArray->numNoParents = chimera->getNumNoParents();
332 if (pDataArray->numNoParents == count) { pDataArray->m->mothurOut("[WARNING]: megablast returned 0 potential parents for all your sequences. This could be due to formatdb.exe not being setup properly, please check formatdb.log for errors.\n"); }
336 if (pDataArray->trim) { out3.close(); }
343 catch(exception& e) {
344 pDataArray->m->errorOut(e, "ChimeraSlayerCommand", "MySlayerThreadFunction");
349 /**************************************************************************************************/
351 static DWORD WINAPI MySlayerGroupThreadFunction(LPVOID lpParam){
352 slayerData* pDataArray;
353 pDataArray = (slayerData*)lpParam;
359 for (map<string, map<string, int> >::iterator itFile = pDataArray->fileToPriority.begin(); itFile != pDataArray->fileToPriority.end(); itFile++) {
361 if (pDataArray->m->control_pressed) { return 0; }
363 int start = time(NULL);
364 string thisFastaName = itFile->first;
365 map<string, int> thisPriority = itFile->second;
366 string thisoutputFileName = pDataArray->m->getRootName(pDataArray->m->getSimpleName(thisFastaName)) + pDataArray->fileGroup[thisFastaName] + "slayer.chimera";
367 string thisaccnosFileName = pDataArray->m->getRootName(pDataArray->m->getSimpleName(thisFastaName)) + pDataArray->fileGroup[thisFastaName] + "slayer.accnos";
368 string thistrimFastaFileName = pDataArray->m->getRootName(pDataArray->m->getSimpleName(thisFastaName)) + pDataArray->fileGroup[thisFastaName] + "slayer.fasta";
370 pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Checking sequences from group: " + pDataArray->fileGroup[thisFastaName] + "."); pDataArray->m->mothurOutEndLine();
372 //int numSeqs = driver(lines[0], thisoutputFileName, thisFastaName, thisaccnosFileName, thistrimFastaFileName, thisPriority);
373 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
376 pDataArray->m->openOutputFile(thisoutputFileName, out);
379 pDataArray->m->openOutputFile(thisaccnosFileName, out2);
382 if (pDataArray->trim) { pDataArray->m->openOutputFile(thistrimFastaFileName, out3); }
385 pDataArray->m->openInputFile(thisFastaName, inFASTA);
388 chimera = new ChimeraSlayer(thisFastaName, pDataArray->templatefile, pDataArray->trim, thisPriority, pDataArray->search, pDataArray->ksize, pDataArray->match, pDataArray->mismatch, pDataArray->window, pDataArray->divR, pDataArray->minSimilarity, pDataArray->minCoverage, pDataArray->minBS, pDataArray->minSNP, pDataArray->parents, pDataArray->iters, pDataArray->increment, pDataArray->numwanted, pDataArray->realign, pDataArray->blastlocation, pDataArray->threadId);
389 chimera->printHeader(out);
393 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete chimera; return 0; }
395 if (chimera->getUnaligned()) {
396 pDataArray->m->mothurOut("Your template sequences are different lengths, please correct."); pDataArray->m->mothurOutEndLine();
397 out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close();
401 int templateSeqsLength = chimera->getLength();
406 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete chimera; return 1; }
408 Sequence* candidateSeq = new Sequence(inFASTA); pDataArray->m->gobble(inFASTA);
409 string candidateAligned = candidateSeq->getAligned();
411 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
412 if (candidateSeq->getAligned().length() != templateSeqsLength) {
413 pDataArray->m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); pDataArray->m->mothurOutEndLine();
416 chimera->getChimeras(candidateSeq);
418 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete candidateSeq; delete chimera; return 1; }
420 //if you are not chimeric, then check each half
421 data_results wholeResults = chimera->getResults();
423 //determine if we need to split
424 bool isChimeric = false;
426 if (wholeResults.flag == "yes") {
427 string chimeraFlag = "no";
428 if( (wholeResults.results[0].bsa >= pDataArray->minBS && wholeResults.results[0].divr_qla_qrb >= pDataArray->divR)
430 (wholeResults.results[0].bsb >= pDataArray->minBS && wholeResults.results[0].divr_qlb_qra >= pDataArray->divR) ) { chimeraFlag = "yes"; }
433 if (chimeraFlag == "yes") {
434 if ((wholeResults.results[0].bsa >= pDataArray->minBS) || (wholeResults.results[0].bsb >= pDataArray->minBS)) { isChimeric = true; }
438 if ((!isChimeric) && pDataArray->trimera) {
440 //split sequence in half by bases
441 string leftQuery, rightQuery;
442 Sequence tempSeq(candidateSeq->getName(), candidateAligned);
443 //divideInHalf(tempSeq, leftQuery, rightQuery);
444 string queryUnAligned = tempSeq.getUnaligned();
445 int numBases = int(queryUnAligned.length() * 0.5);
447 string queryAligned = tempSeq.getAligned();
448 leftQuery = tempSeq.getAligned();
449 rightQuery = tempSeq.getAligned();
453 for (int i = 0; i < queryAligned.length(); i++) {
455 if (isalpha(queryAligned[i])) {
460 if (baseCount >= numBases) { leftSpot = i; break; } //first half
463 //blank out right side
464 for (int i = leftSpot; i < leftQuery.length(); i++) { leftQuery[i] = '.'; }
466 //blank out left side
467 for (int i = 0; i < leftSpot; i++) { rightQuery[i] = '.'; }
469 //run chimeraSlayer on each piece
470 Sequence* left = new Sequence(candidateSeq->getName(), leftQuery);
471 Sequence* right = new Sequence(candidateSeq->getName(), rightQuery);
474 chimera->getChimeras(left);
475 data_results leftResults = chimera->getResults();
477 chimera->getChimeras(right);
478 data_results rightResults = chimera->getResults();
480 //if either piece is chimeric then report
481 Sequence trimmed = chimera->print(out, out2, leftResults, rightResults);
482 if (pDataArray->trim) { trimmed.printSequence(out3); }
484 delete left; delete right;
486 }else { //already chimeric
488 Sequence trimmed = chimera->print(out, out2);
489 if (pDataArray->trim) { trimmed.printSequence(out3); }
499 if (inFASTA.eof()) { break; }
502 if((numSeqs) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(numSeqs)); pDataArray->m->mothurOutEndLine(); }
505 if((numSeqs) % 100 != 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(numSeqs)); pDataArray->m->mothurOutEndLine(); }
507 pDataArray->numNoParents = chimera->getNumNoParents();
508 if (pDataArray->numNoParents == numSeqs) { pDataArray->m->mothurOut("[WARNING]: megablast returned 0 potential parents for all your sequences. This could be due to formatdb.exe not being setup properly, please check formatdb.log for errors.\n"); }
512 if (pDataArray->trim) { out3.close(); }
517 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
520 pDataArray->m->appendFiles(thisoutputFileName, pDataArray->outputFName); pDataArray->m->mothurRemove(thisoutputFileName);
521 pDataArray->m->appendFiles(thisaccnosFileName, pDataArray->accnos); pDataArray->m->mothurRemove(thisaccnosFileName);
522 if (pDataArray->trim) { pDataArray->m->appendFiles(thistrimFastaFileName, pDataArray->fasta); pDataArray->m->mothurRemove(thistrimFastaFileName); }
523 pDataArray->m->mothurRemove(thisFastaName);
525 totalSeqs += numSeqs;
527 pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + pDataArray->fileGroup[thisFastaName] + "."); pDataArray->m->mothurOutEndLine();
530 pDataArray->count = totalSeqs;
535 catch(exception& e) {
536 pDataArray->m->errorOut(e, "ChimeraSlayerCommand", "MySlayerGroupThreadFunction");
543 /**************************************************************************************************/