1 #ifndef CHIMERASLAYERCOMMAND_H
2 #define CHIMERASLAYERCOMMAND_H
5 * chimeraslayercommand.h
8 * Created by westcott on 3/31/10.
9 * Copyright 2010 Schloss Lab. All rights reserved.
14 #include "command.hpp"
16 #include "chimeraslayer.h"
17 #include "sequenceparser.h"
18 #include "sequencecountparser.h"
20 /***********************************************************/
22 class ChimeraSlayerCommand : public Command {
24 ChimeraSlayerCommand(string);
25 ChimeraSlayerCommand();
26 ~ChimeraSlayerCommand() {}
28 vector<string> setParameters();
29 string getCommandName() { return "chimera.slayer"; }
30 string getCommandCategory() { return "Sequence Processing"; }
32 string getHelpString();
33 string getOutputPattern(string);
34 string getCitation() { return "Haas BJ, Gevers D, Earl A, Feldgarden M, Ward DV, Giannokous G, Ciulla D, Tabbaa D, Highlander SK, Sodergren E, Methe B, Desantis TZ, Petrosino JF, Knight R, Birren BW (2011). Chimeric 16S rRNA sequence formation and detection in Sanger and 454-pyrosequenced PCR amplicons. Genome Res 21:494.\nhttp://www.mothur.org/wiki/Chimera.slayer"; }
35 string getDescription() { return "detect chimeric sequences"; }
38 void help() { m->mothurOut(getHelpString()); }
43 unsigned long long start;
44 unsigned long long end;
45 linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
48 vector<int> processIDS; //processid
49 vector<linePair> lines;
51 int driver(linePair, string, string, string, string, map<string, int>&);
52 int createProcesses(string, string, string, string, map<string, int>&);
53 int divideInHalf(Sequence, string&, string&);
54 map<string, int> sortFastaFile(string, string);
55 map<string, int> sortFastaFile(vector<Sequence>&, map<string, string>&, string newFile);
56 int sortFastaFile(vector<Sequence>&, map<string, int>&, string newFile);
57 string getNamesFile(string&);
58 //int setupChimera(string,);
59 int MPIExecute(string, string, string, string, map<string, int>&);
60 int deconvoluteResults(map<string, string>&, string, string, string);
61 map<string, int> priority;
62 int setUpForSelfReference(SequenceParser*&, map<string, string>&, map<string, map<string, int> >&, int);
63 int setUpForSelfReference(SequenceCountParser*&, map<string, string>&, map<string, map<string, int> >&, int);
64 int driverGroups(string, string, string, map<string, map<string, int> >&, map<string, string>&);
65 int createProcessesGroups(string, string, string, map<string, map<string, int> >&, map<string, string>&);
66 int MPIExecuteGroups(string, string, string, map<string, map<string, int> >&, map<string, string>&);
70 int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&, string, map<string, int>&, bool);
73 bool abort, realign, trim, trimera, save, hasName, hasCount, dups;
74 string fastafile, groupfile, templatefile, outputDir, search, namefile, countfile, blastlocation;
75 int processors, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity, minCoverage, minBS, minSNP, numSeqs, templateSeqsLength;
78 vector<string> outputNames;
79 vector<string> fastaFileNames;
80 vector<string> nameFileNames;
81 vector<string> groupFileNames;
85 /***********************************************************/
87 //custom data structure for threads to use.
88 // This is passed by void pointer so it can be any data type
89 // that can be passed using a single void pointer (LPVOID).
100 unsigned long long start;
101 unsigned long long end;
102 int ksize, match, mismatch, window, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted;
105 map<string, int> priority;
109 map<string, map<string, int> > fileToPriority;
110 map<string, string> fileGroup;
113 slayerData(string o, string fa, string ac, string f, string te, string se, string bl, bool tri, bool trm, bool re, MothurOut* mout, unsigned long long st, unsigned long long en, int ks, int ma, int mis, int win, int minS, int minC, int miBS, int minSN, int par, int it, int inc, int numw, float div, map<string, int> prior, int tid) {
131 minSimilarity = minS;
145 slayerData(string o, string fa, string ac, string te, string se, string bl, bool tri, bool trm, bool re, MothurOut* mout, map<string, map<string, int> >& fPriority, map<string, string>& fileG, int ks, int ma, int mis, int win, int minS, int minC, int miBS, int minSN, int par, int it, int inc, int numw, float div, map<string, int> prior, int tid) {
157 fileToPriority = fPriority;
162 minSimilarity = minS;
179 /**************************************************************************************************/
180 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
182 static DWORD WINAPI MySlayerThreadFunction(LPVOID lpParam){
183 slayerData* pDataArray;
184 pDataArray = (slayerData*)lpParam;
188 pDataArray->m->openOutputFile(pDataArray->outputFName, out);
191 pDataArray->m->openOutputFile(pDataArray->accnos, out2);
194 if (pDataArray->trim) { pDataArray->m->openOutputFile(pDataArray->fasta, out3); }
197 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
202 if (pDataArray->templatefile != "self") { //you want to run slayer with a reference template
203 chimera = new ChimeraSlayer(pDataArray->filename, pDataArray->templatefile, pDataArray->trim, pDataArray->search, pDataArray->ksize, pDataArray->match, pDataArray->mismatch, pDataArray->window, pDataArray->divR, pDataArray->minSimilarity, pDataArray->minCoverage, pDataArray->minBS, pDataArray->minSNP, pDataArray->parents, pDataArray->iters, pDataArray->increment, pDataArray->numwanted, pDataArray->realign, pDataArray->blastlocation, pDataArray->threadId);
205 chimera = new ChimeraSlayer(pDataArray->filename, pDataArray->templatefile, pDataArray->trim, pDataArray->priority, pDataArray->search, pDataArray->ksize, pDataArray->match, pDataArray->mismatch, pDataArray->window, pDataArray->divR, pDataArray->minSimilarity, pDataArray->minCoverage, pDataArray->minBS, pDataArray->minSNP, pDataArray->parents, pDataArray->iters, pDataArray->increment, pDataArray->numwanted, pDataArray->realign, pDataArray->blastlocation, pDataArray->threadId);
208 //print header if you are process 0
209 if ((pDataArray->start == 0) || (pDataArray->start == 1)) {
210 chimera->printHeader(out);
212 }else { //this accounts for the difference in line endings.
213 inFASTA.seekg(pDataArray->start-1); pDataArray->m->gobble(inFASTA);
216 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete chimera; return 0; }
218 if (chimera->getUnaligned()) {
219 pDataArray->m->mothurOut("Your template sequences are different lengths, please correct."); pDataArray->m->mothurOutEndLine();
220 out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close();
224 int templateSeqsLength = chimera->getLength();
226 if (pDataArray->start == 0) { chimera->printHeader(out); }
228 pDataArray->count = 0;
229 for(int i = 0; i < pDataArray->end; i++){
231 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete chimera; return 1; }
233 Sequence* candidateSeq = new Sequence(inFASTA); pDataArray->m->gobble(inFASTA);
234 string candidateAligned = candidateSeq->getAligned();
236 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
237 if (candidateSeq->getAligned().length() != templateSeqsLength) {
238 pDataArray->m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); pDataArray->m->mothurOutEndLine();
241 chimera->getChimeras(candidateSeq);
243 if (pDataArray->m->control_pressed) { delete candidateSeq; delete chimera; return 1; }
245 //if you are not chimeric, then check each half
246 data_results wholeResults = chimera->getResults();
248 //determine if we need to split
249 bool isChimeric = false;
251 if (wholeResults.flag == "yes") {
252 string chimeraFlag = "no";
253 if( (wholeResults.results[0].bsa >= pDataArray->minBS && wholeResults.results[0].divr_qla_qrb >= pDataArray->divR)
255 (wholeResults.results[0].bsb >= pDataArray->minBS && wholeResults.results[0].divr_qlb_qra >= pDataArray->divR) ) { chimeraFlag = "yes"; }
258 if (chimeraFlag == "yes") {
259 if ((wholeResults.results[0].bsa >= pDataArray->minBS) || (wholeResults.results[0].bsb >= pDataArray->minBS)) { isChimeric = true; }
263 if ((!isChimeric) && pDataArray->trimera) {
265 //split sequence in half by bases
266 string leftQuery, rightQuery;
267 Sequence tempSeq(candidateSeq->getName(), candidateAligned);
268 //divideInHalf(tempSeq, leftQuery, rightQuery);
269 string queryUnAligned = tempSeq.getUnaligned();
270 int numBases = int(queryUnAligned.length() * 0.5);
272 string queryAligned = tempSeq.getAligned();
273 leftQuery = tempSeq.getAligned();
274 rightQuery = tempSeq.getAligned();
278 for (int i = 0; i < queryAligned.length(); i++) {
280 if (isalpha(queryAligned[i])) {
285 if (baseCount >= numBases) { leftSpot = i; break; } //first half
288 //blank out right side
289 for (int i = leftSpot; i < leftQuery.length(); i++) { leftQuery[i] = '.'; }
291 //blank out left side
292 for (int i = 0; i < leftSpot; i++) { rightQuery[i] = '.'; }
294 //run chimeraSlayer on each piece
295 Sequence* left = new Sequence(candidateSeq->getName(), leftQuery);
296 Sequence* right = new Sequence(candidateSeq->getName(), rightQuery);
299 chimera->getChimeras(left);
300 data_results leftResults = chimera->getResults();
302 chimera->getChimeras(right);
303 data_results rightResults = chimera->getResults();
305 //if either piece is chimeric then report
306 Sequence trimmed = chimera->print(out, out2, leftResults, rightResults);
307 if (pDataArray->trim) { trimmed.printSequence(out3); }
309 delete left; delete right;
311 }else { //already chimeric
313 Sequence trimmed = chimera->print(out, out2);
314 if (pDataArray->trim) { trimmed.printSequence(out3); }
324 if((pDataArray->count) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
327 if((pDataArray->count) % 100 != 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
329 pDataArray->numNoParents = chimera->getNumNoParents();
330 if (pDataArray->numNoParents == pDataArray->count) { pDataArray->m->mothurOut("[WARNING]: megablast returned 0 potential parents for all your sequences. This could be due to formatdb.exe not being setup properly, please check formatdb.log for errors.\n"); }
334 if (pDataArray->trim) { out3.close(); }
341 catch(exception& e) {
342 pDataArray->m->errorOut(e, "ChimeraSlayerCommand", "MySlayerThreadFunction");
347 /**************************************************************************************************/
349 static DWORD WINAPI MySlayerGroupThreadFunction(LPVOID lpParam){
350 slayerData* pDataArray;
351 pDataArray = (slayerData*)lpParam;
357 for (map<string, map<string, int> >::iterator itFile = pDataArray->fileToPriority.begin(); itFile != pDataArray->fileToPriority.end(); itFile++) {
359 if (pDataArray->m->control_pressed) { return 0; }
361 int start = time(NULL);
362 string thisFastaName = itFile->first;
363 map<string, int> thisPriority = itFile->second;
364 string thisoutputFileName = pDataArray->m->getRootName(pDataArray->m->getSimpleName(thisFastaName)) + pDataArray->fileGroup[thisFastaName] + "slayer.chimera";
365 string thisaccnosFileName = pDataArray->m->getRootName(pDataArray->m->getSimpleName(thisFastaName)) + pDataArray->fileGroup[thisFastaName] + "slayer.accnos";
366 string thistrimFastaFileName = pDataArray->m->getRootName(pDataArray->m->getSimpleName(thisFastaName)) + pDataArray->fileGroup[thisFastaName] + "slayer.fasta";
368 pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Checking sequences from group: " + pDataArray->fileGroup[thisFastaName] + "."); pDataArray->m->mothurOutEndLine();
370 //int numSeqs = driver(lines[0], thisoutputFileName, thisFastaName, thisaccnosFileName, thistrimFastaFileName, thisPriority);
371 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
374 pDataArray->m->openOutputFile(thisoutputFileName, out);
377 pDataArray->m->openOutputFile(thisaccnosFileName, out2);
380 if (pDataArray->trim) { pDataArray->m->openOutputFile(thistrimFastaFileName, out3); }
383 pDataArray->m->openInputFile(thisFastaName, inFASTA);
386 chimera = new ChimeraSlayer(thisFastaName, pDataArray->templatefile, pDataArray->trim, thisPriority, pDataArray->search, pDataArray->ksize, pDataArray->match, pDataArray->mismatch, pDataArray->window, pDataArray->divR, pDataArray->minSimilarity, pDataArray->minCoverage, pDataArray->minBS, pDataArray->minSNP, pDataArray->parents, pDataArray->iters, pDataArray->increment, pDataArray->numwanted, pDataArray->realign, pDataArray->blastlocation, pDataArray->threadId);
387 chimera->printHeader(out);
391 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete chimera; return 0; }
393 if (chimera->getUnaligned()) {
394 pDataArray->m->mothurOut("Your template sequences are different lengths, please correct."); pDataArray->m->mothurOutEndLine();
395 out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close();
399 int templateSeqsLength = chimera->getLength();
404 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete chimera; return 1; }
406 Sequence* candidateSeq = new Sequence(inFASTA); pDataArray->m->gobble(inFASTA);
407 string candidateAligned = candidateSeq->getAligned();
409 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
410 if (candidateSeq->getAligned().length() != templateSeqsLength) {
411 pDataArray->m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); pDataArray->m->mothurOutEndLine();
414 chimera->getChimeras(candidateSeq);
416 if (pDataArray->m->control_pressed) { out.close(); out2.close(); if (pDataArray->trim) { out3.close(); } inFASTA.close(); delete candidateSeq; delete chimera; return 1; }
418 //if you are not chimeric, then check each half
419 data_results wholeResults = chimera->getResults();
421 //determine if we need to split
422 bool isChimeric = false;
424 if (wholeResults.flag == "yes") {
425 string chimeraFlag = "no";
426 if( (wholeResults.results[0].bsa >= pDataArray->minBS && wholeResults.results[0].divr_qla_qrb >= pDataArray->divR)
428 (wholeResults.results[0].bsb >= pDataArray->minBS && wholeResults.results[0].divr_qlb_qra >= pDataArray->divR) ) { chimeraFlag = "yes"; }
431 if (chimeraFlag == "yes") {
432 if ((wholeResults.results[0].bsa >= pDataArray->minBS) || (wholeResults.results[0].bsb >= pDataArray->minBS)) { isChimeric = true; }
436 if ((!isChimeric) && pDataArray->trimera) {
438 //split sequence in half by bases
439 string leftQuery, rightQuery;
440 Sequence tempSeq(candidateSeq->getName(), candidateAligned);
441 //divideInHalf(tempSeq, leftQuery, rightQuery);
442 string queryUnAligned = tempSeq.getUnaligned();
443 int numBases = int(queryUnAligned.length() * 0.5);
445 string queryAligned = tempSeq.getAligned();
446 leftQuery = tempSeq.getAligned();
447 rightQuery = tempSeq.getAligned();
451 for (int i = 0; i < queryAligned.length(); i++) {
453 if (isalpha(queryAligned[i])) {
458 if (baseCount >= numBases) { leftSpot = i; break; } //first half
461 //blank out right side
462 for (int i = leftSpot; i < leftQuery.length(); i++) { leftQuery[i] = '.'; }
464 //blank out left side
465 for (int i = 0; i < leftSpot; i++) { rightQuery[i] = '.'; }
467 //run chimeraSlayer on each piece
468 Sequence* left = new Sequence(candidateSeq->getName(), leftQuery);
469 Sequence* right = new Sequence(candidateSeq->getName(), rightQuery);
472 chimera->getChimeras(left);
473 data_results leftResults = chimera->getResults();
475 chimera->getChimeras(right);
476 data_results rightResults = chimera->getResults();
478 //if either piece is chimeric then report
479 Sequence trimmed = chimera->print(out, out2, leftResults, rightResults);
480 if (pDataArray->trim) { trimmed.printSequence(out3); }
482 delete left; delete right;
484 }else { //already chimeric
486 Sequence trimmed = chimera->print(out, out2);
487 if (pDataArray->trim) { trimmed.printSequence(out3); }
497 if (inFASTA.eof()) { break; }
500 if((numSeqs) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(numSeqs)); pDataArray->m->mothurOutEndLine(); }
503 if((numSeqs) % 100 != 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(numSeqs)); pDataArray->m->mothurOutEndLine(); }
505 pDataArray->numNoParents = chimera->getNumNoParents();
506 if (pDataArray->numNoParents == numSeqs) { pDataArray->m->mothurOut("[WARNING]: megablast returned 0 potential parents for all your sequences. This could be due to formatdb.exe not being setup properly, please check formatdb.log for errors.\n"); }
510 if (pDataArray->trim) { out3.close(); }
515 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
518 pDataArray->m->appendFiles(thisoutputFileName, pDataArray->outputFName); pDataArray->m->mothurRemove(thisoutputFileName);
519 pDataArray->m->appendFiles(thisaccnosFileName, pDataArray->accnos); pDataArray->m->mothurRemove(thisaccnosFileName);
520 if (pDataArray->trim) { pDataArray->m->appendFiles(thistrimFastaFileName, pDataArray->fasta); pDataArray->m->mothurRemove(thistrimFastaFileName); }
521 pDataArray->m->mothurRemove(thisFastaName);
523 totalSeqs += numSeqs;
525 pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences from group " + pDataArray->fileGroup[thisFastaName] + "."); pDataArray->m->mothurOutEndLine();
528 pDataArray->count = totalSeqs;
533 catch(exception& e) {
534 pDataArray->m->errorOut(e, "ChimeraSlayerCommand", "MySlayerGroupThreadFunction");
541 /**************************************************************************************************/