2 * chimerapintailcommand.cpp
5 * Created by westcott on 4/1/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "chimerapintailcommand.h"
13 //***************************************************************************************************************
15 ChimeraPintailCommand::ChimeraPintailCommand(string option) {
19 //allow user to run help
20 if(option == "help") { help(); abort = true; }
23 //valid paramters for this command
24 string Array[] = {"fasta","filter","processors","window" ,"increment","template","conservation","quantile","mask","outputdir","inputdir"};
25 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
27 OptionParser parser(option);
28 map<string,string> parameters = parser.getParameters();
30 ValidParameters validParameter;
31 map<string,string>::iterator it;
33 //check to make sure all parameters are valid for command
34 for (it = parameters.begin(); it != parameters.end(); it++) {
35 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
38 //if the user changes the input directory command factory will send this info to us in the output parameter
39 string inputDir = validParameter.validFile(parameters, "inputdir", false);
40 if (inputDir == "not found"){ inputDir = ""; }
43 it = parameters.find("fasta");
44 //user has given a template file
45 if(it != parameters.end()){
46 path = hasPath(it->second);
47 //if the user has not given a path then, add inputdir. else leave path alone.
48 if (path == "") { parameters["fasta"] = inputDir + it->second; }
51 it = parameters.find("template");
52 //user has given a template file
53 if(it != parameters.end()){
54 path = hasPath(it->second);
55 //if the user has not given a path then, add inputdir. else leave path alone.
56 if (path == "") { parameters["template"] = inputDir + it->second; }
59 it = parameters.find("conservation");
60 //user has given a template file
61 if(it != parameters.end()){
62 path = hasPath(it->second);
63 //if the user has not given a path then, add inputdir. else leave path alone.
64 if (path == "") { parameters["conservation"] = inputDir + it->second; }
67 it = parameters.find("quantile");
68 //user has given a template file
69 if(it != parameters.end()){
70 path = hasPath(it->second);
71 //if the user has not given a path then, add inputdir. else leave path alone.
72 if (path == "") { parameters["quantile"] = inputDir + it->second; }
77 //check for required parameters
78 fastafile = validParameter.validFile(parameters, "fasta", true);
79 if (fastafile == "not open") { abort = true; }
80 else if (fastafile == "not found") { fastafile = ""; m->mothurOut("fasta is a required parameter for the chimera.pintail command."); m->mothurOutEndLine(); abort = true; }
82 //if the user changes the output directory command factory will send this info to us in the output parameter
83 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
85 outputDir += hasPath(fastafile); //if user entered a file with a path then preserve it
88 templatefile = validParameter.validFile(parameters, "template", true);
89 if (templatefile == "not open") { abort = true; }
90 else if (templatefile == "not found") { templatefile = ""; m->mothurOut("template is a required parameter for the chimera.pintail command."); m->mothurOutEndLine(); abort = true; }
92 consfile = validParameter.validFile(parameters, "conservation", true);
93 if (consfile == "not open") { abort = true; }
94 else if (consfile == "not found") { consfile = ""; }
96 quanfile = validParameter.validFile(parameters, "quantile", true);
97 if (quanfile == "not open") { abort = true; }
98 else if (quanfile == "not found") { quanfile = ""; }
100 maskfile = validParameter.validFile(parameters, "mask", false);
101 if (maskfile == "not found") { maskfile = ""; }
102 else if (maskfile != "default") {
103 if (inputDir != "") {
104 string path = hasPath(maskfile);
105 //if the user has not given a path then, add inputdir. else leave path alone.
106 if (path == "") { maskfile = inputDir + maskfile; }
110 int ableToOpen = openInputFile(maskfile, in);
111 if (ableToOpen == 1) { abort = true; }
116 temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "F"; }
117 filter = isTrue(temp);
119 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; }
120 convert(temp, processors);
122 temp = validParameter.validFile(parameters, "window", false); if (temp == "not found") { temp = "0"; }
123 convert(temp, window);
125 temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "25"; }
126 convert(temp, increment);
129 catch(exception& e) {
130 m->errorOut(e, "ChimeraPintailCommand", "ChimeraPintailCommand");
134 //**********************************************************************************************************************
136 void ChimeraPintailCommand::help(){
139 m->mothurOut("The chimera.pintail command reads a fastafile and templatefile and outputs potentially chimeric sequences.\n");
140 m->mothurOut("This command was created using the algorythms described in the 'At Least 1 in 20 16S rRNA Sequence Records Currently Held in the Public Repositories is Estimated To Contain Substantial Anomalies' paper by Kevin E. Ashelford 1, Nadia A. Chuzhanova 3, John C. Fry 1, Antonia J. Jones 2 and Andrew J. Weightman 1.\n");
141 m->mothurOut("The chimera.pintail command parameters are fasta, template, filter, mask, processors, window, increment, conservation and quantile.\n");
142 m->mothurOut("The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required. \n");
143 m->mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences, and is required. \n");
144 m->mothurOut("The filter parameter allows you to specify if you would like to apply a vertical and 50% soft filter. \n");
145 m->mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the your sequences, by default no mask is applied. You can apply an ecoli mask by typing, mask=default. \n");
146 m->mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n");
148 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
150 m->mothurOut("The window parameter allows you to specify the window size for searching for chimeras, default=1/4 sequence length. \n");
151 m->mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences, default=25.\n");
152 m->mothurOut("The conservation parameter allows you to enter a frequency file containing the highest bases frequency at each place in the alignment.\n");
153 m->mothurOut("The quantile parameter allows you to enter a file containing quantiles for a template files sequences, if you use the filter the quantile file generated becomes unique to the fasta file you used.\n");
154 m->mothurOut("The chimera.pintail command should be in the following format: \n");
155 m->mothurOut("chimera.seqs(fasta=yourFastaFile, filter=yourFilter, correction=yourCorrection, processors=yourProcessors, method=bellerophon) \n");
156 m->mothurOut("Example: chimera.seqs(fasta=AD.align, filter=True, correction=true, method=bellerophon, window=200) \n");
157 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
159 catch(exception& e) {
160 m->errorOut(e, "ChimeraPintailCommand", "help");
165 //***************************************************************************************************************
167 ChimeraPintailCommand::~ChimeraPintailCommand(){ /* do nothing */ }
169 //***************************************************************************************************************
171 int ChimeraPintailCommand::execute(){
174 if (abort == true) { return 0; }
176 int start = time(NULL);
178 chimera = new Pintail(fastafile, templatefile, filter, processors, maskfile, consfile, quanfile, window, increment, outputDir);
181 if (maskfile == "default") { m->mothurOut("I am using the default 236627 EU009184.1 Shigella dysenteriae str. FBD013."); m->mothurOutEndLine(); }
184 string outputFileName, accnosFileName;
185 if (maskfile != "") {
186 outputFileName = outputDir + getRootName(getSimpleName(fastafile)) + maskfile + ".pintail.chimeras";
187 accnosFileName = outputDir + getRootName(getSimpleName(fastafile)) + maskfile + ".pintail.accnos";
189 outputFileName = outputDir + getRootName(getSimpleName(fastafile)) + "pintail.chimeras";
190 accnosFileName = outputDir + getRootName(getSimpleName(fastafile)) + "pintail.accnos";
192 bool hasAccnos = true;
194 if (m->control_pressed) { delete chimera; return 0; }
196 if (chimera->getUnaligned()) {
197 m->mothurOut("Your template sequences are different lengths, please correct."); m->mothurOutEndLine();
201 templateSeqsLength = chimera->getLength();
204 int pid, end, numSeqsPerProcessor;
207 MPIWroteAccnos = false;
210 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
211 MPI_Comm_size(MPI_COMM_WORLD, &processors);
215 MPI_File outMPIAccnos;
217 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
218 int inMode=MPI_MODE_RDONLY;
220 //char* outFilename = new char[outputFileName.length()];
221 //memcpy(outFilename, outputFileName.c_str(), outputFileName.length());
223 char outFilename[1024];
224 strcpy(outFilename, outputFileName.c_str());
226 //char* outAccnosFilename = new char[accnosFileName.length()];
227 //memcpy(outAccnosFilename, accnosFileName.c_str(), accnosFileName.length());
229 char outAccnosFilename[1024];
230 strcpy(outAccnosFilename, accnosFileName.c_str());
232 //char* inFileName = new char[fastafile.length()];
233 //memcpy(inFileName, fastafile.c_str(), fastafile.length());
235 char inFileName[1024];
236 strcpy(inFileName, fastafile.c_str());
238 MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
239 MPI_File_open(MPI_COMM_WORLD, outFilename, outMode, MPI_INFO_NULL, &outMPI);
240 MPI_File_open(MPI_COMM_WORLD, outAccnosFilename, outMode, MPI_INFO_NULL, &outMPIAccnos);
243 //delete outFilename;
244 //delete outAccnosFilename;
246 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); delete chimera; return 0; }
248 if (pid == 0) { //you are the root process
250 MPIPos = setFilePosFasta(fastafile, numSeqs); //fills MPIPos, returns numSeqs
252 //send file positions to all processes
253 MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs
254 MPI_Bcast(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos
256 //figure out how many sequences you have to align
257 numSeqsPerProcessor = numSeqs / processors;
258 if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; }
259 int startIndex = pid * numSeqsPerProcessor;
262 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, MPIPos);
264 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); remove(outputFileName.c_str()); remove(accnosFileName.c_str()); delete chimera; return 0; }
266 for (int i = 1; i < processors; i++) {
268 MPI_Recv(&tempResult, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
269 if (tempResult != 0) { MPIWroteAccnos = true; }
271 }else{ //you are a child process
272 MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
273 MPIPos.resize(numSeqs+1);
274 MPI_Bcast(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
276 //figure out how many sequences you have to align
277 numSeqsPerProcessor = numSeqs / processors;
278 if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; }
279 int startIndex = pid * numSeqsPerProcessor;
282 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, MPIPos);
284 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); delete chimera; return 0; }
286 MPI_Send(&MPIWroteAccnos, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
290 MPI_File_close(&inMPI);
291 MPI_File_close(&outMPI);
292 MPI_File_close(&outMPIAccnos);
294 //delete accnos file if blank
296 if (!MPIWroteAccnos) {
298 //MPI_File_delete(outAccnosFilename, info);
300 remove(accnosFileName.c_str());
307 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
310 openInputFile(fastafile, inFASTA);
311 numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
314 lines.push_back(new linePair(0, numSeqs));
316 driver(lines[0], outputFileName, fastafile, accnosFileName);
318 if (m->control_pressed) {
319 remove(outputFileName.c_str());
320 remove(accnosFileName.c_str());
321 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
326 //delete accnos file if its blank
327 if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); hasAccnos = false; }
330 vector<int> positions;
331 processIDS.resize(0);
334 openInputFile(fastafile, inFASTA);
337 while(!inFASTA.eof()){
338 input = getline(inFASTA);
339 if (input.length() != 0) {
340 if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); }
345 numSeqs = positions.size();
347 int numSeqsPerProcessor = numSeqs / processors;
349 for (int i = 0; i < processors; i++) {
350 long int startPos = positions[ i * numSeqsPerProcessor ];
351 if(i == processors - 1){
352 numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor;
354 lines.push_back(new linePair(startPos, numSeqsPerProcessor));
358 createProcesses(outputFileName, fastafile, accnosFileName);
360 rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str());
362 //append output files
363 for(int i=1;i<processors;i++){
364 appendFiles((outputFileName + toString(processIDS[i]) + ".temp"), outputFileName);
365 remove((outputFileName + toString(processIDS[i]) + ".temp").c_str());
368 vector<string> nonBlankAccnosFiles;
369 //delete blank accnos files generated with multiple processes
370 for(int i=0;i<processors;i++){
371 if (!(isBlank(accnosFileName + toString(processIDS[i]) + ".temp"))) {
372 nonBlankAccnosFiles.push_back(accnosFileName + toString(processIDS[i]) + ".temp");
373 }else { remove((accnosFileName + toString(processIDS[i]) + ".temp").c_str()); }
376 //append accnos files
377 if (nonBlankAccnosFiles.size() != 0) {
378 rename(nonBlankAccnosFiles[0].c_str(), accnosFileName.c_str());
380 for (int h=1; h < nonBlankAccnosFiles.size(); h++) {
381 appendFiles(nonBlankAccnosFiles[h], accnosFileName);
382 remove(nonBlankAccnosFiles[h].c_str());
384 }else{ hasAccnos = false; }
386 if (m->control_pressed) {
387 remove(outputFileName.c_str());
388 remove(accnosFileName.c_str());
389 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
397 openInputFile(fastafile, inFASTA);
398 numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
400 lines.push_back(new linePair(0, numSeqs));
402 driver(lines[0], outputFileName, fastafile, accnosFileName);
404 if (m->control_pressed) {
405 remove(outputFileName.c_str());
406 remove(accnosFileName.c_str());
407 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
412 //delete accnos file if its blank
413 if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); hasAccnos = false; }
419 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
421 m->mothurOutEndLine();
422 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
423 m->mothurOut(outputFileName); m->mothurOutEndLine();
424 if (hasAccnos) { m->mothurOut(accnosFileName); m->mothurOutEndLine(); }
425 m->mothurOutEndLine();
426 m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); m->mothurOutEndLine();
431 catch(exception& e) {
432 m->errorOut(e, "ChimeraPintailCommand", "execute");
436 //**********************************************************************************************************************
438 int ChimeraPintailCommand::driver(linePair* line, string outputFName, string filename, string accnos){
441 openOutputFile(outputFName, out);
444 openOutputFile(accnos, out2);
447 openInputFile(filename, inFASTA);
449 inFASTA.seekg(line->start);
451 for(int i=0;i<line->numSeqs;i++){
453 if (m->control_pressed) { return 1; }
455 Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA);
457 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
459 if (candidateSeq->getAligned().length() != templateSeqsLength) { //chimeracheck does not require seqs to be aligned
460 m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); m->mothurOutEndLine();
463 chimera->getChimeras(candidateSeq);
465 if (m->control_pressed) { delete candidateSeq; return 1; }
468 chimera->print(out, out2);
474 if((i+1) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(i+1)); m->mothurOutEndLine(); }
477 if((line->numSeqs) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(line->numSeqs)); m->mothurOutEndLine(); }
485 catch(exception& e) {
486 m->errorOut(e, "ChimeraPintailCommand", "driver");
490 //**********************************************************************************************************************
492 int ChimeraPintailCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, vector<long>& MPIPos){
497 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
499 for(int i=0;i<num;i++){
501 if (m->control_pressed) { return 1; }
504 int length = MPIPos[start+i+1] - MPIPos[start+i];
506 char* buf4 = new char[length];
507 MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
509 string tempBuf = buf4;
510 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
511 istringstream iss (tempBuf,istringstream::in);
514 Sequence* candidateSeq = new Sequence(iss); gobble(iss);
516 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
518 if (candidateSeq->getAligned().length() != templateSeqsLength) { //chimeracheck does not require seqs to be aligned
519 m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); m->mothurOutEndLine();
522 chimera->getChimeras(candidateSeq);
524 if (m->control_pressed) { delete candidateSeq; return 1; }
527 bool isChimeric = chimera->print(outMPI, outAccMPI);
528 if (isChimeric) { MPIWroteAccnos = true; }
534 if((i+1) % 100 == 0){ cout << "Processing sequence: " << (i+1) << endl; m->mothurOutJustToLog("Processing sequence: " + toString(i+1) + "\n"); }
537 if(num % 100 != 0){ cout << "Processing sequence: " << num << endl; m->mothurOutJustToLog("Processing sequence: " + toString(num) + "\n"); }
542 catch(exception& e) {
543 m->errorOut(e, "ChimeraPintailCommand", "driverMPI");
549 /**************************************************************************************************/
551 int ChimeraPintailCommand::createProcesses(string outputFileName, string filename, string accnos) {
553 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
555 // processIDS.resize(0);
557 //loop through and create all the processes you want
558 while (process != processors) {
562 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
565 driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename, accnos + toString(getpid()) + ".temp");
567 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
570 //force parent to wait until all the processes are done
571 for (int i=0;i<processors;i++) {
572 int temp = processIDS[i];
579 catch(exception& e) {
580 m->errorOut(e, "ChimeraPintailCommand", "createProcesses");
585 /**************************************************************************************************/