2 * chimerapintailcommand.cpp
5 * Created by westcott on 4/1/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "chimerapintailcommand.h"
13 //***************************************************************************************************************
15 ChimeraPintailCommand::ChimeraPintailCommand(string option) {
19 //allow user to run help
20 if(option == "help") { help(); abort = true; }
23 //valid paramters for this command
24 string Array[] = {"fasta","filter","processors","window" ,"increment","template","conservation","quantile","mask","outputdir","inputdir"};
25 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
27 OptionParser parser(option);
28 map<string,string> parameters = parser.getParameters();
30 ValidParameters validParameter;
31 map<string,string>::iterator it;
33 //check to make sure all parameters are valid for command
34 for (it = parameters.begin(); it != parameters.end(); it++) {
35 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
38 //if the user changes the input directory command factory will send this info to us in the output parameter
39 string inputDir = validParameter.validFile(parameters, "inputdir", false);
40 if (inputDir == "not found"){ inputDir = ""; }
43 it = parameters.find("fasta");
44 //user has given a template file
45 if(it != parameters.end()){
46 path = hasPath(it->second);
47 //if the user has not given a path then, add inputdir. else leave path alone.
48 if (path == "") { parameters["fasta"] = inputDir + it->second; }
51 it = parameters.find("template");
52 //user has given a template file
53 if(it != parameters.end()){
54 path = hasPath(it->second);
55 //if the user has not given a path then, add inputdir. else leave path alone.
56 if (path == "") { parameters["template"] = inputDir + it->second; }
59 it = parameters.find("conservation");
60 //user has given a template file
61 if(it != parameters.end()){
62 path = hasPath(it->second);
63 //if the user has not given a path then, add inputdir. else leave path alone.
64 if (path == "") { parameters["conservation"] = inputDir + it->second; }
67 it = parameters.find("quantile");
68 //user has given a template file
69 if(it != parameters.end()){
70 path = hasPath(it->second);
71 //if the user has not given a path then, add inputdir. else leave path alone.
72 if (path == "") { parameters["quantile"] = inputDir + it->second; }
77 //check for required parameters
78 fastafile = validParameter.validFile(parameters, "fasta", true);
79 if (fastafile == "not open") { abort = true; }
80 else if (fastafile == "not found") { fastafile = ""; m->mothurOut("fasta is a required parameter for the chimera.pintail command."); m->mothurOutEndLine(); abort = true; }
82 //if the user changes the output directory command factory will send this info to us in the output parameter
83 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
85 outputDir += hasPath(fastafile); //if user entered a file with a path then preserve it
88 templatefile = validParameter.validFile(parameters, "template", true);
89 if (templatefile == "not open") { abort = true; }
90 else if (templatefile == "not found") { templatefile = ""; m->mothurOut("template is a required parameter for the chimera.pintail command."); m->mothurOutEndLine(); abort = true; }
92 consfile = validParameter.validFile(parameters, "conservation", true);
93 if (consfile == "not open") { abort = true; }
94 else if (consfile == "not found") { consfile = ""; }
96 quanfile = validParameter.validFile(parameters, "quantile", true);
97 if (quanfile == "not open") { abort = true; }
98 else if (quanfile == "not found") { quanfile = ""; }
100 maskfile = validParameter.validFile(parameters, "mask", false);
101 if (maskfile == "not found") { maskfile = ""; }
102 else if (maskfile != "default") {
103 if (inputDir != "") {
104 string path = hasPath(maskfile);
105 //if the user has not given a path then, add inputdir. else leave path alone.
106 if (path == "") { maskfile = inputDir + maskfile; }
110 int ableToOpen = openInputFile(maskfile, in);
111 if (ableToOpen == 1) { abort = true; }
116 temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "F"; }
117 filter = isTrue(temp);
119 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; }
120 convert(temp, processors);
122 temp = validParameter.validFile(parameters, "window", false); if (temp == "not found") { temp = "0"; }
123 convert(temp, window);
125 temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "25"; }
126 convert(temp, increment);
129 catch(exception& e) {
130 m->errorOut(e, "ChimeraPintailCommand", "ChimeraPintailCommand");
134 //**********************************************************************************************************************
136 void ChimeraPintailCommand::help(){
139 m->mothurOut("The chimera.pintail command reads a fastafile and templatefile and outputs potentially chimeric sequences.\n");
140 m->mothurOut("This command was created using the algorythms described in the 'At Least 1 in 20 16S rRNA Sequence Records Currently Held in the Public Repositories is Estimated To Contain Substantial Anomalies' paper by Kevin E. Ashelford 1, Nadia A. Chuzhanova 3, John C. Fry 1, Antonia J. Jones 2 and Andrew J. Weightman 1.\n");
141 m->mothurOut("The chimera.pintail command parameters are fasta, template, filter, mask, processors, window, increment, conservation and quantile.\n");
142 m->mothurOut("The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required. \n");
143 m->mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences, and is required. \n");
144 m->mothurOut("The filter parameter allows you to specify if you would like to apply a vertical and 50% soft filter. \n");
145 m->mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the your sequences, by default no mask is applied. You can apply an ecoli mask by typing, mask=default. \n");
146 m->mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n");
148 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
150 m->mothurOut("The window parameter allows you to specify the window size for searching for chimeras, default=300. \n");
151 m->mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences, default=25.\n");
152 m->mothurOut("The conservation parameter allows you to enter a frequency file containing the highest bases frequency at each place in the alignment.\n");
153 m->mothurOut("The quantile parameter allows you to enter a file containing quantiles for a template files sequences, if you use the filter the quantile file generated becomes unique to the fasta file you used.\n");
154 m->mothurOut("The chimera.pintail command should be in the following format: \n");
155 m->mothurOut("chimera.seqs(fasta=yourFastaFile, filter=yourFilter, correction=yourCorrection, processors=yourProcessors, method=bellerophon) \n");
156 m->mothurOut("Example: chimera.seqs(fasta=AD.align, filter=True, correction=true, method=bellerophon, window=200) \n");
157 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
159 catch(exception& e) {
160 m->errorOut(e, "ChimeraPintailCommand", "help");
165 //***************************************************************************************************************
167 ChimeraPintailCommand::~ChimeraPintailCommand(){ /* do nothing */ }
169 //***************************************************************************************************************
171 int ChimeraPintailCommand::execute(){
174 if (abort == true) { return 0; }
176 int start = time(NULL);
179 if (maskfile == "default") { m->mothurOut("I am using the default 236627 EU009184.1 Shigella dysenteriae str. FBD013."); m->mothurOutEndLine(); }
181 chimera = new Pintail(fastafile, templatefile, filter, processors, maskfile, consfile, quanfile, window, increment, outputDir);
183 string outputFileName, accnosFileName;
184 if (maskfile != "") {
185 outputFileName = outputDir + getRootName(getSimpleName(fastafile)) + maskfile + ".pintail.chimeras";
186 accnosFileName = outputDir + getRootName(getSimpleName(fastafile)) + maskfile + ".pintail.accnos";
188 outputFileName = outputDir + getRootName(getSimpleName(fastafile)) + "pintail.chimeras";
189 accnosFileName = outputDir + getRootName(getSimpleName(fastafile)) + "pintail.accnos";
191 bool hasAccnos = true;
193 if (m->control_pressed) { delete chimera; return 0; }
195 if (chimera->getUnaligned()) {
196 m->mothurOut("Your template sequences are different lengths, please correct."); m->mothurOutEndLine();
200 templateSeqsLength = chimera->getLength();
203 int pid, end, numSeqsPerProcessor;
206 MPIWroteAccnos = false;
209 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
210 MPI_Comm_size(MPI_COMM_WORLD, &processors);
214 MPI_File outMPIAccnos;
216 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
217 int inMode=MPI_MODE_RDONLY;
219 //char* outFilename = new char[outputFileName.length()];
220 //memcpy(outFilename, outputFileName.c_str(), outputFileName.length());
222 char outFilename[1024];
223 strcpy(outFilename, outputFileName.c_str());
225 //char* outAccnosFilename = new char[accnosFileName.length()];
226 //memcpy(outAccnosFilename, accnosFileName.c_str(), accnosFileName.length());
228 char outAccnosFilename[1024];
229 strcpy(outAccnosFilename, accnosFileName.c_str());
231 //char* inFileName = new char[fastafile.length()];
232 //memcpy(inFileName, fastafile.c_str(), fastafile.length());
234 char inFileName[1024];
235 strcpy(inFileName, fastafile.c_str());
237 MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
238 MPI_File_open(MPI_COMM_WORLD, outFilename, outMode, MPI_INFO_NULL, &outMPI);
239 MPI_File_open(MPI_COMM_WORLD, outAccnosFilename, outMode, MPI_INFO_NULL, &outMPIAccnos);
242 //delete outFilename;
243 //delete outAccnosFilename;
245 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); delete chimera; return 0; }
247 if (pid == 0) { //you are the root process
249 MPIPos = setFilePosFasta(fastafile, numSeqs); //fills MPIPos, returns numSeqs
251 //send file positions to all processes
252 for(int i = 1; i < processors; i++) {
253 MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
254 MPI_Send(&MPIPos[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
257 //figure out how many sequences you have to align
258 numSeqsPerProcessor = numSeqs / processors;
259 int startIndex = pid * numSeqsPerProcessor;
260 if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; }
264 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, MPIPos);
266 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); remove(outputFileName.c_str()); remove(accnosFileName.c_str()); delete chimera; return 0; }
268 for (int i = 1; i < processors; i++) {
270 MPI_Recv(&tempResult, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
271 if (tempResult != 0) { MPIWroteAccnos = true; }
273 }else{ //you are a child process
274 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
275 MPIPos.resize(numSeqs+1);
276 MPI_Recv(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
278 //figure out how many sequences you have to align
279 numSeqsPerProcessor = numSeqs / processors;
280 int startIndex = pid * numSeqsPerProcessor;
281 if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; }
285 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, MPIPos);
287 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); delete chimera; return 0; }
289 MPI_Send(&MPIWroteAccnos, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
293 MPI_File_close(&inMPI);
294 MPI_File_close(&outMPI);
295 MPI_File_close(&outMPIAccnos);
296 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
298 //delete accnos file if blank
300 if (!MPIWroteAccnos) {
302 //MPI_File_delete(outAccnosFilename, info);
304 remove(accnosFileName.c_str());
311 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
314 openInputFile(fastafile, inFASTA);
315 numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
318 lines.push_back(new linePair(0, numSeqs));
320 driver(lines[0], outputFileName, fastafile, accnosFileName);
322 if (m->control_pressed) {
323 remove(outputFileName.c_str());
324 remove(accnosFileName.c_str());
325 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
330 //delete accnos file if its blank
331 if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); hasAccnos = false; }
334 vector<int> positions;
335 processIDS.resize(0);
338 openInputFile(fastafile, inFASTA);
341 while(!inFASTA.eof()){
342 input = getline(inFASTA);
343 if (input.length() != 0) {
344 if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); }
349 numSeqs = positions.size();
351 int numSeqsPerProcessor = numSeqs / processors;
353 for (int i = 0; i < processors; i++) {
354 long int startPos = positions[ i * numSeqsPerProcessor ];
355 if(i == processors - 1){
356 numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor;
358 lines.push_back(new linePair(startPos, numSeqsPerProcessor));
362 createProcesses(outputFileName, fastafile, accnosFileName);
364 rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str());
366 //append output files
367 for(int i=1;i<processors;i++){
368 appendFiles((outputFileName + toString(processIDS[i]) + ".temp"), outputFileName);
369 remove((outputFileName + toString(processIDS[i]) + ".temp").c_str());
372 vector<string> nonBlankAccnosFiles;
373 //delete blank accnos files generated with multiple processes
374 for(int i=0;i<processors;i++){
375 if (!(isBlank(accnosFileName + toString(processIDS[i]) + ".temp"))) {
376 nonBlankAccnosFiles.push_back(accnosFileName + toString(processIDS[i]) + ".temp");
377 }else { remove((accnosFileName + toString(processIDS[i]) + ".temp").c_str()); }
380 //append accnos files
381 if (nonBlankAccnosFiles.size() != 0) {
382 rename(nonBlankAccnosFiles[0].c_str(), accnosFileName.c_str());
384 for (int h=1; h < nonBlankAccnosFiles.size(); h++) {
385 appendFiles(nonBlankAccnosFiles[h], accnosFileName);
386 remove(nonBlankAccnosFiles[h].c_str());
388 }else{ hasAccnos = false; }
390 if (m->control_pressed) {
391 remove(outputFileName.c_str());
392 remove(accnosFileName.c_str());
393 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
401 openInputFile(fastafile, inFASTA);
402 numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
404 lines.push_back(new linePair(0, numSeqs));
406 driver(lines[0], outputFileName, fastafile, accnosFileName);
408 if (m->control_pressed) {
409 remove(outputFileName.c_str());
410 remove(accnosFileName.c_str());
411 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
416 //delete accnos file if its blank
417 if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); hasAccnos = false; }
423 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
425 m->mothurOutEndLine();
426 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
427 m->mothurOut(outputFileName); m->mothurOutEndLine();
428 if (hasAccnos) { m->mothurOut(accnosFileName); m->mothurOutEndLine(); }
429 m->mothurOutEndLine();
430 m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); m->mothurOutEndLine();
435 catch(exception& e) {
436 m->errorOut(e, "ChimeraPintailCommand", "execute");
440 //**********************************************************************************************************************
442 int ChimeraPintailCommand::driver(linePair* line, string outputFName, string filename, string accnos){
445 openOutputFile(outputFName, out);
448 openOutputFile(accnos, out2);
451 openInputFile(filename, inFASTA);
453 inFASTA.seekg(line->start);
455 for(int i=0;i<line->numSeqs;i++){
457 if (m->control_pressed) { return 1; }
459 Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA);
461 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
463 if (candidateSeq->getAligned().length() != templateSeqsLength) { //chimeracheck does not require seqs to be aligned
464 m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); m->mothurOutEndLine();
467 chimera->getChimeras(candidateSeq);
469 if (m->control_pressed) { delete candidateSeq; return 1; }
472 chimera->print(out, out2);
478 if((i+1) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(i+1)); m->mothurOutEndLine(); }
481 if((line->numSeqs) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(line->numSeqs)); m->mothurOutEndLine(); }
489 catch(exception& e) {
490 m->errorOut(e, "ChimeraPintailCommand", "driver");
494 //**********************************************************************************************************************
496 int ChimeraPintailCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, vector<long>& MPIPos){
501 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
503 for(int i=0;i<num;i++){
505 if (m->control_pressed) { return 1; }
508 int length = MPIPos[start+i+1] - MPIPos[start+i];
510 char* buf4 = new char[length];
511 MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
513 string tempBuf = buf4;
514 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
515 istringstream iss (tempBuf,istringstream::in);
518 Sequence* candidateSeq = new Sequence(iss); gobble(iss);
520 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
522 if (candidateSeq->getAligned().length() != templateSeqsLength) { //chimeracheck does not require seqs to be aligned
523 m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); m->mothurOutEndLine();
526 chimera->getChimeras(candidateSeq);
528 if (m->control_pressed) { delete candidateSeq; return 1; }
531 bool isChimeric = chimera->print(outMPI, outAccMPI);
532 if (isChimeric) { MPIWroteAccnos = true; }
538 if((i+1) % 100 == 0){ cout << "Processing sequence: " << (i+1) << endl; m->mothurOutJustToLog("Processing sequence: " + toString(i+1) + "\n"); }
541 if(num % 100 != 0){ cout << "Processing sequence: " << num << endl; m->mothurOutJustToLog("Processing sequence: " + toString(num) + "\n"); }
546 catch(exception& e) {
547 m->errorOut(e, "ChimeraPintailCommand", "driverMPI");
553 /**************************************************************************************************/
555 int ChimeraPintailCommand::createProcesses(string outputFileName, string filename, string accnos) {
557 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
559 // processIDS.resize(0);
561 //loop through and create all the processes you want
562 while (process != processors) {
566 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
569 driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename, accnos + toString(getpid()) + ".temp");
571 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
574 //force parent to wait until all the processes are done
575 for (int i=0;i<processors;i++) {
576 int temp = processIDS[i];
583 catch(exception& e) {
584 m->errorOut(e, "ChimeraPintailCommand", "createProcesses");
589 /**************************************************************************************************/