2 * chimerapintailcommand.cpp
5 * Created by westcott on 4/1/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "chimerapintailcommand.h"
13 //***************************************************************************************************************
15 ChimeraPintailCommand::ChimeraPintailCommand(string option) {
19 //allow user to run help
20 if(option == "help") { help(); abort = true; }
23 //valid paramters for this command
24 string Array[] = {"fasta","filter","processors","window" ,"increment","template","conservation","quantile","mask","outputdir","inputdir"};
25 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
27 OptionParser parser(option);
28 map<string,string> parameters = parser.getParameters();
30 ValidParameters validParameter;
31 map<string,string>::iterator it;
33 //check to make sure all parameters are valid for command
34 for (it = parameters.begin(); it != parameters.end(); it++) {
35 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
38 //if the user changes the input directory command factory will send this info to us in the output parameter
39 string inputDir = validParameter.validFile(parameters, "inputdir", false);
40 if (inputDir == "not found"){ inputDir = ""; }
43 it = parameters.find("fasta");
44 //user has given a template file
45 if(it != parameters.end()){
46 path = hasPath(it->second);
47 //if the user has not given a path then, add inputdir. else leave path alone.
48 if (path == "") { parameters["fasta"] = inputDir + it->second; }
51 it = parameters.find("template");
52 //user has given a template file
53 if(it != parameters.end()){
54 path = hasPath(it->second);
55 //if the user has not given a path then, add inputdir. else leave path alone.
56 if (path == "") { parameters["template"] = inputDir + it->second; }
59 it = parameters.find("conservation");
60 //user has given a template file
61 if(it != parameters.end()){
62 path = hasPath(it->second);
63 //if the user has not given a path then, add inputdir. else leave path alone.
64 if (path == "") { parameters["conservation"] = inputDir + it->second; }
67 it = parameters.find("quantile");
68 //user has given a template file
69 if(it != parameters.end()){
70 path = hasPath(it->second);
71 //if the user has not given a path then, add inputdir. else leave path alone.
72 if (path == "") { parameters["quantile"] = inputDir + it->second; }
77 //check for required parameters
78 fastafile = validParameter.validFile(parameters, "fasta", true);
79 if (fastafile == "not open") { abort = true; }
80 else if (fastafile == "not found") { fastafile = ""; m->mothurOut("fasta is a required parameter for the chimera.pintail command."); m->mothurOutEndLine(); abort = true; }
82 //if the user changes the output directory command factory will send this info to us in the output parameter
83 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
85 outputDir += hasPath(fastafile); //if user entered a file with a path then preserve it
88 templatefile = validParameter.validFile(parameters, "template", true);
89 if (templatefile == "not open") { abort = true; }
90 else if (templatefile == "not found") { templatefile = ""; m->mothurOut("template is a required parameter for the chimera.pintail command."); m->mothurOutEndLine(); abort = true; }
92 consfile = validParameter.validFile(parameters, "conservation", true);
93 if (consfile == "not open") { abort = true; }
94 else if (consfile == "not found") { consfile = ""; }
96 quanfile = validParameter.validFile(parameters, "quantile", true);
97 if (quanfile == "not open") { abort = true; }
98 else if (quanfile == "not found") { quanfile = ""; }
100 maskfile = validParameter.validFile(parameters, "mask", false);
101 if (maskfile == "not found") { maskfile = ""; }
102 else if (maskfile != "default") {
103 if (inputDir != "") {
104 string path = hasPath(maskfile);
105 //if the user has not given a path then, add inputdir. else leave path alone.
106 if (path == "") { maskfile = inputDir + maskfile; }
110 int ableToOpen = openInputFile(maskfile, in);
111 if (ableToOpen == 1) { abort = true; }
116 temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "F"; }
117 filter = isTrue(temp);
119 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; }
120 convert(temp, processors);
122 temp = validParameter.validFile(parameters, "window", false); if (temp == "not found") { temp = "0"; }
123 convert(temp, window);
125 temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "25"; }
126 convert(temp, increment);
129 catch(exception& e) {
130 m->errorOut(e, "ChimeraPintailCommand", "ChimeraPintailCommand");
134 //**********************************************************************************************************************
136 void ChimeraPintailCommand::help(){
139 m->mothurOut("The chimera.pintail command reads a fastafile and templatefile and outputs potentially chimeric sequences.\n");
140 m->mothurOut("This command was created using the algorythms described in the 'At Least 1 in 20 16S rRNA Sequence Records Currently Held in the Public Repositories is Estimated To Contain Substantial Anomalies' paper by Kevin E. Ashelford 1, Nadia A. Chuzhanova 3, John C. Fry 1, Antonia J. Jones 2 and Andrew J. Weightman 1.\n");
141 m->mothurOut("The chimera.pintail command parameters are fasta, template, filter, mask, processors, window, increment, conservation and quantile.\n");
142 m->mothurOut("The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required. \n");
143 m->mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences, and is required. \n");
144 m->mothurOut("The filter parameter allows you to specify if you would like to apply a vertical and 50% soft filter. \n");
145 m->mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the your sequences, by default no mask is applied. You can apply an ecoli mask by typing, mask=default. \n");
146 m->mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n");
148 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
150 m->mothurOut("The window parameter allows you to specify the window size for searching for chimeras, default=1/4 sequence length. \n");
151 m->mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences, default=25.\n");
152 m->mothurOut("The conservation parameter allows you to enter a frequency file containing the highest bases frequency at each place in the alignment.\n");
153 m->mothurOut("The quantile parameter allows you to enter a file containing quantiles for a template files sequences, if you use the filter the quantile file generated becomes unique to the fasta file you used.\n");
154 m->mothurOut("The chimera.pintail command should be in the following format: \n");
155 m->mothurOut("chimera.seqs(fasta=yourFastaFile, filter=yourFilter, correction=yourCorrection, processors=yourProcessors, method=bellerophon) \n");
156 m->mothurOut("Example: chimera.seqs(fasta=AD.align, filter=True, correction=true, method=bellerophon, window=200) \n");
157 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
159 catch(exception& e) {
160 m->errorOut(e, "ChimeraPintailCommand", "help");
165 //***************************************************************************************************************
167 ChimeraPintailCommand::~ChimeraPintailCommand(){ /* do nothing */ }
169 //***************************************************************************************************************
171 int ChimeraPintailCommand::execute(){
174 if (abort == true) { return 0; }
176 int start = time(NULL);
178 chimera = new Pintail(fastafile, templatefile, filter, processors, maskfile, consfile, quanfile, window, increment, outputDir);
181 if (maskfile == "default") { m->mothurOut("I am using the default 236627 EU009184.1 Shigella dysenteriae str. FBD013."); m->mothurOutEndLine(); }
184 string outputFileName, accnosFileName;
185 if (maskfile != "") {
186 outputFileName = outputDir + getRootName(getSimpleName(fastafile)) + maskfile + ".pintail.chimeras";
187 accnosFileName = outputDir + getRootName(getSimpleName(fastafile)) + maskfile + ".pintail.accnos";
189 outputFileName = outputDir + getRootName(getSimpleName(fastafile)) + "pintail.chimeras";
190 accnosFileName = outputDir + getRootName(getSimpleName(fastafile)) + "pintail.accnos";
192 bool hasAccnos = true;
194 if (m->control_pressed) { delete chimera; return 0; }
196 if (chimera->getUnaligned()) {
197 m->mothurOut("Your template sequences are different lengths, please correct."); m->mothurOutEndLine();
201 templateSeqsLength = chimera->getLength();
204 int pid, end, numSeqsPerProcessor;
207 MPIWroteAccnos = false;
210 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
211 MPI_Comm_size(MPI_COMM_WORLD, &processors);
215 MPI_File outMPIAccnos;
217 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
218 int inMode=MPI_MODE_RDONLY;
220 char* outFilename = new char[outputFileName.length()];
\r
221 memcpy(outFilename, outputFileName.c_str(), outputFileName.length());
223 char* outAccnosFilename = new char[accnosFileName.length()];
\r
224 memcpy(outAccnosFilename, accnosFileName.c_str(), accnosFileName.length());
226 char* inFileName = new char[fastafile.length()];
\r
227 memcpy(inFileName, fastafile.c_str(), fastafile.length());
229 MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
230 MPI_File_open(MPI_COMM_WORLD, outFilename, outMode, MPI_INFO_NULL, &outMPI);
231 MPI_File_open(MPI_COMM_WORLD, outAccnosFilename, outMode, MPI_INFO_NULL, &outMPIAccnos);
235 delete outAccnosFilename;
237 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); delete chimera; return 0; }
239 if (pid == 0) { //you are the root process
241 MPIPos = setFilePosFasta(fastafile, numSeqs); //fills MPIPos, returns numSeqs
243 //send file positions to all processes
244 MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs
245 MPI_Bcast(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos
247 //figure out how many sequences you have to align
248 numSeqsPerProcessor = numSeqs / processors;
249 if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; }
250 int startIndex = pid * numSeqsPerProcessor;
253 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, MPIPos);
255 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); remove(outputFileName.c_str()); remove(accnosFileName.c_str()); delete chimera; return 0; }
257 for (int i = 1; i < processors; i++) {
259 MPI_Recv(&tempResult, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
260 if (tempResult != 0) { MPIWroteAccnos = true; }
262 }else{ //you are a child process
263 MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
264 MPIPos.resize(numSeqs+1);
265 MPI_Bcast(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
267 //figure out how many sequences you have to align
268 numSeqsPerProcessor = numSeqs / processors;
269 if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; }
270 int startIndex = pid * numSeqsPerProcessor;
273 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, MPIPos);
275 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); delete chimera; return 0; }
277 MPI_Send(&MPIWroteAccnos, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
281 MPI_File_close(&inMPI);
282 MPI_File_close(&outMPI);
283 MPI_File_close(&outMPIAccnos);
285 //delete accnos file if blank
287 if (!MPIWroteAccnos) {
289 //MPI_File_delete(outAccnosFilename, info);
291 remove(accnosFileName.c_str());
298 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
301 openInputFile(fastafile, inFASTA);
302 numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
305 lines.push_back(new linePair(0, numSeqs));
307 driver(lines[0], outputFileName, fastafile, accnosFileName);
309 if (m->control_pressed) {
310 remove(outputFileName.c_str());
311 remove(accnosFileName.c_str());
312 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
317 //delete accnos file if its blank
318 if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); hasAccnos = false; }
321 vector<int> positions;
322 processIDS.resize(0);
325 openInputFile(fastafile, inFASTA);
328 while(!inFASTA.eof()){
329 input = getline(inFASTA);
330 if (input.length() != 0) {
331 if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); }
336 numSeqs = positions.size();
338 int numSeqsPerProcessor = numSeqs / processors;
340 for (int i = 0; i < processors; i++) {
341 long int startPos = positions[ i * numSeqsPerProcessor ];
342 if(i == processors - 1){
343 numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor;
345 lines.push_back(new linePair(startPos, numSeqsPerProcessor));
349 createProcesses(outputFileName, fastafile, accnosFileName);
351 rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str());
353 //append output files
354 for(int i=1;i<processors;i++){
355 appendFiles((outputFileName + toString(processIDS[i]) + ".temp"), outputFileName);
356 remove((outputFileName + toString(processIDS[i]) + ".temp").c_str());
359 vector<string> nonBlankAccnosFiles;
360 //delete blank accnos files generated with multiple processes
361 for(int i=0;i<processors;i++){
362 if (!(isBlank(accnosFileName + toString(processIDS[i]) + ".temp"))) {
363 nonBlankAccnosFiles.push_back(accnosFileName + toString(processIDS[i]) + ".temp");
364 }else { remove((accnosFileName + toString(processIDS[i]) + ".temp").c_str()); }
367 //append accnos files
368 if (nonBlankAccnosFiles.size() != 0) {
369 rename(nonBlankAccnosFiles[0].c_str(), accnosFileName.c_str());
371 for (int h=1; h < nonBlankAccnosFiles.size(); h++) {
372 appendFiles(nonBlankAccnosFiles[h], accnosFileName);
373 remove(nonBlankAccnosFiles[h].c_str());
375 }else{ hasAccnos = false; }
377 if (m->control_pressed) {
378 remove(outputFileName.c_str());
379 remove(accnosFileName.c_str());
380 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
388 openInputFile(fastafile, inFASTA);
389 numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
391 lines.push_back(new linePair(0, numSeqs));
393 driver(lines[0], outputFileName, fastafile, accnosFileName);
395 if (m->control_pressed) {
396 remove(outputFileName.c_str());
397 remove(accnosFileName.c_str());
398 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
403 //delete accnos file if its blank
404 if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); hasAccnos = false; }
410 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
412 m->mothurOutEndLine();
413 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
414 m->mothurOut(outputFileName); m->mothurOutEndLine();
415 if (hasAccnos) { m->mothurOut(accnosFileName); m->mothurOutEndLine(); }
416 m->mothurOutEndLine();
417 m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); m->mothurOutEndLine();
422 catch(exception& e) {
423 m->errorOut(e, "ChimeraPintailCommand", "execute");
427 //**********************************************************************************************************************
429 int ChimeraPintailCommand::driver(linePair* line, string outputFName, string filename, string accnos){
432 openOutputFile(outputFName, out);
435 openOutputFile(accnos, out2);
438 openInputFile(filename, inFASTA);
440 inFASTA.seekg(line->start);
442 for(int i=0;i<line->numSeqs;i++){
444 if (m->control_pressed) { return 1; }
446 Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA);
448 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
450 if (candidateSeq->getAligned().length() != templateSeqsLength) { //chimeracheck does not require seqs to be aligned
451 m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); m->mothurOutEndLine();
454 chimera->getChimeras(candidateSeq);
456 if (m->control_pressed) { delete candidateSeq; return 1; }
459 chimera->print(out, out2);
465 if((i+1) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(i+1)); m->mothurOutEndLine(); }
468 if((line->numSeqs) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(line->numSeqs)); m->mothurOutEndLine(); }
476 catch(exception& e) {
477 m->errorOut(e, "ChimeraPintailCommand", "driver");
481 //**********************************************************************************************************************
483 int ChimeraPintailCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, vector<long>& MPIPos){
488 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
490 for(int i=0;i<num;i++){
492 if (m->control_pressed) { return 1; }
495 int length = MPIPos[start+i+1] - MPIPos[start+i];
497 char* buf4 = new char[length];
498 MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
500 string tempBuf = buf4;
501 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
502 istringstream iss (tempBuf,istringstream::in);
505 Sequence* candidateSeq = new Sequence(iss); gobble(iss);
507 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
509 if (candidateSeq->getAligned().length() != templateSeqsLength) { //chimeracheck does not require seqs to be aligned
510 m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); m->mothurOutEndLine();
513 chimera->getChimeras(candidateSeq);
515 if (m->control_pressed) { delete candidateSeq; return 1; }
518 bool isChimeric = chimera->print(outMPI, outAccMPI);
519 if (isChimeric) { MPIWroteAccnos = true; }
525 if((i+1) % 100 == 0){ cout << "Processing sequence: " << (i+1) << endl; m->mothurOutJustToLog("Processing sequence: " + toString(i+1) + "\n"); }
528 if(num % 100 != 0){ cout << "Processing sequence: " << num << endl; m->mothurOutJustToLog("Processing sequence: " + toString(num) + "\n"); }
533 catch(exception& e) {
534 m->errorOut(e, "ChimeraPintailCommand", "driverMPI");
540 /**************************************************************************************************/
542 int ChimeraPintailCommand::createProcesses(string outputFileName, string filename, string accnos) {
544 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
546 // processIDS.resize(0);
548 //loop through and create all the processes you want
549 while (process != processors) {
553 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
556 driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename, accnos + toString(getpid()) + ".temp");
558 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
561 //force parent to wait until all the processes are done
562 for (int i=0;i<processors;i++) {
563 int temp = processIDS[i];
570 catch(exception& e) {
571 m->errorOut(e, "ChimeraPintailCommand", "createProcesses");
576 /**************************************************************************************************/