2 * chimerapintailcommand.cpp
5 * Created by westcott on 4/1/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "chimerapintailcommand.h"
13 //***************************************************************************************************************
15 ChimeraPintailCommand::ChimeraPintailCommand(string option) {
19 //allow user to run help
20 if(option == "help") { help(); abort = true; }
23 //valid paramters for this command
24 string Array[] = {"fasta","filter","processors","window" ,"increment","template","conservation","quantile","mask","outputdir","inputdir"};
25 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
27 OptionParser parser(option);
28 map<string,string> parameters = parser.getParameters();
30 ValidParameters validParameter("chimera.pintail");
31 map<string,string>::iterator it;
33 //check to make sure all parameters are valid for command
34 for (it = parameters.begin(); it != parameters.end(); it++) {
35 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
38 //if the user changes the input directory command factory will send this info to us in the output parameter
39 inputDir = validParameter.validFile(parameters, "inputdir", false);
40 if (inputDir == "not found"){ inputDir = ""; }
43 it = parameters.find("template");
44 //user has given a template file
45 if(it != parameters.end()){
46 path = hasPath(it->second);
47 //if the user has not given a path then, add inputdir. else leave path alone.
48 if (path == "") { parameters["template"] = inputDir + it->second; }
51 it = parameters.find("conservation");
52 //user has given a template file
53 if(it != parameters.end()){
54 path = hasPath(it->second);
55 //if the user has not given a path then, add inputdir. else leave path alone.
56 if (path == "") { parameters["conservation"] = inputDir + it->second; }
59 it = parameters.find("quantile");
60 //user has given a template file
61 if(it != parameters.end()){
62 path = hasPath(it->second);
63 //if the user has not given a path then, add inputdir. else leave path alone.
64 if (path == "") { parameters["quantile"] = inputDir + it->second; }
69 //check for required parameters
70 fastafile = validParameter.validFile(parameters, "fasta", false);
71 if (fastafile == "not found") { fastafile = ""; m->mothurOut("fasta is a required parameter for the chimera.pintail command."); m->mothurOutEndLine(); abort = true; }
73 splitAtDash(fastafile, fastaFileNames);
75 //go through files and make sure they are good, if not, then disregard them
76 for (int i = 0; i < fastaFileNames.size(); i++) {
78 string path = hasPath(fastaFileNames[i]);
79 //if the user has not given a path then, add inputdir. else leave path alone.
80 if (path == "") { fastaFileNames[i] = inputDir + fastaFileNames[i]; }
88 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
89 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
94 ableToOpen = openInputFile(fastaFileNames[i], in, "noerror");
96 //if you can't open it, try default location
97 if (ableToOpen == 1) {
98 if (m->getDefaultPath() != "") { //default path is set
99 string tryPath = m->getDefaultPath() + getSimpleName(fastaFileNames[i]);
100 m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
101 ableToOpen = openInputFile(tryPath, in, "noerror");
102 fastaFileNames[i] = tryPath;
108 for (int j = 1; j < processors; j++) {
109 MPI_Send(&ableToOpen, 1, MPI_INT, j, 2001, MPI_COMM_WORLD);
113 MPI_Recv(&ableToOpen, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
118 if (ableToOpen == 1) {
119 m->mothurOut("Unable to open " + fastaFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine();
120 //erase from file list
121 fastaFileNames.erase(fastaFileNames.begin()+i);
126 //make sure there is at least one valid file left
127 if (fastaFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; }
131 temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "F"; }
132 filter = isTrue(temp);
134 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; }
135 convert(temp, processors);
137 temp = validParameter.validFile(parameters, "window", false); if (temp == "not found") { temp = "0"; }
138 convert(temp, window);
140 temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "25"; }
141 convert(temp, increment);
143 maskfile = validParameter.validFile(parameters, "mask", false);
144 if (maskfile == "not found") { maskfile = ""; }
145 else if (maskfile != "default") {
146 if (inputDir != "") {
147 string path = hasPath(maskfile);
148 //if the user has not given a path then, add inputdir. else leave path alone.
149 if (path == "") { maskfile = inputDir + maskfile; }
153 int ableToOpen = openInputFile(maskfile, in);
154 if (ableToOpen == 1) { abort = true; }
159 //if the user changes the output directory command factory will send this info to us in the output parameter
160 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
162 outputDir += hasPath(fastafile); //if user entered a file with a path then preserve it
165 templatefile = validParameter.validFile(parameters, "template", true);
166 if (templatefile == "not open") { abort = true; }
167 else if (templatefile == "not found") { templatefile = ""; m->mothurOut("template is a required parameter for the chimera.pintail command."); m->mothurOutEndLine(); abort = true; }
169 consfile = validParameter.validFile(parameters, "conservation", true);
170 if (consfile == "not open") { abort = true; }
171 else if (consfile == "not found") {
174 string tempConsFile = getRootName(inputDir + getSimpleName(templatefile)) + "freq";
175 ifstream FileTest(tempConsFile.c_str());
176 if(FileTest){ m->mothurOut("I found " + tempConsFile + " in your input file directory. I will use it to save time."); m->mothurOutEndLine(); consfile = tempConsFile; FileTest.close(); }
179 quanfile = validParameter.validFile(parameters, "quantile", true);
180 if (quanfile == "not open") { abort = true; }
181 else if (quanfile == "not found") { quanfile = ""; }
184 catch(exception& e) {
185 m->errorOut(e, "ChimeraPintailCommand", "ChimeraPintailCommand");
189 //**********************************************************************************************************************
191 void ChimeraPintailCommand::help(){
194 m->mothurOut("The chimera.pintail command reads a fastafile and templatefile and outputs potentially chimeric sequences.\n");
195 m->mothurOut("This command was created using the algorythms described in the 'At Least 1 in 20 16S rRNA Sequence Records Currently Held in the Public Repositories is Estimated To Contain Substantial Anomalies' paper by Kevin E. Ashelford 1, Nadia A. Chuzhanova 3, John C. Fry 1, Antonia J. Jones 2 and Andrew J. Weightman 1.\n");
196 m->mothurOut("The chimera.pintail command parameters are fasta, template, filter, mask, processors, window, increment, conservation and quantile.\n");
197 m->mothurOut("The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required. \n");
198 m->mothurOut("You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n");
199 m->mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences, and is required. \n");
200 m->mothurOut("The filter parameter allows you to specify if you would like to apply a vertical and 50% soft filter. \n");
201 m->mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the your sequences, by default no mask is applied. You can apply an ecoli mask by typing, mask=default. \n");
202 m->mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n");
204 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
206 m->mothurOut("The window parameter allows you to specify the window size for searching for chimeras, default=300. \n");
207 m->mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences, default=25.\n");
208 m->mothurOut("The conservation parameter allows you to enter a frequency file containing the highest bases frequency at each place in the alignment.\n");
209 m->mothurOut("The quantile parameter allows you to enter a file containing quantiles for a template files sequences, if you use the filter the quantile file generated becomes unique to the fasta file you used.\n");
210 m->mothurOut("The chimera.pintail command should be in the following format: \n");
211 m->mothurOut("chimera.pintail(fasta=yourFastaFile, template=yourTemplate) \n");
212 m->mothurOut("Example: chimera.pintail(fasta=AD.align, template=silva.bacteria.fasta) \n");
213 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
215 catch(exception& e) {
216 m->errorOut(e, "ChimeraPintailCommand", "help");
221 //***************************************************************************************************************
223 ChimeraPintailCommand::~ChimeraPintailCommand(){ /* do nothing */ }
225 //***************************************************************************************************************
227 int ChimeraPintailCommand::execute(){
230 if (abort == true) { return 0; }
232 for (int s = 0; s < fastaFileNames.size(); s++) {
234 m->mothurOut("Checking sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine();
236 int start = time(NULL);
239 if (maskfile == "default") { m->mothurOut("I am using the default 236627 EU009184.1 Shigella dysenteriae str. FBD013."); m->mothurOutEndLine(); }
241 //check for quantile to save the time
242 string tempQuan = "";
243 if ((!filter) && (maskfile == "")) {
244 tempQuan = inputDir + getRootName(getSimpleName(templatefile)) + "pintail.quan";
245 }else if ((!filter) && (maskfile != "")) {
246 tempQuan = inputDir + getRootName(getSimpleName(templatefile)) + "pintail.masked.quan";
247 }else if ((filter) && (maskfile != "")) {
248 tempQuan = inputDir + getRootName(getSimpleName(templatefile)) + "pintail.filtered." + getSimpleName(getRootName(fastaFileNames[s])) + "masked.quan";
249 }else if ((filter) && (maskfile == "")) {
250 tempQuan = inputDir + getRootName(getSimpleName(templatefile)) + "pintail.filtered." + getSimpleName(getRootName(fastaFileNames[s])) + "quan";
253 ifstream FileTest(tempQuan.c_str());
254 if(FileTest){ m->mothurOut("I found " + tempQuan + " in your input file directory. I will use it to save time."); m->mothurOutEndLine(); quanfile = tempQuan; FileTest.close(); }
256 chimera = new Pintail(fastaFileNames[s], templatefile, filter, processors, maskfile, consfile, quanfile, window, increment, outputDir);
258 string outputFileName, accnosFileName;
259 if (maskfile != "") {
260 outputFileName = outputDir + getRootName(getSimpleName(fastaFileNames[s])) + maskfile + ".pintail.chimeras";
261 accnosFileName = outputDir + getRootName(getSimpleName(fastaFileNames[s])) + maskfile + ".pintail.accnos";
263 outputFileName = outputDir + getRootName(getSimpleName(fastaFileNames[s])) + "pintail.chimeras";
264 accnosFileName = outputDir + getRootName(getSimpleName(fastaFileNames[s])) + "pintail.accnos";
267 if (m->control_pressed) { delete chimera; for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } return 0; }
269 if (chimera->getUnaligned()) {
270 m->mothurOut("Your template sequences are different lengths, please correct."); m->mothurOutEndLine();
274 templateSeqsLength = chimera->getLength();
277 int pid, end, numSeqsPerProcessor;
279 vector<unsigned long int> MPIPos;
282 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
283 MPI_Comm_size(MPI_COMM_WORLD, &processors);
287 MPI_File outMPIAccnos;
289 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
290 int inMode=MPI_MODE_RDONLY;
292 char outFilename[1024];
293 strcpy(outFilename, outputFileName.c_str());
295 char outAccnosFilename[1024];
296 strcpy(outAccnosFilename, accnosFileName.c_str());
298 char inFileName[1024];
299 strcpy(inFileName, fastaFileNames[s].c_str());
301 MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
302 MPI_File_open(MPI_COMM_WORLD, outFilename, outMode, MPI_INFO_NULL, &outMPI);
303 MPI_File_open(MPI_COMM_WORLD, outAccnosFilename, outMode, MPI_INFO_NULL, &outMPIAccnos);
305 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } delete chimera; return 0; }
307 if (pid == 0) { //you are the root process
309 MPIPos = setFilePosFasta(fastaFileNames[s], numSeqs); //fills MPIPos, returns numSeqs
311 //send file positions to all processes
312 for(int i = 1; i < processors; i++) {
313 MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
314 MPI_Send(&MPIPos[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
317 //figure out how many sequences you have to align
318 numSeqsPerProcessor = numSeqs / processors;
319 int startIndex = pid * numSeqsPerProcessor;
320 if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; }
323 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, MPIPos);
325 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); remove(outputFileName.c_str()); remove(accnosFileName.c_str()); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } delete chimera; return 0; }
327 }else{ //you are a child process
328 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
329 MPIPos.resize(numSeqs+1);
330 MPI_Recv(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
332 //figure out how many sequences you have to align
333 numSeqsPerProcessor = numSeqs / processors;
334 int startIndex = pid * numSeqsPerProcessor;
335 if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; }
338 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, MPIPos);
340 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } delete chimera; return 0; }
344 MPI_File_close(&inMPI);
345 MPI_File_close(&outMPI);
346 MPI_File_close(&outMPIAccnos);
347 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
351 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
354 openInputFile(fastaFileNames[s], inFASTA);
355 getNumSeqs(inFASTA, numSeqs);
358 lines.push_back(new linePair(0, numSeqs));
360 driver(lines[0], outputFileName, fastaFileNames[s], accnosFileName);
362 if (m->control_pressed) {
363 remove(outputFileName.c_str());
364 remove(accnosFileName.c_str());
365 for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); }
366 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
372 vector<unsigned long int> positions;
373 processIDS.resize(0);
376 openInputFile(fastaFileNames[s], inFASTA);
379 while(!inFASTA.eof()){
380 input = getline(inFASTA);
381 if (input.length() != 0) {
382 if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); }
387 numSeqs = positions.size();
389 int numSeqsPerProcessor = numSeqs / processors;
391 for (int i = 0; i < processors; i++) {
392 unsigned long int startPos = positions[ i * numSeqsPerProcessor ];
393 if(i == processors - 1){
394 numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor;
396 lines.push_back(new linePair(startPos, numSeqsPerProcessor));
399 createProcesses(outputFileName, fastaFileNames[s], accnosFileName);
401 rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str());
402 rename((accnosFileName + toString(processIDS[0]) + ".temp").c_str(), accnosFileName.c_str());
404 //append output files
405 for(int i=1;i<processors;i++){
406 appendFiles((outputFileName + toString(processIDS[i]) + ".temp"), outputFileName);
407 remove((outputFileName + toString(processIDS[i]) + ".temp").c_str());
410 //append output files
411 for(int i=1;i<processors;i++){
412 appendFiles((accnosFileName + toString(processIDS[i]) + ".temp"), accnosFileName);
413 remove((accnosFileName + toString(processIDS[i]) + ".temp").c_str());
416 if (m->control_pressed) {
417 remove(outputFileName.c_str());
418 remove(accnosFileName.c_str());
419 for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); }
420 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
428 openInputFile(fastaFileNames[s], inFASTA);
429 getNumSeqs(inFASTA, numSeqs);
431 lines.push_back(new linePair(0, numSeqs));
433 driver(lines[0], outputFileName, fastaFileNames[s], accnosFileName);
435 if (m->control_pressed) {
436 remove(outputFileName.c_str());
437 remove(accnosFileName.c_str());
438 for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); }
439 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
448 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
450 outputNames.push_back(outputFileName);
451 outputNames.push_back(accnosFileName);
453 m->mothurOutEndLine();
454 m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); m->mothurOutEndLine();
457 m->mothurOutEndLine();
458 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
459 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
460 m->mothurOutEndLine();
465 catch(exception& e) {
466 m->errorOut(e, "ChimeraPintailCommand", "execute");
470 //**********************************************************************************************************************
472 int ChimeraPintailCommand::driver(linePair* line, string outputFName, string filename, string accnos){
475 openOutputFile(outputFName, out);
478 openOutputFile(accnos, out2);
481 openInputFile(filename, inFASTA);
483 inFASTA.seekg(line->start);
485 for(int i=0;i<line->numSeqs;i++){
487 if (m->control_pressed) { return 1; }
489 Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA);
491 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
493 if (candidateSeq->getAligned().length() != templateSeqsLength) { //chimeracheck does not require seqs to be aligned
494 m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); m->mothurOutEndLine();
497 chimera->getChimeras(candidateSeq);
499 if (m->control_pressed) { delete candidateSeq; return 1; }
502 chimera->print(out, out2);
508 if((i+1) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(i+1)); m->mothurOutEndLine(); }
511 if((line->numSeqs) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(line->numSeqs)); m->mothurOutEndLine(); }
519 catch(exception& e) {
520 m->errorOut(e, "ChimeraPintailCommand", "driver");
524 //**********************************************************************************************************************
526 int ChimeraPintailCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, vector<unsigned long int>& MPIPos){
531 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
533 for(int i=0;i<num;i++){
535 if (m->control_pressed) { return 1; }
538 int length = MPIPos[start+i+1] - MPIPos[start+i];
540 char* buf4 = new char[length];
541 MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
543 string tempBuf = buf4;
544 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
545 istringstream iss (tempBuf,istringstream::in);
548 Sequence* candidateSeq = new Sequence(iss); gobble(iss);
550 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
552 if (candidateSeq->getAligned().length() != templateSeqsLength) { //chimeracheck does not require seqs to be aligned
553 m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); m->mothurOutEndLine();
556 chimera->getChimeras(candidateSeq);
558 if (m->control_pressed) { delete candidateSeq; return 1; }
561 bool isChimeric = chimera->print(outMPI, outAccMPI);
567 if((i+1) % 100 == 0){ cout << "Processing sequence: " << (i+1) << endl; m->mothurOutJustToLog("Processing sequence: " + toString(i+1) + "\n"); }
570 if(num % 100 != 0){ cout << "Processing sequence: " << num << endl; m->mothurOutJustToLog("Processing sequence: " + toString(num) + "\n"); }
575 catch(exception& e) {
576 m->errorOut(e, "ChimeraPintailCommand", "driverMPI");
582 /**************************************************************************************************/
584 int ChimeraPintailCommand::createProcesses(string outputFileName, string filename, string accnos) {
586 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
588 // processIDS.resize(0);
590 //loop through and create all the processes you want
591 while (process != processors) {
595 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
598 driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename, accnos + toString(getpid()) + ".temp");
600 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
603 //force parent to wait until all the processes are done
604 for (int i=0;i<processors;i++) {
605 int temp = processIDS[i];
612 catch(exception& e) {
613 m->errorOut(e, "ChimeraPintailCommand", "createProcesses");
618 /**************************************************************************************************/