2 * chimerapintailcommand.cpp
5 * Created by westcott on 4/1/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "chimerapintailcommand.h"
13 //***************************************************************************************************************
15 ChimeraPintailCommand::ChimeraPintailCommand(string option) {
19 //allow user to run help
20 if(option == "help") { help(); abort = true; }
23 //valid paramters for this command
24 string Array[] = {"fasta","filter","processors","window" ,"increment","template","conservation","quantile","mask","outputdir","inputdir"};
25 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
27 OptionParser parser(option);
28 map<string,string> parameters = parser.getParameters();
30 ValidParameters validParameter("chimera.pintail");
31 map<string,string>::iterator it;
33 //check to make sure all parameters are valid for command
34 for (it = parameters.begin(); it != parameters.end(); it++) {
35 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
38 //if the user changes the input directory command factory will send this info to us in the output parameter
39 inputDir = validParameter.validFile(parameters, "inputdir", false);
40 if (inputDir == "not found"){ inputDir = ""; }
43 it = parameters.find("template");
44 //user has given a template file
45 if(it != parameters.end()){
46 path = hasPath(it->second);
47 //if the user has not given a path then, add inputdir. else leave path alone.
48 if (path == "") { parameters["template"] = inputDir + it->second; }
51 it = parameters.find("conservation");
52 //user has given a template file
53 if(it != parameters.end()){
54 path = hasPath(it->second);
55 //if the user has not given a path then, add inputdir. else leave path alone.
56 if (path == "") { parameters["conservation"] = inputDir + it->second; }
59 it = parameters.find("quantile");
60 //user has given a template file
61 if(it != parameters.end()){
62 path = hasPath(it->second);
63 //if the user has not given a path then, add inputdir. else leave path alone.
64 if (path == "") { parameters["quantile"] = inputDir + it->second; }
69 //check for required parameters
70 fastafile = validParameter.validFile(parameters, "fasta", false);
71 if (fastafile == "not found") { fastafile = ""; m->mothurOut("fasta is a required parameter for the chimera.pintail command."); m->mothurOutEndLine(); abort = true; }
73 splitAtDash(fastafile, fastaFileNames);
75 //go through files and make sure they are good, if not, then disregard them
76 for (int i = 0; i < fastaFileNames.size(); i++) {
78 string path = hasPath(fastaFileNames[i]);
79 //if the user has not given a path then, add inputdir. else leave path alone.
80 if (path == "") { fastaFileNames[i] = inputDir + fastaFileNames[i]; }
86 ableToOpen = openInputFile(fastaFileNames[i], in, "noerror");
88 //if you can't open it, try default location
89 if (ableToOpen == 1) {
90 if (m->getDefaultPath() != "") { //default path is set
91 string tryPath = m->getDefaultPath() + getSimpleName(fastaFileNames[i]);
92 m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
93 ableToOpen = openInputFile(tryPath, in, "noerror");
94 fastaFileNames[i] = tryPath;
99 if (ableToOpen == 1) {
100 m->mothurOut("Unable to open " + fastaFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine();
101 //erase from file list
102 fastaFileNames.erase(fastaFileNames.begin()+i);
107 //make sure there is at least one valid file left
108 if (fastaFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; }
112 temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "F"; }
113 filter = isTrue(temp);
115 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; }
116 convert(temp, processors);
118 temp = validParameter.validFile(parameters, "window", false); if (temp == "not found") { temp = "0"; }
119 convert(temp, window);
121 temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "25"; }
122 convert(temp, increment);
124 maskfile = validParameter.validFile(parameters, "mask", false);
125 if (maskfile == "not found") { maskfile = ""; }
126 else if (maskfile != "default") {
127 if (inputDir != "") {
128 string path = hasPath(maskfile);
129 //if the user has not given a path then, add inputdir. else leave path alone.
130 if (path == "") { maskfile = inputDir + maskfile; }
134 int ableToOpen = openInputFile(maskfile, in);
135 if (ableToOpen == 1) { abort = true; }
140 //if the user changes the output directory command factory will send this info to us in the output parameter
141 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
143 outputDir += hasPath(fastafile); //if user entered a file with a path then preserve it
146 templatefile = validParameter.validFile(parameters, "template", true);
147 if (templatefile == "not open") { abort = true; }
148 else if (templatefile == "not found") { templatefile = ""; m->mothurOut("template is a required parameter for the chimera.pintail command."); m->mothurOutEndLine(); abort = true; }
150 consfile = validParameter.validFile(parameters, "conservation", true);
151 if (consfile == "not open") { abort = true; }
152 else if (consfile == "not found") {
155 string tempConsFile = getRootName(inputDir + getSimpleName(templatefile)) + "freq";
156 ifstream FileTest(tempConsFile.c_str());
158 bool GoodFile = checkReleaseVersion(FileTest, m->getVersion());
160 m->mothurOut("I found " + tempConsFile + " in your input file directory. I will use it to save time."); m->mothurOutEndLine(); consfile = tempConsFile; FileTest.close();
165 quanfile = validParameter.validFile(parameters, "quantile", true);
166 if (quanfile == "not open") { abort = true; }
167 else if (quanfile == "not found") { quanfile = ""; }
170 catch(exception& e) {
171 m->errorOut(e, "ChimeraPintailCommand", "ChimeraPintailCommand");
175 //**********************************************************************************************************************
177 void ChimeraPintailCommand::help(){
180 m->mothurOut("The chimera.pintail command reads a fastafile and templatefile and outputs potentially chimeric sequences.\n");
181 m->mothurOut("This command was created using the algorythms described in the 'At Least 1 in 20 16S rRNA Sequence Records Currently Held in the Public Repositories is Estimated To Contain Substantial Anomalies' paper by Kevin E. Ashelford 1, Nadia A. Chuzhanova 3, John C. Fry 1, Antonia J. Jones 2 and Andrew J. Weightman 1.\n");
182 m->mothurOut("The chimera.pintail command parameters are fasta, template, filter, mask, processors, window, increment, conservation and quantile.\n");
183 m->mothurOut("The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required. \n");
184 m->mothurOut("You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n");
185 m->mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences, and is required. \n");
186 m->mothurOut("The filter parameter allows you to specify if you would like to apply a vertical and 50% soft filter. \n");
187 m->mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the your sequences, by default no mask is applied. You can apply an ecoli mask by typing, mask=default. \n");
188 m->mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n");
190 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
192 m->mothurOut("The window parameter allows you to specify the window size for searching for chimeras, default=300. \n");
193 m->mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences, default=25.\n");
194 m->mothurOut("The conservation parameter allows you to enter a frequency file containing the highest bases frequency at each place in the alignment.\n");
195 m->mothurOut("The quantile parameter allows you to enter a file containing quantiles for a template files sequences, if you use the filter the quantile file generated becomes unique to the fasta file you used.\n");
196 m->mothurOut("The chimera.pintail command should be in the following format: \n");
197 m->mothurOut("chimera.pintail(fasta=yourFastaFile, template=yourTemplate) \n");
198 m->mothurOut("Example: chimera.pintail(fasta=AD.align, template=silva.bacteria.fasta) \n");
199 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
201 catch(exception& e) {
202 m->errorOut(e, "ChimeraPintailCommand", "help");
207 //***************************************************************************************************************
209 ChimeraPintailCommand::~ChimeraPintailCommand(){ /* do nothing */ }
211 //***************************************************************************************************************
213 int ChimeraPintailCommand::execute(){
216 if (abort == true) { return 0; }
218 for (int s = 0; s < fastaFileNames.size(); s++) {
220 m->mothurOut("Checking sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine();
222 int start = time(NULL);
225 if (maskfile == "default") { m->mothurOut("I am using the default 236627 EU009184.1 Shigella dysenteriae str. FBD013."); m->mothurOutEndLine(); }
227 //check for quantile to save the time
228 string tempQuan = "";
229 if ((!filter) && (maskfile == "")) {
230 tempQuan = inputDir + getRootName(getSimpleName(templatefile)) + "pintail.quan";
231 }else if ((!filter) && (maskfile != "")) {
232 tempQuan = inputDir + getRootName(getSimpleName(templatefile)) + "pintail.masked.quan";
233 }else if ((filter) && (maskfile != "")) {
234 tempQuan = inputDir + getRootName(getSimpleName(templatefile)) + "pintail.filtered." + getSimpleName(getRootName(fastaFileNames[s])) + "masked.quan";
235 }else if ((filter) && (maskfile == "")) {
236 tempQuan = inputDir + getRootName(getSimpleName(templatefile)) + "pintail.filtered." + getSimpleName(getRootName(fastaFileNames[s])) + "quan";
239 ifstream FileTest(tempQuan.c_str());
241 bool GoodFile = checkReleaseVersion(FileTest, m->getVersion());
243 m->mothurOut("I found " + tempQuan + " in your input file directory. I will use it to save time."); m->mothurOutEndLine(); quanfile = tempQuan; FileTest.close();
247 chimera = new Pintail(fastaFileNames[s], templatefile, filter, processors, maskfile, consfile, quanfile, window, increment, outputDir);
249 string outputFileName, accnosFileName;
250 if (maskfile != "") {
251 outputFileName = outputDir + getRootName(getSimpleName(fastaFileNames[s])) + maskfile + ".pintail.chimeras";
252 accnosFileName = outputDir + getRootName(getSimpleName(fastaFileNames[s])) + maskfile + ".pintail.accnos";
254 outputFileName = outputDir + getRootName(getSimpleName(fastaFileNames[s])) + "pintail.chimeras";
255 accnosFileName = outputDir + getRootName(getSimpleName(fastaFileNames[s])) + "pintail.accnos";
258 if (m->control_pressed) { delete chimera; for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } return 0; }
260 if (chimera->getUnaligned()) {
261 m->mothurOut("Your template sequences are different lengths, please correct."); m->mothurOutEndLine();
265 templateSeqsLength = chimera->getLength();
268 int pid, end, numSeqsPerProcessor;
270 vector<unsigned long int> MPIPos;
273 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
274 MPI_Comm_size(MPI_COMM_WORLD, &processors);
278 MPI_File outMPIAccnos;
280 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
281 int inMode=MPI_MODE_RDONLY;
283 char outFilename[1024];
284 strcpy(outFilename, outputFileName.c_str());
286 char outAccnosFilename[1024];
287 strcpy(outAccnosFilename, accnosFileName.c_str());
289 char inFileName[1024];
290 strcpy(inFileName, fastaFileNames[s].c_str());
292 MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
293 MPI_File_open(MPI_COMM_WORLD, outFilename, outMode, MPI_INFO_NULL, &outMPI);
294 MPI_File_open(MPI_COMM_WORLD, outAccnosFilename, outMode, MPI_INFO_NULL, &outMPIAccnos);
296 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } delete chimera; return 0; }
298 if (pid == 0) { //you are the root process
300 MPIPos = setFilePosFasta(fastaFileNames[s], numSeqs); //fills MPIPos, returns numSeqs
302 //send file positions to all processes
303 for(int i = 1; i < processors; i++) {
304 MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
305 MPI_Send(&MPIPos[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
308 //figure out how many sequences you have to align
309 numSeqsPerProcessor = numSeqs / processors;
310 int startIndex = pid * numSeqsPerProcessor;
311 if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; }
314 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, MPIPos);
316 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); remove(outputFileName.c_str()); remove(accnosFileName.c_str()); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } delete chimera; return 0; }
318 }else{ //you are a child process
319 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
320 MPIPos.resize(numSeqs+1);
321 MPI_Recv(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
323 //figure out how many sequences you have to align
324 numSeqsPerProcessor = numSeqs / processors;
325 int startIndex = pid * numSeqsPerProcessor;
326 if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; }
329 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, MPIPos);
331 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } delete chimera; return 0; }
335 MPI_File_close(&inMPI);
336 MPI_File_close(&outMPI);
337 MPI_File_close(&outMPIAccnos);
338 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
342 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
345 openInputFile(fastaFileNames[s], inFASTA);
346 getNumSeqs(inFASTA, numSeqs);
349 lines.push_back(new linePair(0, numSeqs));
351 driver(lines[0], outputFileName, fastaFileNames[s], accnosFileName);
353 if (m->control_pressed) {
354 remove(outputFileName.c_str());
355 remove(accnosFileName.c_str());
356 for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); }
357 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
363 vector<unsigned long int> positions;
364 processIDS.resize(0);
367 openInputFile(fastaFileNames[s], inFASTA);
370 while(!inFASTA.eof()){
371 input = getline(inFASTA);
372 if (input.length() != 0) {
373 if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); }
378 numSeqs = positions.size();
380 int numSeqsPerProcessor = numSeqs / processors;
382 for (int i = 0; i < processors; i++) {
383 unsigned long int startPos = positions[ i * numSeqsPerProcessor ];
384 if(i == processors - 1){
385 numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor;
387 lines.push_back(new linePair(startPos, numSeqsPerProcessor));
390 createProcesses(outputFileName, fastaFileNames[s], accnosFileName);
392 rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str());
393 rename((accnosFileName + toString(processIDS[0]) + ".temp").c_str(), accnosFileName.c_str());
395 //append output files
396 for(int i=1;i<processors;i++){
397 appendFiles((outputFileName + toString(processIDS[i]) + ".temp"), outputFileName);
398 remove((outputFileName + toString(processIDS[i]) + ".temp").c_str());
401 //append output files
402 for(int i=1;i<processors;i++){
403 appendFiles((accnosFileName + toString(processIDS[i]) + ".temp"), accnosFileName);
404 remove((accnosFileName + toString(processIDS[i]) + ".temp").c_str());
407 if (m->control_pressed) {
408 remove(outputFileName.c_str());
409 remove(accnosFileName.c_str());
410 for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); }
411 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
419 openInputFile(fastaFileNames[s], inFASTA);
420 getNumSeqs(inFASTA, numSeqs);
422 lines.push_back(new linePair(0, numSeqs));
424 driver(lines[0], outputFileName, fastaFileNames[s], accnosFileName);
426 if (m->control_pressed) {
427 remove(outputFileName.c_str());
428 remove(accnosFileName.c_str());
429 for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); }
430 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
439 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
441 outputNames.push_back(outputFileName);
442 outputNames.push_back(accnosFileName);
444 m->mothurOutEndLine();
445 m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); m->mothurOutEndLine();
448 m->mothurOutEndLine();
449 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
450 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
451 m->mothurOutEndLine();
456 catch(exception& e) {
457 m->errorOut(e, "ChimeraPintailCommand", "execute");
461 //**********************************************************************************************************************
463 int ChimeraPintailCommand::driver(linePair* line, string outputFName, string filename, string accnos){
466 openOutputFile(outputFName, out);
469 openOutputFile(accnos, out2);
472 openInputFile(filename, inFASTA);
474 inFASTA.seekg(line->start);
476 for(int i=0;i<line->numSeqs;i++){
478 if (m->control_pressed) { return 1; }
480 Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA);
482 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
484 if (candidateSeq->getAligned().length() != templateSeqsLength) { //chimeracheck does not require seqs to be aligned
485 m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); m->mothurOutEndLine();
488 chimera->getChimeras(candidateSeq);
490 if (m->control_pressed) { delete candidateSeq; return 1; }
493 chimera->print(out, out2);
499 if((i+1) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(i+1)); m->mothurOutEndLine(); }
502 if((line->numSeqs) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(line->numSeqs)); m->mothurOutEndLine(); }
510 catch(exception& e) {
511 m->errorOut(e, "ChimeraPintailCommand", "driver");
515 //**********************************************************************************************************************
517 int ChimeraPintailCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, vector<unsigned long int>& MPIPos){
522 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
524 for(int i=0;i<num;i++){
526 if (m->control_pressed) { return 1; }
529 int length = MPIPos[start+i+1] - MPIPos[start+i];
531 char* buf4 = new char[length];
532 MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
534 string tempBuf = buf4;
535 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
536 istringstream iss (tempBuf,istringstream::in);
539 Sequence* candidateSeq = new Sequence(iss); gobble(iss);
541 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
543 if (candidateSeq->getAligned().length() != templateSeqsLength) { //chimeracheck does not require seqs to be aligned
544 m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); m->mothurOutEndLine();
547 chimera->getChimeras(candidateSeq);
549 if (m->control_pressed) { delete candidateSeq; return 1; }
552 bool isChimeric = chimera->print(outMPI, outAccMPI);
558 if((i+1) % 100 == 0){ cout << "Processing sequence: " << (i+1) << endl; m->mothurOutJustToLog("Processing sequence: " + toString(i+1) + "\n"); }
561 if(num % 100 != 0){ cout << "Processing sequence: " << num << endl; m->mothurOutJustToLog("Processing sequence: " + toString(num) + "\n"); }
566 catch(exception& e) {
567 m->errorOut(e, "ChimeraPintailCommand", "driverMPI");
573 /**************************************************************************************************/
575 int ChimeraPintailCommand::createProcesses(string outputFileName, string filename, string accnos) {
577 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
579 // processIDS.resize(0);
581 //loop through and create all the processes you want
582 while (process != processors) {
586 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
589 driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename, accnos + toString(getpid()) + ".temp");
591 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
594 //force parent to wait until all the processes are done
595 for (int i=0;i<processors;i++) {
596 int temp = processIDS[i];
603 catch(exception& e) {
604 m->errorOut(e, "ChimeraPintailCommand", "createProcesses");
609 /**************************************************************************************************/