2 * chimerapintailcommand.cpp
5 * Created by westcott on 4/1/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "chimerapintailcommand.h"
13 //***************************************************************************************************************
15 ChimeraPintailCommand::ChimeraPintailCommand(string option) {
19 //allow user to run help
20 if(option == "help") { help(); abort = true; }
23 //valid paramters for this command
24 string Array[] = {"fasta","filter","processors","window" ,"increment","template","conservation","quantile","mask","outputdir","inputdir"};
25 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
27 OptionParser parser(option);
28 map<string,string> parameters = parser.getParameters();
30 ValidParameters validParameter("chimera.pintail");
31 map<string,string>::iterator it;
33 //check to make sure all parameters are valid for command
34 for (it = parameters.begin(); it != parameters.end(); it++) {
35 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
38 //if the user changes the input directory command factory will send this info to us in the output parameter
39 inputDir = validParameter.validFile(parameters, "inputdir", false);
40 if (inputDir == "not found"){ inputDir = ""; }
43 it = parameters.find("template");
44 //user has given a template file
45 if(it != parameters.end()){
46 path = hasPath(it->second);
47 //if the user has not given a path then, add inputdir. else leave path alone.
48 if (path == "") { parameters["template"] = inputDir + it->second; }
51 it = parameters.find("conservation");
52 //user has given a template file
53 if(it != parameters.end()){
54 path = hasPath(it->second);
55 //if the user has not given a path then, add inputdir. else leave path alone.
56 if (path == "") { parameters["conservation"] = inputDir + it->second; }
59 it = parameters.find("quantile");
60 //user has given a template file
61 if(it != parameters.end()){
62 path = hasPath(it->second);
63 //if the user has not given a path then, add inputdir. else leave path alone.
64 if (path == "") { parameters["quantile"] = inputDir + it->second; }
69 //check for required parameters
70 fastafile = validParameter.validFile(parameters, "fasta", false);
71 if (fastafile == "not found") { fastafile = ""; m->mothurOut("fasta is a required parameter for the chimera.pintail command."); m->mothurOutEndLine(); abort = true; }
73 splitAtDash(fastafile, fastaFileNames);
75 //go through files and make sure they are good, if not, then disregard them
76 for (int i = 0; i < fastaFileNames.size(); i++) {
78 string path = hasPath(fastaFileNames[i]);
79 //if the user has not given a path then, add inputdir. else leave path alone.
80 if (path == "") { fastaFileNames[i] = inputDir + fastaFileNames[i]; }
88 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
89 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
94 ableToOpen = openInputFile(fastaFileNames[i], in, "noerror");
96 //if you can't open it, try default location
97 if (ableToOpen == 1) {
98 if (m->getDefaultPath() != "") { //default path is set
99 string tryPath = m->getDefaultPath() + getSimpleName(fastaFileNames[i]);
100 m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
101 ableToOpen = openInputFile(tryPath, in, "noerror");
102 fastaFileNames[i] = tryPath;
108 for (int j = 1; j < processors; j++) {
109 MPI_Send(&ableToOpen, 1, MPI_INT, j, 2001, MPI_COMM_WORLD);
113 MPI_Recv(&ableToOpen, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
118 if (ableToOpen == 1) {
119 m->mothurOut("Unable to open " + fastaFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine();
120 //erase from file list
121 fastaFileNames.erase(fastaFileNames.begin()+i);
126 //make sure there is at least one valid file left
127 if (fastaFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; }
131 temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "F"; }
132 filter = isTrue(temp);
134 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; }
135 convert(temp, processors);
137 temp = validParameter.validFile(parameters, "window", false); if (temp == "not found") { temp = "0"; }
138 convert(temp, window);
140 temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "25"; }
141 convert(temp, increment);
143 maskfile = validParameter.validFile(parameters, "mask", false);
144 if (maskfile == "not found") { maskfile = ""; }
145 else if (maskfile != "default") {
146 if (inputDir != "") {
147 string path = hasPath(maskfile);
148 //if the user has not given a path then, add inputdir. else leave path alone.
149 if (path == "") { maskfile = inputDir + maskfile; }
153 int ableToOpen = openInputFile(maskfile, in);
154 if (ableToOpen == 1) { abort = true; }
159 //if the user changes the output directory command factory will send this info to us in the output parameter
160 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
162 outputDir += hasPath(fastafile); //if user entered a file with a path then preserve it
165 templatefile = validParameter.validFile(parameters, "template", true);
166 if (templatefile == "not open") { abort = true; }
167 else if (templatefile == "not found") { templatefile = ""; m->mothurOut("template is a required parameter for the chimera.pintail command."); m->mothurOutEndLine(); abort = true; }
169 consfile = validParameter.validFile(parameters, "conservation", true);
170 if (consfile == "not open") { abort = true; }
171 else if (consfile == "not found") {
174 string tempConsFile = getRootName(inputDir + getSimpleName(templatefile)) + "freq";
175 ifstream FileTest(tempConsFile.c_str());
177 bool GoodFile = checkReleaseVersion(FileTest, m->getVersion());
179 m->mothurOut("I found " + tempConsFile + " in your input file directory. I will use it to save time."); m->mothurOutEndLine(); consfile = tempConsFile; FileTest.close();
184 quanfile = validParameter.validFile(parameters, "quantile", true);
185 if (quanfile == "not open") { abort = true; }
186 else if (quanfile == "not found") { quanfile = ""; }
189 catch(exception& e) {
190 m->errorOut(e, "ChimeraPintailCommand", "ChimeraPintailCommand");
194 //**********************************************************************************************************************
196 void ChimeraPintailCommand::help(){
199 m->mothurOut("The chimera.pintail command reads a fastafile and templatefile and outputs potentially chimeric sequences.\n");
200 m->mothurOut("This command was created using the algorythms described in the 'At Least 1 in 20 16S rRNA Sequence Records Currently Held in the Public Repositories is Estimated To Contain Substantial Anomalies' paper by Kevin E. Ashelford 1, Nadia A. Chuzhanova 3, John C. Fry 1, Antonia J. Jones 2 and Andrew J. Weightman 1.\n");
201 m->mothurOut("The chimera.pintail command parameters are fasta, template, filter, mask, processors, window, increment, conservation and quantile.\n");
202 m->mothurOut("The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required. \n");
203 m->mothurOut("You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n");
204 m->mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences, and is required. \n");
205 m->mothurOut("The filter parameter allows you to specify if you would like to apply a vertical and 50% soft filter. \n");
206 m->mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the your sequences, by default no mask is applied. You can apply an ecoli mask by typing, mask=default. \n");
207 m->mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n");
209 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
211 m->mothurOut("The window parameter allows you to specify the window size for searching for chimeras, default=300. \n");
212 m->mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences, default=25.\n");
213 m->mothurOut("The conservation parameter allows you to enter a frequency file containing the highest bases frequency at each place in the alignment.\n");
214 m->mothurOut("The quantile parameter allows you to enter a file containing quantiles for a template files sequences, if you use the filter the quantile file generated becomes unique to the fasta file you used.\n");
215 m->mothurOut("The chimera.pintail command should be in the following format: \n");
216 m->mothurOut("chimera.pintail(fasta=yourFastaFile, template=yourTemplate) \n");
217 m->mothurOut("Example: chimera.pintail(fasta=AD.align, template=silva.bacteria.fasta) \n");
218 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
220 catch(exception& e) {
221 m->errorOut(e, "ChimeraPintailCommand", "help");
226 //***************************************************************************************************************
228 ChimeraPintailCommand::~ChimeraPintailCommand(){ /* do nothing */ }
230 //***************************************************************************************************************
232 int ChimeraPintailCommand::execute(){
235 if (abort == true) { return 0; }
237 for (int s = 0; s < fastaFileNames.size(); s++) {
239 m->mothurOut("Checking sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine();
241 int start = time(NULL);
244 if (maskfile == "default") { m->mothurOut("I am using the default 236627 EU009184.1 Shigella dysenteriae str. FBD013."); m->mothurOutEndLine(); }
246 //check for quantile to save the time
247 string tempQuan = "";
248 if ((!filter) && (maskfile == "")) {
249 tempQuan = inputDir + getRootName(getSimpleName(templatefile)) + "pintail.quan";
250 }else if ((!filter) && (maskfile != "")) {
251 tempQuan = inputDir + getRootName(getSimpleName(templatefile)) + "pintail.masked.quan";
252 }else if ((filter) && (maskfile != "")) {
253 tempQuan = inputDir + getRootName(getSimpleName(templatefile)) + "pintail.filtered." + getSimpleName(getRootName(fastaFileNames[s])) + "masked.quan";
254 }else if ((filter) && (maskfile == "")) {
255 tempQuan = inputDir + getRootName(getSimpleName(templatefile)) + "pintail.filtered." + getSimpleName(getRootName(fastaFileNames[s])) + "quan";
258 ifstream FileTest(tempQuan.c_str());
260 bool GoodFile = checkReleaseVersion(FileTest, m->getVersion());
262 m->mothurOut("I found " + tempQuan + " in your input file directory. I will use it to save time."); m->mothurOutEndLine(); quanfile = tempQuan; FileTest.close();
266 chimera = new Pintail(fastaFileNames[s], templatefile, filter, processors, maskfile, consfile, quanfile, window, increment, outputDir);
268 string outputFileName, accnosFileName;
269 if (maskfile != "") {
270 outputFileName = outputDir + getRootName(getSimpleName(fastaFileNames[s])) + maskfile + ".pintail.chimeras";
271 accnosFileName = outputDir + getRootName(getSimpleName(fastaFileNames[s])) + maskfile + ".pintail.accnos";
273 outputFileName = outputDir + getRootName(getSimpleName(fastaFileNames[s])) + "pintail.chimeras";
274 accnosFileName = outputDir + getRootName(getSimpleName(fastaFileNames[s])) + "pintail.accnos";
277 if (m->control_pressed) { delete chimera; for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } return 0; }
279 if (chimera->getUnaligned()) {
280 m->mothurOut("Your template sequences are different lengths, please correct."); m->mothurOutEndLine();
284 templateSeqsLength = chimera->getLength();
287 int pid, end, numSeqsPerProcessor;
289 vector<unsigned long int> MPIPos;
292 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
293 MPI_Comm_size(MPI_COMM_WORLD, &processors);
297 MPI_File outMPIAccnos;
299 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
300 int inMode=MPI_MODE_RDONLY;
302 char outFilename[1024];
303 strcpy(outFilename, outputFileName.c_str());
305 char outAccnosFilename[1024];
306 strcpy(outAccnosFilename, accnosFileName.c_str());
308 char inFileName[1024];
309 strcpy(inFileName, fastaFileNames[s].c_str());
311 MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
312 MPI_File_open(MPI_COMM_WORLD, outFilename, outMode, MPI_INFO_NULL, &outMPI);
313 MPI_File_open(MPI_COMM_WORLD, outAccnosFilename, outMode, MPI_INFO_NULL, &outMPIAccnos);
315 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } delete chimera; return 0; }
317 if (pid == 0) { //you are the root process
319 MPIPos = setFilePosFasta(fastaFileNames[s], numSeqs); //fills MPIPos, returns numSeqs
321 //send file positions to all processes
322 for(int i = 1; i < processors; i++) {
323 MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
324 MPI_Send(&MPIPos[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
327 //figure out how many sequences you have to align
328 numSeqsPerProcessor = numSeqs / processors;
329 int startIndex = pid * numSeqsPerProcessor;
330 if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; }
333 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, MPIPos);
335 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); remove(outputFileName.c_str()); remove(accnosFileName.c_str()); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } delete chimera; return 0; }
337 }else{ //you are a child process
338 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
339 MPIPos.resize(numSeqs+1);
340 MPI_Recv(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
342 //figure out how many sequences you have to align
343 numSeqsPerProcessor = numSeqs / processors;
344 int startIndex = pid * numSeqsPerProcessor;
345 if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; }
348 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, MPIPos);
350 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } delete chimera; return 0; }
354 MPI_File_close(&inMPI);
355 MPI_File_close(&outMPI);
356 MPI_File_close(&outMPIAccnos);
357 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
361 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
364 openInputFile(fastaFileNames[s], inFASTA);
365 getNumSeqs(inFASTA, numSeqs);
368 lines.push_back(new linePair(0, numSeqs));
370 driver(lines[0], outputFileName, fastaFileNames[s], accnosFileName);
372 if (m->control_pressed) {
373 remove(outputFileName.c_str());
374 remove(accnosFileName.c_str());
375 for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); }
376 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
382 vector<unsigned long int> positions;
383 processIDS.resize(0);
386 openInputFile(fastaFileNames[s], inFASTA);
389 while(!inFASTA.eof()){
390 input = getline(inFASTA);
391 if (input.length() != 0) {
392 if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); }
397 numSeqs = positions.size();
399 int numSeqsPerProcessor = numSeqs / processors;
401 for (int i = 0; i < processors; i++) {
402 unsigned long int startPos = positions[ i * numSeqsPerProcessor ];
403 if(i == processors - 1){
404 numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor;
406 lines.push_back(new linePair(startPos, numSeqsPerProcessor));
409 createProcesses(outputFileName, fastaFileNames[s], accnosFileName);
411 rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str());
412 rename((accnosFileName + toString(processIDS[0]) + ".temp").c_str(), accnosFileName.c_str());
414 //append output files
415 for(int i=1;i<processors;i++){
416 appendFiles((outputFileName + toString(processIDS[i]) + ".temp"), outputFileName);
417 remove((outputFileName + toString(processIDS[i]) + ".temp").c_str());
420 //append output files
421 for(int i=1;i<processors;i++){
422 appendFiles((accnosFileName + toString(processIDS[i]) + ".temp"), accnosFileName);
423 remove((accnosFileName + toString(processIDS[i]) + ".temp").c_str());
426 if (m->control_pressed) {
427 remove(outputFileName.c_str());
428 remove(accnosFileName.c_str());
429 for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); }
430 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
438 openInputFile(fastaFileNames[s], inFASTA);
439 getNumSeqs(inFASTA, numSeqs);
441 lines.push_back(new linePair(0, numSeqs));
443 driver(lines[0], outputFileName, fastaFileNames[s], accnosFileName);
445 if (m->control_pressed) {
446 remove(outputFileName.c_str());
447 remove(accnosFileName.c_str());
448 for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); }
449 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
458 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
460 outputNames.push_back(outputFileName);
461 outputNames.push_back(accnosFileName);
463 m->mothurOutEndLine();
464 m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); m->mothurOutEndLine();
467 m->mothurOutEndLine();
468 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
469 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
470 m->mothurOutEndLine();
475 catch(exception& e) {
476 m->errorOut(e, "ChimeraPintailCommand", "execute");
480 //**********************************************************************************************************************
482 int ChimeraPintailCommand::driver(linePair* line, string outputFName, string filename, string accnos){
485 openOutputFile(outputFName, out);
488 openOutputFile(accnos, out2);
491 openInputFile(filename, inFASTA);
493 inFASTA.seekg(line->start);
495 for(int i=0;i<line->numSeqs;i++){
497 if (m->control_pressed) { return 1; }
499 Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA);
501 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
503 if (candidateSeq->getAligned().length() != templateSeqsLength) { //chimeracheck does not require seqs to be aligned
504 m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); m->mothurOutEndLine();
507 chimera->getChimeras(candidateSeq);
509 if (m->control_pressed) { delete candidateSeq; return 1; }
512 chimera->print(out, out2);
518 if((i+1) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(i+1)); m->mothurOutEndLine(); }
521 if((line->numSeqs) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(line->numSeqs)); m->mothurOutEndLine(); }
529 catch(exception& e) {
530 m->errorOut(e, "ChimeraPintailCommand", "driver");
534 //**********************************************************************************************************************
536 int ChimeraPintailCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, vector<unsigned long int>& MPIPos){
541 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
543 for(int i=0;i<num;i++){
545 if (m->control_pressed) { return 1; }
548 int length = MPIPos[start+i+1] - MPIPos[start+i];
550 char* buf4 = new char[length];
551 MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
553 string tempBuf = buf4;
554 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
555 istringstream iss (tempBuf,istringstream::in);
558 Sequence* candidateSeq = new Sequence(iss); gobble(iss);
560 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
562 if (candidateSeq->getAligned().length() != templateSeqsLength) { //chimeracheck does not require seqs to be aligned
563 m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); m->mothurOutEndLine();
566 chimera->getChimeras(candidateSeq);
568 if (m->control_pressed) { delete candidateSeq; return 1; }
571 bool isChimeric = chimera->print(outMPI, outAccMPI);
577 if((i+1) % 100 == 0){ cout << "Processing sequence: " << (i+1) << endl; m->mothurOutJustToLog("Processing sequence: " + toString(i+1) + "\n"); }
580 if(num % 100 != 0){ cout << "Processing sequence: " << num << endl; m->mothurOutJustToLog("Processing sequence: " + toString(num) + "\n"); }
585 catch(exception& e) {
586 m->errorOut(e, "ChimeraPintailCommand", "driverMPI");
592 /**************************************************************************************************/
594 int ChimeraPintailCommand::createProcesses(string outputFileName, string filename, string accnos) {
596 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
598 // processIDS.resize(0);
600 //loop through and create all the processes you want
601 while (process != processors) {
605 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
608 driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename, accnos + toString(getpid()) + ".temp");
610 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
613 //force parent to wait until all the processes are done
614 for (int i=0;i<processors;i++) {
615 int temp = processIDS[i];
622 catch(exception& e) {
623 m->errorOut(e, "ChimeraPintailCommand", "createProcesses");
628 /**************************************************************************************************/