2 * chimerapintailcommand.cpp
5 * Created by westcott on 4/1/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "chimerapintailcommand.h"
13 //***************************************************************************************************************
15 ChimeraPintailCommand::ChimeraPintailCommand(string option) {
19 //allow user to run help
20 if(option == "help") { help(); abort = true; }
23 //valid paramters for this command
24 string Array[] = {"fasta","filter","processors","window" ,"increment","template","conservation","quantile","mask","outputdir","inputdir"};
25 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
27 OptionParser parser(option);
28 map<string,string> parameters = parser.getParameters();
30 ValidParameters validParameter;
31 map<string,string>::iterator it;
33 //check to make sure all parameters are valid for command
34 for (it = parameters.begin(); it != parameters.end(); it++) {
35 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
38 //if the user changes the input directory command factory will send this info to us in the output parameter
39 inputDir = validParameter.validFile(parameters, "inputdir", false);
40 if (inputDir == "not found"){ inputDir = ""; }
43 it = parameters.find("template");
44 //user has given a template file
45 if(it != parameters.end()){
46 path = hasPath(it->second);
47 //if the user has not given a path then, add inputdir. else leave path alone.
48 if (path == "") { parameters["template"] = inputDir + it->second; }
51 it = parameters.find("conservation");
52 //user has given a template file
53 if(it != parameters.end()){
54 path = hasPath(it->second);
55 //if the user has not given a path then, add inputdir. else leave path alone.
56 if (path == "") { parameters["conservation"] = inputDir + it->second; }
59 it = parameters.find("quantile");
60 //user has given a template file
61 if(it != parameters.end()){
62 path = hasPath(it->second);
63 //if the user has not given a path then, add inputdir. else leave path alone.
64 if (path == "") { parameters["quantile"] = inputDir + it->second; }
69 //check for required parameters
70 fastafile = validParameter.validFile(parameters, "fasta", false);
71 if (fastafile == "not found") { fastafile = ""; m->mothurOut("fasta is a required parameter for the chimera.pintail command."); m->mothurOutEndLine(); abort = true; }
73 splitAtDash(fastafile, fastaFileNames);
75 //go through files and make sure they are good, if not, then disregard them
76 for (int i = 0; i < fastaFileNames.size(); i++) {
78 string path = hasPath(fastaFileNames[i]);
79 //if the user has not given a path then, add inputdir. else leave path alone.
80 if (path == "") { fastaFileNames[i] = inputDir + fastaFileNames[i]; }
88 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
89 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
94 ableToOpen = openInputFile(fastaFileNames[i], in);
98 for (int j = 1; j < processors; j++) {
99 MPI_Send(&ableToOpen, 1, MPI_INT, j, 2001, MPI_COMM_WORLD);
103 MPI_Recv(&ableToOpen, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
108 if (ableToOpen == 1) {
109 m->mothurOut(fastaFileNames[i] + " will be disregarded."); m->mothurOutEndLine();
110 //erase from file list
111 fastaFileNames.erase(fastaFileNames.begin()+i);
116 //make sure there is at least one valid file left
117 if (fastaFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; }
121 temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "F"; }
122 filter = isTrue(temp);
124 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; }
125 convert(temp, processors);
127 temp = validParameter.validFile(parameters, "window", false); if (temp == "not found") { temp = "0"; }
128 convert(temp, window);
130 temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "25"; }
131 convert(temp, increment);
133 maskfile = validParameter.validFile(parameters, "mask", false);
134 if (maskfile == "not found") { maskfile = ""; }
135 else if (maskfile != "default") {
136 if (inputDir != "") {
137 string path = hasPath(maskfile);
138 //if the user has not given a path then, add inputdir. else leave path alone.
139 if (path == "") { maskfile = inputDir + maskfile; }
143 int ableToOpen = openInputFile(maskfile, in);
144 if (ableToOpen == 1) { abort = true; }
149 //if the user changes the output directory command factory will send this info to us in the output parameter
150 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
152 outputDir += hasPath(fastafile); //if user entered a file with a path then preserve it
155 templatefile = validParameter.validFile(parameters, "template", true);
156 if (templatefile == "not open") { abort = true; }
157 else if (templatefile == "not found") { templatefile = ""; m->mothurOut("template is a required parameter for the chimera.pintail command."); m->mothurOutEndLine(); abort = true; }
159 consfile = validParameter.validFile(parameters, "conservation", true);
160 if (consfile == "not open") { abort = true; }
161 else if (consfile == "not found") {
164 string tempConsFile = getRootName(inputDir + getSimpleName(templatefile)) + "freq";
165 ifstream FileTest(tempConsFile.c_str());
166 if(FileTest){ m->mothurOut("I found " + tempConsFile + " in your input file directory. I will use it to save time."); m->mothurOutEndLine(); consfile = tempConsFile; FileTest.close(); }
169 quanfile = validParameter.validFile(parameters, "quantile", true);
170 if (quanfile == "not open") { abort = true; }
171 else if (quanfile == "not found") { quanfile = ""; }
174 catch(exception& e) {
175 m->errorOut(e, "ChimeraPintailCommand", "ChimeraPintailCommand");
179 //**********************************************************************************************************************
181 void ChimeraPintailCommand::help(){
184 m->mothurOut("The chimera.pintail command reads a fastafile and templatefile and outputs potentially chimeric sequences.\n");
185 m->mothurOut("This command was created using the algorythms described in the 'At Least 1 in 20 16S rRNA Sequence Records Currently Held in the Public Repositories is Estimated To Contain Substantial Anomalies' paper by Kevin E. Ashelford 1, Nadia A. Chuzhanova 3, John C. Fry 1, Antonia J. Jones 2 and Andrew J. Weightman 1.\n");
186 m->mothurOut("The chimera.pintail command parameters are fasta, template, filter, mask, processors, window, increment, conservation and quantile.\n");
187 m->mothurOut("The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required. \n");
188 m->mothurOut("You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n");
189 m->mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences, and is required. \n");
190 m->mothurOut("The filter parameter allows you to specify if you would like to apply a vertical and 50% soft filter. \n");
191 m->mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the your sequences, by default no mask is applied. You can apply an ecoli mask by typing, mask=default. \n");
192 m->mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n");
194 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
196 m->mothurOut("The window parameter allows you to specify the window size for searching for chimeras, default=300. \n");
197 m->mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences, default=25.\n");
198 m->mothurOut("The conservation parameter allows you to enter a frequency file containing the highest bases frequency at each place in the alignment.\n");
199 m->mothurOut("The quantile parameter allows you to enter a file containing quantiles for a template files sequences, if you use the filter the quantile file generated becomes unique to the fasta file you used.\n");
200 m->mothurOut("The chimera.pintail command should be in the following format: \n");
201 m->mothurOut("chimera.pintail(fasta=yourFastaFile, template=yourTemplate) \n");
202 m->mothurOut("Example: chimera.pintail(fasta=AD.align, template=silva.bacteria.fasta) \n");
203 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
205 catch(exception& e) {
206 m->errorOut(e, "ChimeraPintailCommand", "help");
211 //***************************************************************************************************************
213 ChimeraPintailCommand::~ChimeraPintailCommand(){ /* do nothing */ }
215 //***************************************************************************************************************
217 int ChimeraPintailCommand::execute(){
220 if (abort == true) { return 0; }
222 for (int s = 0; s < fastaFileNames.size(); s++) {
224 m->mothurOut("Checking sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine();
226 int start = time(NULL);
229 if (maskfile == "default") { m->mothurOut("I am using the default 236627 EU009184.1 Shigella dysenteriae str. FBD013."); m->mothurOutEndLine(); }
231 //check for quantile to save the time
232 string tempQuan = "";
233 if ((!filter) && (maskfile == "")) {
234 tempQuan = inputDir + getRootName(getSimpleName(templatefile)) + "pintail.quan";
235 }else if ((!filter) && (maskfile != "")) {
236 tempQuan = inputDir + getRootName(getSimpleName(templatefile)) + "pintail.masked.quan";
237 }else if ((filter) && (maskfile != "")) {
238 tempQuan = inputDir + getRootName(getSimpleName(templatefile)) + "pintail.filtered." + getSimpleName(getRootName(fastaFileNames[s])) + "masked.quan";
239 }else if ((filter) && (maskfile == "")) {
240 tempQuan = inputDir + getRootName(getSimpleName(templatefile)) + "pintail.filtered." + getSimpleName(getRootName(fastaFileNames[s])) + "quan";
243 ifstream FileTest(tempQuan.c_str());
244 if(FileTest){ m->mothurOut("I found " + tempQuan + " in your input file directory. I will use it to save time."); m->mothurOutEndLine(); quanfile = tempQuan; FileTest.close(); }
246 chimera = new Pintail(fastaFileNames[s], templatefile, filter, processors, maskfile, consfile, quanfile, window, increment, outputDir);
248 string outputFileName, accnosFileName;
249 if (maskfile != "") {
250 outputFileName = outputDir + getRootName(getSimpleName(fastaFileNames[s])) + maskfile + ".pintail.chimeras";
251 accnosFileName = outputDir + getRootName(getSimpleName(fastaFileNames[s])) + maskfile + ".pintail.accnos";
253 outputFileName = outputDir + getRootName(getSimpleName(fastaFileNames[s])) + "pintail.chimeras";
254 accnosFileName = outputDir + getRootName(getSimpleName(fastaFileNames[s])) + "pintail.accnos";
256 bool hasAccnos = true;
258 if (m->control_pressed) { delete chimera; for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } return 0; }
260 if (chimera->getUnaligned()) {
261 m->mothurOut("Your template sequences are different lengths, please correct."); m->mothurOutEndLine();
265 templateSeqsLength = chimera->getLength();
268 int pid, end, numSeqsPerProcessor;
271 MPIWroteAccnos = false;
274 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
275 MPI_Comm_size(MPI_COMM_WORLD, &processors);
279 MPI_File outMPIAccnos;
281 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
282 int inMode=MPI_MODE_RDONLY;
284 char outFilename[1024];
285 strcpy(outFilename, outputFileName.c_str());
287 char outAccnosFilename[1024];
288 strcpy(outAccnosFilename, accnosFileName.c_str());
290 char inFileName[1024];
291 strcpy(inFileName, fastaFileNames[s].c_str());
293 MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
294 MPI_File_open(MPI_COMM_WORLD, outFilename, outMode, MPI_INFO_NULL, &outMPI);
295 MPI_File_open(MPI_COMM_WORLD, outAccnosFilename, outMode, MPI_INFO_NULL, &outMPIAccnos);
297 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } delete chimera; return 0; }
299 if (pid == 0) { //you are the root process
301 MPIPos = setFilePosFasta(fastaFileNames[s], numSeqs); //fills MPIPos, returns numSeqs
303 //send file positions to all processes
304 for(int i = 1; i < processors; i++) {
305 MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
306 MPI_Send(&MPIPos[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
309 //figure out how many sequences you have to align
310 numSeqsPerProcessor = numSeqs / processors;
311 int startIndex = pid * numSeqsPerProcessor;
312 if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; }
315 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, MPIPos);
317 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); remove(outputFileName.c_str()); remove(accnosFileName.c_str()); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } delete chimera; return 0; }
319 for (int i = 1; i < processors; i++) {
321 MPI_Recv(&tempResult, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
322 if (tempResult != 0) { MPIWroteAccnos = true; }
324 }else{ //you are a child process
325 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
326 MPIPos.resize(numSeqs+1);
327 MPI_Recv(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
329 //figure out how many sequences you have to align
330 numSeqsPerProcessor = numSeqs / processors;
331 int startIndex = pid * numSeqsPerProcessor;
332 if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; }
335 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, MPIPos);
337 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } delete chimera; return 0; }
339 MPI_Send(&MPIWroteAccnos, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
343 MPI_File_close(&inMPI);
344 MPI_File_close(&outMPI);
345 MPI_File_close(&outMPIAccnos);
346 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
348 //delete accnos file if blank
350 if (!MPIWroteAccnos) {
352 remove(accnosFileName.c_str());
359 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
362 openInputFile(fastaFileNames[s], inFASTA);
363 getNumSeqs(inFASTA, numSeqs);
366 lines.push_back(new linePair(0, numSeqs));
368 driver(lines[0], outputFileName, fastaFileNames[s], accnosFileName);
370 if (m->control_pressed) {
371 remove(outputFileName.c_str());
372 remove(accnosFileName.c_str());
373 for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); }
374 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
379 //delete accnos file if its blank
380 if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); hasAccnos = false; }
383 vector<int> positions;
384 processIDS.resize(0);
387 openInputFile(fastaFileNames[s], inFASTA);
390 while(!inFASTA.eof()){
391 input = getline(inFASTA);
392 if (input.length() != 0) {
393 if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); }
398 numSeqs = positions.size();
400 int numSeqsPerProcessor = numSeqs / processors;
402 for (int i = 0; i < processors; i++) {
403 long int startPos = positions[ i * numSeqsPerProcessor ];
404 if(i == processors - 1){
405 numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor;
407 lines.push_back(new linePair(startPos, numSeqsPerProcessor));
410 createProcesses(outputFileName, fastaFileNames[s], accnosFileName);
412 rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str());
414 //append output files
415 for(int i=1;i<processors;i++){
416 appendFiles((outputFileName + toString(processIDS[i]) + ".temp"), outputFileName);
417 remove((outputFileName + toString(processIDS[i]) + ".temp").c_str());
420 vector<string> nonBlankAccnosFiles;
421 //delete blank accnos files generated with multiple processes
422 for(int i=0;i<processors;i++){
423 if (!(isBlank(accnosFileName + toString(processIDS[i]) + ".temp"))) {
424 nonBlankAccnosFiles.push_back(accnosFileName + toString(processIDS[i]) + ".temp");
425 }else { remove((accnosFileName + toString(processIDS[i]) + ".temp").c_str()); }
428 //append accnos files
429 if (nonBlankAccnosFiles.size() != 0) {
430 rename(nonBlankAccnosFiles[0].c_str(), accnosFileName.c_str());
432 for (int h=1; h < nonBlankAccnosFiles.size(); h++) {
433 appendFiles(nonBlankAccnosFiles[h], accnosFileName);
434 remove(nonBlankAccnosFiles[h].c_str());
436 }else{ hasAccnos = false; }
438 if (m->control_pressed) {
439 remove(outputFileName.c_str());
440 remove(accnosFileName.c_str());
441 for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); }
442 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
450 openInputFile(fastaFileNames[s], inFASTA);
451 getNumSeqs(inFASTA, numSeqs);
453 lines.push_back(new linePair(0, numSeqs));
455 driver(lines[0], outputFileName, fastaFileNames[s], accnosFileName);
457 if (m->control_pressed) {
458 remove(outputFileName.c_str());
459 remove(accnosFileName.c_str());
460 for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); }
461 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
466 //delete accnos file if its blank
467 if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); hasAccnos = false; }
473 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
475 outputNames.push_back(outputFileName);
476 if (hasAccnos) { outputNames.push_back(accnosFileName); }
478 m->mothurOutEndLine();
479 m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); m->mothurOutEndLine();
482 m->mothurOutEndLine();
483 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
484 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
485 m->mothurOutEndLine();
490 catch(exception& e) {
491 m->errorOut(e, "ChimeraPintailCommand", "execute");
495 //**********************************************************************************************************************
497 int ChimeraPintailCommand::driver(linePair* line, string outputFName, string filename, string accnos){
500 openOutputFile(outputFName, out);
503 openOutputFile(accnos, out2);
506 openInputFile(filename, inFASTA);
508 inFASTA.seekg(line->start);
510 for(int i=0;i<line->numSeqs;i++){
512 if (m->control_pressed) { return 1; }
514 Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA);
516 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
518 if (candidateSeq->getAligned().length() != templateSeqsLength) { //chimeracheck does not require seqs to be aligned
519 m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); m->mothurOutEndLine();
522 chimera->getChimeras(candidateSeq);
524 if (m->control_pressed) { delete candidateSeq; return 1; }
527 chimera->print(out, out2);
533 if((i+1) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(i+1)); m->mothurOutEndLine(); }
536 if((line->numSeqs) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(line->numSeqs)); m->mothurOutEndLine(); }
544 catch(exception& e) {
545 m->errorOut(e, "ChimeraPintailCommand", "driver");
549 //**********************************************************************************************************************
551 int ChimeraPintailCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, vector<long>& MPIPos){
556 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
558 for(int i=0;i<num;i++){
560 if (m->control_pressed) { return 1; }
563 int length = MPIPos[start+i+1] - MPIPos[start+i];
565 char* buf4 = new char[length];
566 MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
568 string tempBuf = buf4;
569 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
570 istringstream iss (tempBuf,istringstream::in);
573 Sequence* candidateSeq = new Sequence(iss); gobble(iss);
575 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
577 if (candidateSeq->getAligned().length() != templateSeqsLength) { //chimeracheck does not require seqs to be aligned
578 m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); m->mothurOutEndLine();
581 chimera->getChimeras(candidateSeq);
583 if (m->control_pressed) { delete candidateSeq; return 1; }
586 bool isChimeric = chimera->print(outMPI, outAccMPI);
587 if (isChimeric) { MPIWroteAccnos = true; }
593 if((i+1) % 100 == 0){ cout << "Processing sequence: " << (i+1) << endl; m->mothurOutJustToLog("Processing sequence: " + toString(i+1) + "\n"); }
596 if(num % 100 != 0){ cout << "Processing sequence: " << num << endl; m->mothurOutJustToLog("Processing sequence: " + toString(num) + "\n"); }
601 catch(exception& e) {
602 m->errorOut(e, "ChimeraPintailCommand", "driverMPI");
608 /**************************************************************************************************/
610 int ChimeraPintailCommand::createProcesses(string outputFileName, string filename, string accnos) {
612 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
614 // processIDS.resize(0);
616 //loop through and create all the processes you want
617 while (process != processors) {
621 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
624 driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename, accnos + toString(getpid()) + ".temp");
626 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
629 //force parent to wait until all the processes are done
630 for (int i=0;i<processors;i++) {
631 int temp = processIDS[i];
638 catch(exception& e) {
639 m->errorOut(e, "ChimeraPintailCommand", "createProcesses");
644 /**************************************************************************************************/