2 * chimerapintailcommand.cpp
5 * Created by westcott on 4/1/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "chimerapintailcommand.h"
13 //***************************************************************************************************************
15 ChimeraPintailCommand::ChimeraPintailCommand(string option) {
19 //allow user to run help
20 if(option == "help") { help(); abort = true; }
23 //valid paramters for this command
24 string Array[] = {"fasta","filter","processors","window" ,"increment","template","conservation","quantile","mask","outputdir","inputdir"};
25 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
27 OptionParser parser(option);
28 map<string,string> parameters = parser.getParameters();
30 ValidParameters validParameter("chimera.pintail");
31 map<string,string>::iterator it;
33 //check to make sure all parameters are valid for command
34 for (it = parameters.begin(); it != parameters.end(); it++) {
35 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
38 //if the user changes the input directory command factory will send this info to us in the output parameter
39 inputDir = validParameter.validFile(parameters, "inputdir", false);
40 if (inputDir == "not found"){ inputDir = ""; }
43 it = parameters.find("template");
44 //user has given a template file
45 if(it != parameters.end()){
46 path = hasPath(it->second);
47 //if the user has not given a path then, add inputdir. else leave path alone.
48 if (path == "") { parameters["template"] = inputDir + it->second; }
51 it = parameters.find("conservation");
52 //user has given a template file
53 if(it != parameters.end()){
54 path = hasPath(it->second);
55 //if the user has not given a path then, add inputdir. else leave path alone.
56 if (path == "") { parameters["conservation"] = inputDir + it->second; }
59 it = parameters.find("quantile");
60 //user has given a template file
61 if(it != parameters.end()){
62 path = hasPath(it->second);
63 //if the user has not given a path then, add inputdir. else leave path alone.
64 if (path == "") { parameters["quantile"] = inputDir + it->second; }
69 //check for required parameters
70 fastafile = validParameter.validFile(parameters, "fasta", false);
71 if (fastafile == "not found") { fastafile = ""; m->mothurOut("fasta is a required parameter for the chimera.pintail command."); m->mothurOutEndLine(); abort = true; }
73 splitAtDash(fastafile, fastaFileNames);
75 //go through files and make sure they are good, if not, then disregard them
76 for (int i = 0; i < fastaFileNames.size(); i++) {
78 string path = hasPath(fastaFileNames[i]);
79 //if the user has not given a path then, add inputdir. else leave path alone.
80 if (path == "") { fastaFileNames[i] = inputDir + fastaFileNames[i]; }
86 ableToOpen = openInputFile(fastaFileNames[i], in, "noerror");
88 //if you can't open it, try default location
89 if (ableToOpen == 1) {
90 if (m->getDefaultPath() != "") { //default path is set
91 string tryPath = m->getDefaultPath() + getSimpleName(fastaFileNames[i]);
92 m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
93 ableToOpen = openInputFile(tryPath, in, "noerror");
94 fastaFileNames[i] = tryPath;
99 if (ableToOpen == 1) {
100 m->mothurOut("Unable to open " + fastaFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine();
101 //erase from file list
102 fastaFileNames.erase(fastaFileNames.begin()+i);
107 //make sure there is at least one valid file left
108 if (fastaFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; }
112 temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "F"; }
113 filter = isTrue(temp);
115 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; }
116 convert(temp, processors);
118 temp = validParameter.validFile(parameters, "window", false); if (temp == "not found") { temp = "0"; }
119 convert(temp, window);
121 temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "25"; }
122 convert(temp, increment);
124 maskfile = validParameter.validFile(parameters, "mask", false);
125 if (maskfile == "not found") { maskfile = ""; }
126 else if (maskfile != "default") {
127 if (inputDir != "") {
128 string path = hasPath(maskfile);
129 //if the user has not given a path then, add inputdir. else leave path alone.
130 if (path == "") { maskfile = inputDir + maskfile; }
134 int ableToOpen = openInputFile(maskfile, in);
135 if (ableToOpen == 1) { abort = true; }
140 //if the user changes the output directory command factory will send this info to us in the output parameter
141 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
143 outputDir += hasPath(fastafile); //if user entered a file with a path then preserve it
146 templatefile = validParameter.validFile(parameters, "template", true);
147 if (templatefile == "not open") { abort = true; }
148 else if (templatefile == "not found") { templatefile = ""; m->mothurOut("template is a required parameter for the chimera.pintail command."); m->mothurOutEndLine(); abort = true; }
150 consfile = validParameter.validFile(parameters, "conservation", true);
151 if (consfile == "not open") { abort = true; }
152 else if (consfile == "not found") {
155 string tempConsFile = getRootName(inputDir + getSimpleName(templatefile)) + "freq";
156 ifstream FileTest(tempConsFile.c_str());
158 bool GoodFile = checkReleaseVersion(FileTest, m->getVersion());
160 m->mothurOut("I found " + tempConsFile + " in your input file directory. I will use it to save time."); m->mothurOutEndLine(); consfile = tempConsFile; FileTest.close();
165 quanfile = validParameter.validFile(parameters, "quantile", true);
166 if (quanfile == "not open") { abort = true; }
167 else if (quanfile == "not found") { quanfile = ""; }
170 catch(exception& e) {
171 m->errorOut(e, "ChimeraPintailCommand", "ChimeraPintailCommand");
175 //**********************************************************************************************************************
177 void ChimeraPintailCommand::help(){
180 m->mothurOut("The chimera.pintail command reads a fastafile and templatefile and outputs potentially chimeric sequences.\n");
181 m->mothurOut("This command was created using the algorythms described in the 'At Least 1 in 20 16S rRNA Sequence Records Currently Held in the Public Repositories is Estimated To Contain Substantial Anomalies' paper by Kevin E. Ashelford 1, Nadia A. Chuzhanova 3, John C. Fry 1, Antonia J. Jones 2 and Andrew J. Weightman 1.\n");
182 m->mothurOut("The chimera.pintail command parameters are fasta, template, filter, mask, processors, window, increment, conservation and quantile.\n");
183 m->mothurOut("The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required. \n");
184 m->mothurOut("You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n");
185 m->mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences, and is required. \n");
186 m->mothurOut("The filter parameter allows you to specify if you would like to apply a vertical and 50% soft filter. \n");
187 m->mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the your sequences, by default no mask is applied. You can apply an ecoli mask by typing, mask=default. \n");
188 m->mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n");
190 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
192 m->mothurOut("The window parameter allows you to specify the window size for searching for chimeras, default=300. \n");
193 m->mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences, default=25.\n");
194 m->mothurOut("The conservation parameter allows you to enter a frequency file containing the highest bases frequency at each place in the alignment.\n");
195 m->mothurOut("The quantile parameter allows you to enter a file containing quantiles for a template files sequences, if you use the filter the quantile file generated becomes unique to the fasta file you used.\n");
196 m->mothurOut("The chimera.pintail command should be in the following format: \n");
197 m->mothurOut("chimera.pintail(fasta=yourFastaFile, template=yourTemplate) \n");
198 m->mothurOut("Example: chimera.pintail(fasta=AD.align, template=silva.bacteria.fasta) \n");
199 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
201 catch(exception& e) {
202 m->errorOut(e, "ChimeraPintailCommand", "help");
207 //***************************************************************************************************************
209 ChimeraPintailCommand::~ChimeraPintailCommand(){ /* do nothing */ }
211 //***************************************************************************************************************
213 int ChimeraPintailCommand::execute(){
216 if (abort == true) { return 0; }
218 for (int s = 0; s < fastaFileNames.size(); s++) {
220 m->mothurOut("Checking sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine();
222 int start = time(NULL);
225 if (maskfile == "default") { m->mothurOut("I am using the default 236627 EU009184.1 Shigella dysenteriae str. FBD013."); m->mothurOutEndLine(); }
227 //check for quantile to save the time
228 string tempQuan = "";
229 if ((!filter) && (maskfile == "")) {
230 tempQuan = inputDir + getRootName(getSimpleName(templatefile)) + "pintail.quan";
231 }else if ((!filter) && (maskfile != "")) {
232 tempQuan = inputDir + getRootName(getSimpleName(templatefile)) + "pintail.masked.quan";
233 }else if ((filter) && (maskfile != "")) {
234 tempQuan = inputDir + getRootName(getSimpleName(templatefile)) + "pintail.filtered." + getSimpleName(getRootName(fastaFileNames[s])) + "masked.quan";
235 }else if ((filter) && (maskfile == "")) {
236 tempQuan = inputDir + getRootName(getSimpleName(templatefile)) + "pintail.filtered." + getSimpleName(getRootName(fastaFileNames[s])) + "quan";
239 ifstream FileTest(tempQuan.c_str());
241 bool GoodFile = checkReleaseVersion(FileTest, m->getVersion());
243 m->mothurOut("I found " + tempQuan + " in your input file directory. I will use it to save time."); m->mothurOutEndLine(); quanfile = tempQuan; FileTest.close();
247 chimera = new Pintail(fastaFileNames[s], templatefile, filter, processors, maskfile, consfile, quanfile, window, increment, outputDir);
249 string outputFileName, accnosFileName;
250 if (maskfile != "") {
251 outputFileName = outputDir + getRootName(getSimpleName(fastaFileNames[s])) + maskfile + ".pintail.chimeras";
252 accnosFileName = outputDir + getRootName(getSimpleName(fastaFileNames[s])) + maskfile + ".pintail.accnos";
254 outputFileName = outputDir + getRootName(getSimpleName(fastaFileNames[s])) + "pintail.chimeras";
255 accnosFileName = outputDir + getRootName(getSimpleName(fastaFileNames[s])) + "pintail.accnos";
258 if (m->control_pressed) { delete chimera; for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } return 0; }
260 if (chimera->getUnaligned()) {
261 m->mothurOut("Your template sequences are different lengths, please correct."); m->mothurOutEndLine();
265 templateSeqsLength = chimera->getLength();
268 int pid, end, numSeqsPerProcessor;
270 vector<unsigned long int> MPIPos;
273 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
274 MPI_Comm_size(MPI_COMM_WORLD, &processors);
278 MPI_File outMPIAccnos;
280 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
281 int inMode=MPI_MODE_RDONLY;
283 char outFilename[1024];
284 strcpy(outFilename, outputFileName.c_str());
286 char outAccnosFilename[1024];
287 strcpy(outAccnosFilename, accnosFileName.c_str());
289 char inFileName[1024];
290 strcpy(inFileName, fastaFileNames[s].c_str());
292 MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
293 MPI_File_open(MPI_COMM_WORLD, outFilename, outMode, MPI_INFO_NULL, &outMPI);
294 MPI_File_open(MPI_COMM_WORLD, outAccnosFilename, outMode, MPI_INFO_NULL, &outMPIAccnos);
296 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } delete chimera; return 0; }
298 if (pid == 0) { //you are the root process
300 MPIPos = setFilePosFasta(fastaFileNames[s], numSeqs); //fills MPIPos, returns numSeqs
302 //send file positions to all processes
303 for(int i = 1; i < processors; i++) {
304 MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
305 MPI_Send(&MPIPos[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
308 //figure out how many sequences you have to align
309 numSeqsPerProcessor = numSeqs / processors;
310 int startIndex = pid * numSeqsPerProcessor;
311 if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; }
314 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, MPIPos);
316 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); remove(outputFileName.c_str()); remove(accnosFileName.c_str()); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } delete chimera; return 0; }
318 }else{ //you are a child process
319 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
320 MPIPos.resize(numSeqs+1);
321 MPI_Recv(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
323 //figure out how many sequences you have to align
324 numSeqsPerProcessor = numSeqs / processors;
325 int startIndex = pid * numSeqsPerProcessor;
326 if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; }
329 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, MPIPos);
331 if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPI); MPI_File_close(&outMPIAccnos); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } delete chimera; return 0; }
335 MPI_File_close(&inMPI);
336 MPI_File_close(&outMPI);
337 MPI_File_close(&outMPIAccnos);
338 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
340 vector<unsigned long int> positions = divideFile(fastaFileNames[s], processors);
342 for (int i = 0; i < (positions.size()-1); i++) {
343 lines.push_back(new linePair(positions[i], positions[(i+1)]));
347 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
350 numSeqs = driver(lines[0], outputFileName, fastaFileNames[s], accnosFileName);
352 if (m->control_pressed) { remove(outputFileName.c_str()); remove(accnosFileName.c_str()); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); delete chimera; return 0; }
355 processIDS.resize(0);
357 numSeqs = createProcesses(outputFileName, fastaFileNames[s], accnosFileName);
359 rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str());
360 rename((accnosFileName + toString(processIDS[0]) + ".temp").c_str(), accnosFileName.c_str());
362 //append output files
363 for(int i=1;i<processors;i++){
364 appendFiles((outputFileName + toString(processIDS[i]) + ".temp"), outputFileName);
365 remove((outputFileName + toString(processIDS[i]) + ".temp").c_str());
368 //append output files
369 for(int i=1;i<processors;i++){
370 appendFiles((accnosFileName + toString(processIDS[i]) + ".temp"), accnosFileName);
371 remove((accnosFileName + toString(processIDS[i]) + ".temp").c_str());
374 if (m->control_pressed) {
375 remove(outputFileName.c_str());
376 remove(accnosFileName.c_str());
377 for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); }
378 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
385 numSeqs = driver(lines[0], outputFileName, fastaFileNames[s], accnosFileName);
387 if (m->control_pressed) { remove(outputFileName.c_str()); remove(accnosFileName.c_str()); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); delete chimera; return 0; }
393 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
395 outputNames.push_back(outputFileName);
396 outputNames.push_back(accnosFileName);
398 m->mothurOutEndLine();
399 m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); m->mothurOutEndLine();
402 m->mothurOutEndLine();
403 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
404 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
405 m->mothurOutEndLine();
410 catch(exception& e) {
411 m->errorOut(e, "ChimeraPintailCommand", "execute");
415 //**********************************************************************************************************************
417 int ChimeraPintailCommand::driver(linePair* filePos, string outputFName, string filename, string accnos){
420 openOutputFile(outputFName, out);
423 openOutputFile(accnos, out2);
426 openInputFile(filename, inFASTA);
428 inFASTA.seekg(filePos->start);
435 if (m->control_pressed) { return 1; }
437 Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA);
439 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
441 if (candidateSeq->getAligned().length() != templateSeqsLength) { //chimeracheck does not require seqs to be aligned
442 m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); m->mothurOutEndLine();
445 chimera->getChimeras(candidateSeq);
447 if (m->control_pressed) { delete candidateSeq; return 1; }
450 chimera->print(out, out2);
456 unsigned long int pos = inFASTA.tellg();
457 if ((pos == -1) || (pos >= filePos->end)) { break; }
460 if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
463 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
471 catch(exception& e) {
472 m->errorOut(e, "ChimeraPintailCommand", "driver");
476 //**********************************************************************************************************************
478 int ChimeraPintailCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, vector<unsigned long int>& MPIPos){
483 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
485 for(int i=0;i<num;i++){
487 if (m->control_pressed) { return 1; }
490 int length = MPIPos[start+i+1] - MPIPos[start+i];
492 char* buf4 = new char[length];
493 MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
495 string tempBuf = buf4;
496 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
497 istringstream iss (tempBuf,istringstream::in);
500 Sequence* candidateSeq = new Sequence(iss); gobble(iss);
502 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
504 if (candidateSeq->getAligned().length() != templateSeqsLength) { //chimeracheck does not require seqs to be aligned
505 m->mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); m->mothurOutEndLine();
508 chimera->getChimeras(candidateSeq);
510 if (m->control_pressed) { delete candidateSeq; return 1; }
513 bool isChimeric = chimera->print(outMPI, outAccMPI);
519 if((i+1) % 100 == 0){ cout << "Processing sequence: " << (i+1) << endl; m->mothurOutJustToLog("Processing sequence: " + toString(i+1) + "\n"); }
522 if(num % 100 != 0){ cout << "Processing sequence: " << num << endl; m->mothurOutJustToLog("Processing sequence: " + toString(num) + "\n"); }
527 catch(exception& e) {
528 m->errorOut(e, "ChimeraPintailCommand", "driverMPI");
534 /**************************************************************************************************/
536 int ChimeraPintailCommand::createProcesses(string outputFileName, string filename, string accnos) {
538 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
542 //loop through and create all the processes you want
543 while (process != processors) {
547 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
550 num = driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename, accnos + toString(getpid()) + ".temp");
552 //pass numSeqs to parent
554 string tempFile = outputFileName + toString(getpid()) + ".num.temp";
555 openOutputFile(tempFile, out);
560 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
563 //force parent to wait until all the processes are done
564 for (int i=0;i<processors;i++) {
565 int temp = processIDS[i];
569 for (int i = 0; i < processIDS.size(); i++) {
571 string tempFile = outputFileName + toString(processIDS[i]) + ".num.temp";
572 openInputFile(tempFile, in);
573 if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
574 in.close(); remove(tempFile.c_str());
580 catch(exception& e) {
581 m->errorOut(e, "ChimeraPintailCommand", "createProcesses");
586 /**************************************************************************************************/