2 * chimeraseqscommand.cpp
5 * Created by Sarah Westcott on 6/29/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "chimeraseqscommand.h"
11 #include "bellerophon.h"
14 #include "chimeracheckrdp.h"
15 #include "chimeraslayer.h"
18 //***************************************************************************************************************
20 ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
24 //allow user to run help
25 if(option == "help") { help(); abort = true; }
28 //valid paramters for this command
29 string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask",
30 "numwanted", "ksize", "svg", "name", "match","mismatch", "divergence", "minsim","mincov","minbs", "minsnp","parents", "iters","outputdir","inputdir", "search","realign" };
31 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
33 OptionParser parser(option);
34 map<string,string> parameters = parser.getParameters();
36 ValidParameters validParameter;
37 map<string,string>::iterator it;
39 //check to make sure all parameters are valid for command
40 for (it = parameters.begin(); it != parameters.end(); it++) {
41 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
44 //if the user changes the input directory command factory will send this info to us in the output parameter
45 string inputDir = validParameter.validFile(parameters, "inputdir", false);
46 if (inputDir == "not found"){ inputDir = ""; }
49 it = parameters.find("fasta");
50 //user has given a template file
51 if(it != parameters.end()){
52 path = hasPath(it->second);
53 //if the user has not given a path then, add inputdir. else leave path alone.
54 if (path == "") { parameters["fasta"] = inputDir + it->second; }
57 it = parameters.find("template");
58 //user has given a template file
59 if(it != parameters.end()){
60 path = hasPath(it->second);
61 //if the user has not given a path then, add inputdir. else leave path alone.
62 if (path == "") { parameters["template"] = inputDir + it->second; }
65 it = parameters.find("conservation");
66 //user has given a template file
67 if(it != parameters.end()){
68 path = hasPath(it->second);
69 //if the user has not given a path then, add inputdir. else leave path alone.
70 if (path == "") { parameters["conservation"] = inputDir + it->second; }
73 it = parameters.find("quantile");
74 //user has given a template file
75 if(it != parameters.end()){
76 path = hasPath(it->second);
77 //if the user has not given a path then, add inputdir. else leave path alone.
78 if (path == "") { parameters["quantile"] = inputDir + it->second; }
81 it = parameters.find("name");
82 //user has given a template file
83 if(it != parameters.end()){
84 path = hasPath(it->second);
85 //if the user has not given a path then, add inputdir. else leave path alone.
86 if (path == "") { parameters["name"] = inputDir + it->second; }
91 //check for required parameters
92 fastafile = validParameter.validFile(parameters, "fasta", true);
93 if (fastafile == "not open") { abort = true; }
94 else if (fastafile == "not found") { fastafile = ""; mothurOut("fasta is a required parameter for the chimera.seqs command."); mothurOutEndLine(); abort = true; }
96 //if the user changes the output directory command factory will send this info to us in the output parameter
97 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
99 outputDir += hasPath(fastafile); //if user entered a file with a path then preserve it
102 templatefile = validParameter.validFile(parameters, "template", true);
103 if (templatefile == "not open") { abort = true; }
104 else if (templatefile == "not found") { templatefile = ""; }
106 consfile = validParameter.validFile(parameters, "conservation", true);
107 if (consfile == "not open") { abort = true; }
108 else if (consfile == "not found") { consfile = ""; }
110 quanfile = validParameter.validFile(parameters, "quantile", true);
111 if (quanfile == "not open") { abort = true; }
112 else if (quanfile == "not found") { quanfile = ""; }
114 namefile = validParameter.validFile(parameters, "name", true);
115 if (namefile == "not open") { abort = true; }
116 else if (namefile == "not found") { namefile = ""; }
118 maskfile = validParameter.validFile(parameters, "mask", false);
119 if (maskfile == "not found") { maskfile = ""; }
120 else if (maskfile != "default") {
121 if (inputDir != "") {
122 string path = hasPath(maskfile);
123 //if the user has not given a path then, add inputdir. else leave path alone.
124 if (path == "") { maskfile = inputDir + maskfile; }
128 int ableToOpen = openInputFile(maskfile, in);
129 if (ableToOpen == 1) { abort = true; }
133 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "pintail"; }
136 temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "F"; }
137 filter = isTrue(temp);
139 temp = validParameter.validFile(parameters, "correction", false); if (temp == "not found") { temp = "T"; }
140 correction = isTrue(temp);
142 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; }
143 convert(temp, processors);
145 temp = validParameter.validFile(parameters, "ksize", false); if (temp == "not found") { temp = "7"; }
146 convert(temp, ksize);
148 temp = validParameter.validFile(parameters, "svg", false); if (temp == "not found") { temp = "F"; }
151 temp = validParameter.validFile(parameters, "window", false);
152 if ((temp == "not found") && (method == "chimeraslayer")) { temp = "50"; }
153 else if (temp == "not found") { temp = "0"; }
154 convert(temp, window);
156 temp = validParameter.validFile(parameters, "match", false); if (temp == "not found") { temp = "5"; }
157 convert(temp, match);
159 temp = validParameter.validFile(parameters, "mismatch", false); if (temp == "not found") { temp = "-4"; }
160 convert(temp, mismatch);
162 temp = validParameter.validFile(parameters, "divergence", false); if (temp == "not found") { temp = "1.007"; }
165 temp = validParameter.validFile(parameters, "minsim", false); if (temp == "not found") { temp = "90"; }
166 convert(temp, minSimilarity);
168 temp = validParameter.validFile(parameters, "mincov", false); if (temp == "not found") { temp = "70"; }
169 convert(temp, minCoverage);
171 temp = validParameter.validFile(parameters, "minbs", false); if (temp == "not found") { temp = "90"; }
172 convert(temp, minBS);
174 temp = validParameter.validFile(parameters, "minsnp", false); if (temp == "not found") { temp = "10"; }
175 convert(temp, minSNP);
177 temp = validParameter.validFile(parameters, "parents", false); if (temp == "not found") { temp = "3"; }
178 convert(temp, parents);
180 temp = validParameter.validFile(parameters, "realign", false); if (temp == "not found") { temp = "f"; }
181 realign = isTrue(temp);
183 search = validParameter.validFile(parameters, "search", false); if (search == "not found") { search = "distance"; }
185 temp = validParameter.validFile(parameters, "iters", false);
186 if ((temp == "not found") && (method == "chimeraslayer")) { temp = "100"; }
187 else if (temp == "not found") { temp = "1000"; }
188 convert(temp, iters);
190 temp = validParameter.validFile(parameters, "increment", false);
191 if ((temp == "not found") && (method == "chimeracheck")) { temp = "10"; }
192 else if ((temp == "not found") && (method == "chimeraslayer")) { temp = "5"; }
193 else if (temp == "not found") { temp = "25"; }
194 convert(temp, increment);
196 temp = validParameter.validFile(parameters, "numwanted", false);
197 if ((temp == "not found") && (method == "chimeraslayer")) { temp = "15"; }
198 else if (temp == "not found") { temp = "20"; }
199 convert(temp, numwanted);
201 if ((search != "distance") && (search != "blast") && (search != "kmer")) { mothurOut(search + " is not a valid search."); mothurOutEndLine(); abort = true; }
203 if (((method != "bellerophon")) && (templatefile == "")) { mothurOut("You must provide a template file with the pintail, ccode, chimeraslayer or chimeracheck methods."); mothurOutEndLine(); abort = true; }
208 catch(exception& e) {
209 errorOut(e, "ChimeraSeqsCommand", "ChimeraSeqsCommand");
213 //**********************************************************************************************************************
215 void ChimeraSeqsCommand::help(){
218 //"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask", "numwanted", "ksize", "svg", "name"
219 //mothurOut("chimera.seqs ASSUMES that your sequences are ALIGNED and if using a template that the template file sequences are the same length as the fasta file sequences.\n\n");
220 mothurOut("The chimera.seqs command reads a fastafile and creates list of potentially chimeric sequences.\n");
221 mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors, mask, method, window, increment, template, conservation, quantile, numwanted, ksize, svg, name, iters, search, realign.\n");
222 mothurOut("The fasta parameter is always required and template is required if using pintail, ccode or chimeracheck.\n");
223 mothurOut("The filter parameter allows you to specify if you would like to apply a vertical and 50% soft filter. \n");
224 mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs.\n");
225 mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n");
226 mothurOut("The method parameter allows you to specify the method for finding chimeric sequences. The default is pintail. Options include bellerophon, ccode and chimeracheck \n");
227 mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the your sequences. \n");
228 mothurOut("The window parameter allows you to specify the window size for searching for chimeras. \n");
229 mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences.\n");
230 mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences. \n");
231 mothurOut("The conservation parameter allows you to enter a frequency file containing the highest bases frequency at each place in the alignment.\n");
232 mothurOut("The quantile parameter allows you to enter a file containing quantiles for a template files sequences.\n");
233 mothurOut("The numwanted parameter allows you to specify how many sequences you would each query sequence compared with.\n");
234 mothurOut("The ksize parameter allows you to input kmersize. \n");
235 mothurOut("The svg parameter allows you to specify whether or not you would like a svg file outputted for each query sequence.\n");
236 mothurOut("The name parameter allows you to enter a file containing names of sequences you would like .svg files for.\n");
237 mothurOut("The iters parameter allows you to specify the number of bootstrap iters to do with the chimeraslayer method.\n");
238 mothurOut("The minsim parameter allows you .... \n");
239 mothurOut("The mincov parameter allows you to specify minimum coverage by closest matches found in template. Default is 70, meaning 70%. \n");
240 mothurOut("The minbs parameter allows you to specify minimum bootstrap support for calling a sequence chimeric. Default is 90, meaning 90%. \n");
241 mothurOut("The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 10) \n");
242 mothurOut("The search parameter allows you to specify search method for finding the closest parent. Choices are distance, blast, and kmer, default distance. -used only by chimeraslayer. \n");
243 mothurOut("The realign parameter allows you to realign the query to the potential paretns. Choices are true or false, default false. -used only by chimeraslayer. \n");
244 mothurOut("NOT ALL PARAMETERS ARE USED BY ALL METHODS. Please look below for method specifics.\n\n");
245 mothurOut("Details for each method: \n");
246 mothurOut("\tpintail: \n");
247 mothurOut("\t\tparameters: fasta=required, template=required, filter=F, mask=no mask, processors=1, window=300, increment=25, conservation=not required, but will improve speed, quantile=not required, but will greatly improve speed. \n");
248 mothurOut("\t\tIf you have run chimera.seqs using pintail a .quan and .freq file will be created for your template, if you have not provided them for use in future command executions.\n");
249 mothurOut("\tbellerophon: \n");
250 mothurOut("\t\tparameters: fasta=required, filter=F, processors=1, window=1/4 length of seq, increment=25, correction=T. \n");
251 mothurOut("\tccode: \n");
252 mothurOut("\t\tparameters: fasta=required, template=required, filter=F, mask=no mask, processors=1, window=10% of length, numwanted=20\n");
253 mothurOut("\tchimeracheck: \n");
254 mothurOut("\t\tparameters: fasta=required, template=required, processors=1, increment=10, ksize=7, svg=F, name=none\n\n");
255 mothurOut("\tchimeraslayer: \n");
256 mothurOut("\t\tparameters: fasta=required, template=required, processors=1, increment=10, mask=no mask, numwanted=10, match=5, mismatch=-4, divergence=1.0, minsim=90, parents=5, iters=1000, window=100. \n\n");
257 mothurOut("The chimera.seqs command should be in the following format: \n");
258 mothurOut("chimera.seqs(fasta=yourFastaFile, filter=yourFilter, correction=yourCorrection, processors=yourProcessors, method=bellerophon) \n");
259 mothurOut("Example: chimera.seqs(fasta=AD.align, filter=True, correction=true, method=bellerophon, window=200) \n");
260 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
262 catch(exception& e) {
263 errorOut(e, "ChimeraSeqsCommand", "help");
268 //***************************************************************************************************************
270 ChimeraSeqsCommand::~ChimeraSeqsCommand(){ /* do nothing */ }
272 //***************************************************************************************************************
274 int ChimeraSeqsCommand::execute(){
277 if (abort == true) { return 0; }
279 int start = time(NULL);
281 if (method == "bellerophon") { chimera = new Bellerophon(fastafile, outputDir); }
282 else if (method == "pintail") { chimera = new Pintail(fastafile, outputDir); }
283 else if (method == "ccode") { chimera = new Ccode(fastafile, outputDir); }
284 else if (method == "chimeracheck") { chimera = new ChimeraCheckRDP(fastafile, outputDir); }
285 else if (method == "chimeraslayer") { chimera = new ChimeraSlayer(search, realign, fastafile); }
286 else { mothurOut("Not a valid method."); mothurOutEndLine(); return 0; }
289 if (maskfile == "default") { mothurOut("I am using the default 236627 EU009184.1 Shigella dysenteriae str. FBD013."); mothurOutEndLine(); }
291 chimera->setCons(consfile);
292 chimera->setQuantiles(quanfile);
293 chimera->setMask(maskfile);
294 chimera->setFilter(filter);
295 chimera->setCorrection(correction);
296 chimera->setProcessors(processors);
297 chimera->setWindow(window);
298 chimera->setIncrement(increment);
299 chimera->setNumWanted(numwanted);
300 chimera->setKmerSize(ksize);
301 chimera->setSVG(svg);
302 chimera->setName(namefile);
303 chimera->setMatch(match);
304 chimera->setMisMatch(mismatch);
305 chimera->setDivR(divR);
306 chimera->setParents(parents);
307 chimera->setMinSim(minSimilarity);
308 chimera->setMinCoverage(minCoverage);
309 chimera->setMinBS(minBS);
310 chimera->setMinSNP(minSNP);
311 chimera->setIters(iters);
312 chimera->setTemplateFile(templatefile);
316 vector<Sequence*> templateSeqs;
317 if ((method != "bellerophon") && (method != "chimeracheck")) {
318 templateSeqs = chimera->readSeqs(templatefile);
319 if (chimera->getUnaligned()) {
320 mothurOut("Your sequences need to be aligned when you use the chimeraslayer method."); mothurOutEndLine();
322 for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
327 chimera->setTemplateSeqs(templateSeqs);
329 }else if (method == "bellerophon") {//run bellerophon separately since you need to read entire fastafile to run it
330 chimera->getChimeras();
332 string outputFName = outputDir + getRootName(getSimpleName(fastafile)) + method + maskfile + ".chimeras";
334 openOutputFile(outputFName, out);
341 //some methods need to do prep work before processing the chimeras
345 string tempHeader = outputDir + getRootName(getSimpleName(fastafile)) + method + maskfile + ".chimeras.tempHeader";
346 openOutputFile(tempHeader, outHeader);
348 chimera->printHeader(outHeader);
351 string outputFileName = outputDir + getRootName(getSimpleName(fastafile)) + method + maskfile + ".chimeras";
354 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
357 openInputFile(fastafile, inFASTA);
358 numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
361 lines.push_back(new linePair(0, numSeqs));
363 driver(lines[0], outputFileName, fastafile);
366 vector<int> positions;
367 processIDS.resize(0);
370 openInputFile(fastafile, inFASTA);
373 while(!inFASTA.eof()){
374 input = getline(inFASTA);
375 if (input.length() != 0) {
376 if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); }
381 numSeqs = positions.size();
383 int numSeqsPerProcessor = numSeqs / processors;
385 for (int i = 0; i < processors; i++) {
386 long int startPos = positions[ i * numSeqsPerProcessor ];
387 if(i == processors - 1){
388 numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor;
390 lines.push_back(new linePair(startPos, numSeqsPerProcessor));
394 createProcesses(outputFileName, fastafile);
396 rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str());
398 //append alignment and report files
399 for(int i=1;i<processors;i++){
400 appendOutputFiles((outputFileName + toString(processIDS[i]) + ".temp"), outputFileName);
401 remove((outputFileName + toString(processIDS[i]) + ".temp").c_str());
407 openInputFile(candidateFileNames[s], inFASTA);
408 numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
410 lines.push_back(new linePair(0, numSeqs));
412 driver(lines[0], outputFileName, fastafile);
415 //mothurOut("Output File Names: ");
416 //if ((filter) && (method == "bellerophon")) { mothurOut(
417 //if (outputDir == "") { fastafile = getRootName(fastafile) + "filter.fasta"; }
418 // else { fastafile = outputDir + getRootName(getSimpleName(fastafile)) + "filter.fasta"; }
420 appendOutputFiles(tempHeader, outputFileName);
421 remove(outputFileName.c_str());
422 rename(tempHeader.c_str(), outputFileName.c_str());
424 for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
426 if (method == "chimeracheck") { mothurOutEndLine(); mothurOut("This method does not determine if a sequence is chimeric, but allows you to make that determination based on the IS values."); mothurOutEndLine(); }
428 mothurOutEndLine(); mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); mothurOutEndLine();
433 catch(exception& e) {
434 errorOut(e, "ChimeraSeqsCommand", "execute");
437 }//**********************************************************************************************************************
439 int ChimeraSeqsCommand::driver(linePair* line, string outputFName, string filename){
442 openOutputFile(outputFName, out);
445 openInputFile(filename, inFASTA);
447 inFASTA.seekg(line->start);
449 for(int i=0;i<line->numSeqs;i++){
451 Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA);
453 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
456 chimera->getChimeras(candidateSeq);
464 if((i+1) % 100 == 0){ mothurOut("Processing sequence: " + toString(i+1)); mothurOutEndLine(); }
467 if((line->numSeqs) % 100 != 0){ mothurOut("Processing sequence: " + toString(line->numSeqs)); mothurOutEndLine(); }
474 catch(exception& e) {
475 errorOut(e, "ChimeraSeqsCommand", "driver");
480 /**************************************************************************************************/
482 void ChimeraSeqsCommand::createProcesses(string outputFileName, string filename) {
484 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
486 // processIDS.resize(0);
488 //loop through and create all the processes you want
489 while (process != processors) {
493 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
496 driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename);
498 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
501 //force parent to wait until all the processes are done
502 for (int i=0;i<processors;i++) {
503 int temp = processIDS[i];
508 catch(exception& e) {
509 errorOut(e, "ChimeraSeqsCommand", "createProcesses");
514 /**************************************************************************************************/
516 void ChimeraSeqsCommand::appendOutputFiles(string temp, string filename) {
522 openOutputFileAppend(temp, output);
523 openInputFile(filename, input);
525 while(char c = input.get()){
526 if(input.eof()) { break; }
527 else { output << c; }
533 catch(exception& e) {
534 errorOut(e, "ChimeraSeqsCommand", "appendOuputFiles");
538 //**********************************************************************************************************************