2 * chimeraseqscommand.cpp
5 * Created by Sarah Westcott on 6/29/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "chimeraseqscommand.h"
11 #include "bellerophon.h"
14 #include "chimeracheckrdp.h"
15 #include "chimeraslayer.h"
18 //***************************************************************************************************************
// Constructs the chimera.seqs command from the raw option string.
//
// option is either the literal "help" (prints usage and sets abort) or the
// parameter text handed to OptionParser. Parameter names are checked against
// the list accepted by this command, file parameters given without a path are
// prefixed with inputdir, and the remaining settings are read with defaults
// that in several cases depend on the chosen method. Every validation failure
// sets abort; execute() returns immediately when abort is true.
// NOTE(review): a catch handler appears at the end of this function but no
// matching try opener is visible here - confirm against the complete file.
20 ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
24 //allow user to run help
25 if(option == "help") { help(); abort = true; }
28 //valid parameters for this command
29 string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask",
30 "numwanted", "ksize", "svg", "name", "match","mismatch", "divergence", "minsim","mincov","minbs", "minsnp","parents", "iters","outputdir","inputdir" };
//copy the array into a vector; sizeof(Array)/sizeof(string) is the number of names
31 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
33 OptionParser parser(option);
34 map<string,string> parameters = parser.getParameters();
36 ValidParameters validParameter;
37 map<string,string>::iterator it;
39 //check to make sure all parameters are valid for command
40 for (it = parameters.begin(); it != parameters.end(); it++) {
//abort as soon as isValidParameter rejects any supplied parameter
41 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
44 //if the user changes the input directory, it is read here from the inputdir parameter
45 string inputDir = validParameter.validFile(parameters, "inputdir", false);
46 if (inputDir == "not found"){ inputDir = ""; }
//for each file parameter below: when hasPath() reports no path, prepend inputDir to the value
49 it = parameters.find("fasta");
50 //user has given a fasta file
51 if(it != parameters.end()){
52 path = hasPath(it->second);
53 //if the user has not given a path then, add inputdir. else leave path alone.
54 if (path == "") { parameters["fasta"] = inputDir + it->second; }
57 it = parameters.find("template");
58 //user has given a template file
59 if(it != parameters.end()){
60 path = hasPath(it->second);
61 //if the user has not given a path then, add inputdir. else leave path alone.
62 if (path == "") { parameters["template"] = inputDir + it->second; }
65 it = parameters.find("conservation");
66 //user has given a conservation file
67 if(it != parameters.end()){
68 path = hasPath(it->second);
69 //if the user has not given a path then, add inputdir. else leave path alone.
70 if (path == "") { parameters["conservation"] = inputDir + it->second; }
73 it = parameters.find("quantile");
74 //user has given a quantile file
75 if(it != parameters.end()){
76 path = hasPath(it->second);
77 //if the user has not given a path then, add inputdir. else leave path alone.
78 if (path == "") { parameters["quantile"] = inputDir + it->second; }
81 it = parameters.find("name");
82 //user has given a name file
83 if(it != parameters.end()){
84 path = hasPath(it->second);
85 //if the user has not given a path then, add inputdir. else leave path alone.
86 if (path == "") { parameters["name"] = inputDir + it->second; }
91 //check for required parameters
//validFile() reports problems through the sentinel strings "not open" and "not found"
92 fastafile = validParameter.validFile(parameters, "fasta", true);
93 if (fastafile == "not open") { abort = true; }
94 else if (fastafile == "not found") { fastafile = ""; mothurOut("fasta is a required parameter for the chimera.seqs command."); mothurOutEndLine(); abort = true; }
96 //if the user changes the output directory, it is read here from the outputdir parameter
97 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
99 outputDir += hasPath(fastafile); //if user entered a file with a path then preserve it
//optional input files: a file that cannot be opened aborts, a missing one becomes ""
102 templatefile = validParameter.validFile(parameters, "template", true);
103 if (templatefile == "not open") { abort = true; }
104 else if (templatefile == "not found") { templatefile = ""; }
106 consfile = validParameter.validFile(parameters, "conservation", true);
107 if (consfile == "not open") { abort = true; }
108 else if (consfile == "not found") { consfile = ""; }
110 quanfile = validParameter.validFile(parameters, "quantile", true);
111 if (quanfile == "not open") { abort = true; }
112 else if (quanfile == "not found") { quanfile = ""; }
114 namefile = validParameter.validFile(parameters, "name", true);
115 if (namefile == "not open") { abort = true; }
116 else if (namefile == "not found") { namefile = ""; }
//mask is read as a plain value because the literal "default" is accepted in place of a file name
118 maskfile = validParameter.validFile(parameters, "mask", false);
119 if (maskfile == "not found") { maskfile = ""; }
120 else if (maskfile != "default") {
121 if (inputDir != "") {
122 string path = hasPath(maskfile);
123 //if the user has not given a path then, add inputdir. else leave path alone.
124 if (path == "") { maskfile = inputDir + maskfile; }
//a user-supplied mask file must be openable; openInputFile() returning 1 aborts the command
128 int ableToOpen = openInputFile(maskfile, in);
129 if (ableToOpen == 1) { abort = true; }
//method defaults to pintail; several defaults further down depend on it
133 method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "pintail"; }
136 temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "F"; }
137 filter = isTrue(temp);
139 temp = validParameter.validFile(parameters, "correction", false); if (temp == "not found") { temp = "T"; }
140 correction = isTrue(temp);
142 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; }
143 convert(temp, processors);
145 temp = validParameter.validFile(parameters, "ksize", false); if (temp == "not found") { temp = "7"; }
146 convert(temp, ksize);
//NOTE(review): no statement storing this value into svg is visible before temp is reused,
//yet execute() passes svg to setSVG() - confirm svg is actually assigned
148 temp = validParameter.validFile(parameters, "svg", false); if (temp == "not found") { temp = "F"; }
//window default: 50 for chimeraslayer, otherwise 0
151 temp = validParameter.validFile(parameters, "window", false);
152 if ((temp == "not found") && (method == "chimeraslayer")) { temp = "50"; }
153 else if (temp == "not found") { temp = "0"; }
154 convert(temp, window);
156 temp = validParameter.validFile(parameters, "match", false); if (temp == "not found") { temp = "5"; }
157 convert(temp, match);
159 temp = validParameter.validFile(parameters, "mismatch", false); if (temp == "not found") { temp = "-4"; }
160 convert(temp, mismatch);
//NOTE(review): no conversion of this value into divR is visible before temp is reused,
//yet execute() passes divR to setDivR() - confirm divR is actually assigned
162 temp = validParameter.validFile(parameters, "divergence", false); if (temp == "not found") { temp = "1.007"; }
165 temp = validParameter.validFile(parameters, "minsim", false); if (temp == "not found") { temp = "90"; }
166 convert(temp, minSimilarity);
168 temp = validParameter.validFile(parameters, "mincov", false); if (temp == "not found") { temp = "70"; }
169 convert(temp, minCoverage);
171 temp = validParameter.validFile(parameters, "minbs", false); if (temp == "not found") { temp = "90"; }
172 convert(temp, minBS);
174 temp = validParameter.validFile(parameters, "minsnp", false); if (temp == "not found") { temp = "10"; }
175 convert(temp, minSNP);
177 temp = validParameter.validFile(parameters, "parents", false); if (temp == "not found") { temp = "3"; }
178 convert(temp, parents);
//iters default: 100 for chimeraslayer, otherwise 1000
180 temp = validParameter.validFile(parameters, "iters", false);
181 if ((temp == "not found") && (method == "chimeraslayer")) { temp = "100"; }
182 else if (temp == "not found") { temp = "1000"; }
183 convert(temp, iters);
//increment default: 10 for chimeracheck, 5 for chimeraslayer, otherwise 25
185 temp = validParameter.validFile(parameters, "increment", false);
186 if ((temp == "not found") && (method == "chimeracheck")) { temp = "10"; }
187 else if ((temp == "not found") && (method == "chimeraslayer")) { temp = "5"; }
188 else if (temp == "not found") { temp = "25"; }
189 convert(temp, increment);
//numwanted default: 15 for chimeraslayer, otherwise 20
191 temp = validParameter.validFile(parameters, "numwanted", false);
192 if ((temp == "not found") && (method == "chimeraslayer")) { temp = "15"; }
193 else if (temp == "not found") { temp = "20"; }
194 convert(temp, numwanted);
//every method except bellerophon needs a template file
198 if (((method != "bellerophon")) && (templatefile == "")) { mothurOut("You must provide a template file with the pintail, ccode, chimeraslayer or chimeracheck methods."); mothurOutEndLine(); abort = true; }
//report any exception through errorOut, tagged with the class and function name
203 catch(exception& e) {
204 errorOut(e, "ChimeraSeqsCommand", "ChimeraSeqsCommand");
208 //**********************************************************************************************************************
// Prints the usage text for chimera.seqs through mothurOut: the parameter
// list, a one-line description of each parameter, per-method defaults, and
// an example invocation.
// NOTE(review): several statements in this text disagree with the defaults
// the constructor applies and should be reconciled:
//   - chimeraslayer line says increment=10, numwanted=10, divergence=1.0,
//     parents=5, iters=1000, window=100; the constructor uses increment=5,
//     numwanted=15, divergence=1.007, parents=3, iters=100, window=50.
//   - the "template is required" sentence and the method options list omit
//     chimeraslayer, while the constructor requires a template for every
//     method except bellerophon.
//   - the parameter list omits match, mismatch, divergence, minsim, mincov,
//     minbs, minsnp and parents, and the minsim description is a placeholder.
210 void ChimeraSeqsCommand::help(){
213 //"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask", "numwanted", "ksize", "svg", "name"
214 //mothurOut("chimera.seqs ASSUMES that your sequences are ALIGNED and if using a template that the template file sequences are the same length as the fasta file sequences.\n\n");
//general description and the list of accepted parameters
215 mothurOut("The chimera.seqs command reads a fastafile and creates list of potentially chimeric sequences.\n");
216 mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors, mask, method, window, increment, template, conservation, quantile, numwanted, ksize, svg, name, iters.\n");
217 mothurOut("The fasta parameter is always required and template is required if using pintail, ccode or chimeracheck.\n");
//one line per parameter
218 mothurOut("The filter parameter allows you to specify if you would like to apply a vertical and 50% soft filter. \n");
219 mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs.\n");
220 mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n");
221 mothurOut("The method parameter allows you to specify the method for finding chimeric sequences. The default is pintail. Options include bellerophon, ccode and chimeracheck \n");
222 mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the your sequences. \n");
223 mothurOut("The window parameter allows you to specify the window size for searching for chimeras. \n");
224 mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences.\n");
225 mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences. \n");
226 mothurOut("The conservation parameter allows you to enter a frequency file containing the highest bases frequency at each place in the alignment.\n");
227 mothurOut("The quantile parameter allows you to enter a file containing quantiles for a template files sequences.\n");
228 mothurOut("The numwanted parameter allows you to specify how many sequences you would each query sequence compared with.\n");
229 mothurOut("The ksize parameter allows you to input kmersize. \n");
230 mothurOut("The svg parameter allows you to specify whether or not you would like a svg file outputted for each query sequence.\n");
231 mothurOut("The name parameter allows you to enter a file containing names of sequences you would like .svg files for.\n");
232 mothurOut("The iters parameter allows you to specify the number of bootstrap iters to do with the chimeraslayer method.\n");
233 mothurOut("The minsim parameter allows you .... \n");
234 mothurOut("The mincov parameter allows you to specify minimum coverage by closest matches found in template. Default is 70, meaning 70%. \n");
235 mothurOut("The minbs parameter allows you to specify minimum bootstrap support for calling a sequence chimeric. Default is 90, meaning 90%. \n");
236 mothurOut("The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 10) \n");
237 mothurOut("NOT ALL PARAMETERS ARE USED BY ALL METHODS. Please look below for method specifics.\n\n");
//per-method parameter summaries
238 mothurOut("Details for each method: \n");
239 mothurOut("\tpintail: \n");
240 mothurOut("\t\tparameters: fasta=required, template=required, filter=F, mask=no mask, processors=1, window=300, increment=25, conservation=not required, but will improve speed, quantile=not required, but will greatly improve speed. \n");
241 mothurOut("\t\tIf you have run chimera.seqs using pintail a .quan and .freq file will be created for your template, if you have not provided them for use in future command executions.\n");
242 mothurOut("\tbellerophon: \n");
243 mothurOut("\t\tparameters: fasta=required, filter=F, processors=1, window=1/4 length of seq, increment=25, correction=T. \n");
244 mothurOut("\tccode: \n");
245 mothurOut("\t\tparameters: fasta=required, template=required, filter=F, mask=no mask, processors=1, window=10% of length, numwanted=20\n");
246 mothurOut("\tchimeracheck: \n");
247 mothurOut("\t\tparameters: fasta=required, template=required, processors=1, increment=10, ksize=7, svg=F, name=none\n\n");
248 mothurOut("\tchimeraslayer: \n");
249 mothurOut("\t\tparameters: fasta=required, template=required, processors=1, increment=10, mask=no mask, numwanted=10, match=5, mismatch=-4, divergence=1.0, minsim=90, parents=5, iters=1000, window=100. \n\n");
//command format and example
250 mothurOut("The chimera.seqs command should be in the following format: \n");
251 mothurOut("chimera.seqs(fasta=yourFastaFile, filter=yourFilter, correction=yourCorrection, processors=yourProcessors, method=bellerophon) \n");
252 mothurOut("Example: chimera.seqs(fasta=AD.align, filter=True, correction=true, method=bellerophon, window=200) \n");
253 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
//report any exception through errorOut, tagged with the class and function name
255 catch(exception& e) {
256 errorOut(e, "ChimeraSeqsCommand", "help");
261 //***************************************************************************************************************
263 ChimeraSeqsCommand::~ChimeraSeqsCommand(){ /* do nothing */ }
265 //***************************************************************************************************************
// Runs the selected chimera-detection method over fastafile.
//
// Returns 0 without doing anything when abort is set, and 0 after printing
// "Not a valid method." when method is not one of bellerophon, pintail,
// ccode, chimeracheck or chimeraslayer. Otherwise it creates the matching
// chimera object, passes it every setting parsed by the constructor, and
// processes the fasta file either in one pass or split across processes.
// NOTE(review): several branch headers and closing lines are not visible in
// this copy of the function; the comments below describe only what is shown.
267 int ChimeraSeqsCommand::execute(){
270 if (abort == true) { return 0; }
//wall-clock start, used for the elapsed-time message at the end
272 int start = time(NULL);
//pick the implementation; ChimeraSlayer is constructed with "blast" instead of fastafile/outputDir
274 if (method == "bellerophon") { chimera = new Bellerophon(fastafile, outputDir); }
275 else if (method == "pintail") { chimera = new Pintail(fastafile, outputDir); }
276 else if (method == "ccode") { chimera = new Ccode(fastafile, outputDir); }
277 else if (method == "chimeracheck") { chimera = new ChimeraCheckRDP(fastafile, outputDir); }
278 else if (method == "chimeraslayer") { chimera = new ChimeraSlayer("blast"); }
279 else { mothurOut("Not a valid method."); mothurOutEndLine(); return 0; }
282 if (maskfile == "default") { mothurOut("I am using the default 236627 EU009184.1 Shigella dysenteriae str. FBD013."); mothurOutEndLine(); }
//every setting is handed over regardless of which method was chosen
284 chimera->setCons(consfile);
285 chimera->setQuantiles(quanfile);
286 chimera->setMask(maskfile);
287 chimera->setFilter(filter);
288 chimera->setCorrection(correction);
289 chimera->setProcessors(processors);
290 chimera->setWindow(window);
291 chimera->setIncrement(increment);
292 chimera->setNumWanted(numwanted);
293 chimera->setKmerSize(ksize);
294 chimera->setSVG(svg);
295 chimera->setName(namefile);
296 chimera->setMatch(match);
297 chimera->setMisMatch(mismatch);
298 chimera->setDivR(divR);
299 chimera->setParents(parents);
300 chimera->setMinSim(minSimilarity);
301 chimera->setMinCoverage(minCoverage);
302 chimera->setMinBS(minBS);
303 chimera->setMinSNP(minSNP);
304 chimera->setIters(iters);
305 chimera->setTemplateFile(templatefile);
//template sequences are loaded here for every method except bellerophon and chimeracheck
309 vector<Sequence*> templateSeqs;
310 if ((method != "bellerophon") && (method != "chimeracheck")) {
311 templateSeqs = chimera->readSeqs(templatefile);
312 if (chimera->getUnaligned()) {
//NOTE(review): the message names chimeraslayer, but this branch is also taken for pintail and ccode
313 mothurOut("Your sequences need to be aligned when you use the chimeraslayer method."); mothurOutEndLine();
315 for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
320 chimera->setTemplateSeqs(templateSeqs);
322 }else if (method == "bellerophon") {//run bellerophon separately since you need to read entire fastafile to run it
323 chimera->getChimeras();
//NOTE(review): maskfile is embedded in the output name; after inputDir is prepended in the
//constructor it may contain a directory path - confirm the resulting file name is valid
325 string outputFName = outputDir + getRootName(getSimpleName(fastafile)) + method + maskfile + ".chimeras";
327 openOutputFile(outputFName, out);
334 //some methods need to do prep work before processing the chimeras
//the header is written to its own temporary file so it can be combined with the results later
338 string tempHeader = outputDir + getRootName(getSimpleName(fastafile)) + method + maskfile + ".chimeras.tempHeader";
339 openOutputFile(tempHeader, outHeader);
341 chimera->printHeader(outHeader);
344 string outputFileName = outputDir + getRootName(getSimpleName(fastafile)) + method + maskfile + ".chimeras";
//process-splitting code is compiled only on Apple/Mach/Linux builds
347 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
//single-pass path: the number of sequences is the count of '>' characters in the file
350 openInputFile(fastafile, inFASTA);
351 numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
354 lines.push_back(new linePair(0, numSeqs));
356 driver(lines[0], outputFileName, fastafile);
//multi-process path: record the file offset of every sequence header
359 vector<int> positions;
360 processIDS.resize(0);
363 openInputFile(fastafile, inFASTA);
366 while(!inFASTA.eof()){
367 input = getline(inFASTA);
368 if (input.length() != 0) {
//tellg() is taken after the line was read, so back up by the line length plus one
//(presumably the line terminator - assumes a one-character line ending, TODO confirm)
369 if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); }
374 numSeqs = positions.size();
//integer division; the last process picks up the remainder below
376 int numSeqsPerProcessor = numSeqs / processors;
378 for (int i = 0; i < processors; i++) {
//NOTE(review): positions[0] is read even if no header was found - confirm an empty fasta cannot reach here
379 long int startPos = positions[ i * numSeqsPerProcessor ];
380 if(i == processors - 1){
381 numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor;
383 lines.push_back(new linePair(startPos, numSeqsPerProcessor));
387 createProcesses(outputFileName, fastafile);
//the first process's temp file becomes the output file
389 rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str());
391 //append alignment and report files
392 for(int i=1;i<processors;i++){
//NOTE(review): appendOutputFiles() writes into its first argument and reads from its second,
//so this copies outputFileName into the per-process temp file, which is then removed -
//confirm the argument order; as written the results of processes 1..n-1 look discarded
393 appendOutputFiles((outputFileName + toString(processIDS[i]) + ".temp"), outputFileName);
394 remove((outputFileName + toString(processIDS[i]) + ".temp").c_str());
//NOTE(review): candidateFileNames and s are not defined anywhere in the visible code; presumably
//this is the non-unix branch of the #if above - confirm it compiles on those platforms
400 openInputFile(candidateFileNames[s], inFASTA);
401 numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
403 lines.push_back(new linePair(0, numSeqs));
405 driver(lines[0], outputFileName, fastafile);
408 //mothurOut("Output File Names: ");
409 //if ((filter) && (method == "bellerophon")) { mothurOut(
410 //if (outputDir == "") { fastafile = getRootName(fastafile) + "filter.fasta"; }
411 // else { fastafile = outputDir + getRootName(getSimpleName(fastafile)) + "filter.fasta"; }
//NOTE(review): this appends the results onto tempHeader and the next line deletes tempHeader;
//no visible step moves the combined header+results back to outputFileName - confirm
413 appendOutputFiles(tempHeader, outputFileName);
414 remove(tempHeader.c_str());
//release the template sequences read above
416 for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
418 if (method == "chimeracheck") { mothurOutEndLine(); mothurOut("This method does not determine if a sequence is chimeric, but allows you to make that determination based on the IS values."); mothurOutEndLine(); }
420 mothurOutEndLine(); mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); mothurOutEndLine();
//report any exception through errorOut, tagged with the class and function name
425 catch(exception& e) {
426 errorOut(e, "ChimeraSeqsCommand", "execute");
429 }//**********************************************************************************************************************
// Processes one slice of the fasta file.
//
// line         - start offset (line->start) and number of sequences (line->numSeqs) to handle
// outputFName  - file opened for this slice's results
// filename     - fasta file to read candidate sequences from
// NOTE(review): the statements that write results, free candidateSeq and return a value
// are not visible in this copy of the function - confirm against the complete file.
431 int ChimeraSeqsCommand::driver(linePair* line, string outputFName, string filename){
434 openOutputFile(outputFName, out);
437 openInputFile(filename, inFASTA);
//jump to the first sequence of this slice
439 inFASTA.seekg(line->start);
441 for(int i=0;i<line->numSeqs;i++){
//read the next sequence, then skip trailing whitespace with gobble()
443 Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA);
445 if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
448 chimera->getChimeras(candidateSeq);
//progress message every 100 sequences
456 if((i+1) % 100 == 0){ mothurOut("Processing sequence: " + toString(i+1)); mothurOutEndLine(); }
//final progress message when the total is not a multiple of 100
459 if((line->numSeqs) % 100 != 0){ mothurOut("Processing sequence: " + toString(line->numSeqs)); mothurOutEndLine(); }
//report any exception through errorOut, tagged with the class and function name
466 catch(exception& e) {
467 errorOut(e, "ChimeraSeqsCommand", "driver");
472 /**************************************************************************************************/
// Runs driver() once per entry in lines, one process each, and waits for all of them.
//
// outputFileName - base name; each worker writes to outputFileName + <its pid> + ".temp",
//                  the same naming execute() uses when it collects the files via processIDS
// filename       - fasta file passed through to driver()
// NOTE(review): the statement that creates each process and the branch conditions around
// the lines below are not visible in this copy - confirm against the complete file.
474 void ChimeraSeqsCommand::createProcesses(string outputFileName, string filename) {
//only compiled on Apple/Mach/Linux builds
476 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
478 // processIDS.resize(0);
480 //loop through and create all the processes you want
481 while (process != processors) {
485 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
//the worker names its output with its own pid from getpid()
488 driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename);
//NOTE(review): exit(0) reports success to the caller even though spawning failed - confirm intended
490 }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
493 //force parent to wait until all the processes are done
494 for (int i=0;i<processors;i++) {
495 int temp = processIDS[i];
//report any exception through errorOut, tagged with the class and function name
500 catch(exception& e) {
501 errorOut(e, "ChimeraSeqsCommand", "createProcesses");
506 /**************************************************************************************************/
// Copies the contents of one file onto the end of another, one character at a time.
//
// temp     - file passed to openOutputFileAppend(); this is the file written to
// filename - file passed to openInputFile(); this is the file read from
// NOTE(review): data flows from filename INTO temp. Both callers in execute() delete the
// file they pass as temp right after this call - confirm the argument roles are as intended.
508 void ChimeraSeqsCommand::appendOutputFiles(string temp, string filename) {
514 openOutputFileAppend(temp, output);
515 openInputFile(filename, input);
//the loop condition is the character itself, so a NUL byte in the input also ends the copy
517 while(char c = input.get()){
//get() at end of file sets eof; stop before writing that character
518 if(input.eof()) { break; }
519 else { output << c; }
//report any exception through errorOut (the tag below is spelled "appendOuputFiles" in the runtime string)
525 catch(exception& e) {
526 errorOut(e, "ChimeraSeqsCommand", "appendOuputFiles");
530 //**********************************************************************************************************************