2 * filterseqscommand.cpp
5 * Created by Thomas Ryabin on 5/4/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "filterseqscommand.h"
11 #include "sequence.hpp"
13 /**************************************************************************************/
15 FilterSeqsCommand::FilterSeqsCommand(string option) {
20 //allow user to run help
21 if(option == "help") { help(); abort = true; }
24 //valid paramters for this command
25 string Array[] = {"fasta", "trump", "soft", "hard", "vertical", "outputdir","inputdir", "processors"};
26 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
28 OptionParser parser(option);
29 map<string,string> parameters = parser.getParameters();
31 ValidParameters validParameter;
32 map<string,string>::iterator it;
34 //check to make sure all parameters are valid for command
35 for (it = parameters.begin(); it != parameters.end(); it++) {
36 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
39 //if the user changes the input directory command factory will send this info to us in the output parameter
40 string inputDir = validParameter.validFile(parameters, "inputdir", false);
41 if (inputDir == "not found"){ inputDir = ""; }
44 it = parameters.find("fasta");
45 //user has given a template file
46 if(it != parameters.end()){
47 path = hasPath(it->second);
48 //if the user has not given a path then, add inputdir. else leave path alone.
49 if (path == "") { parameters["fasta"] = inputDir + it->second; }
52 it = parameters.find("hard");
53 //user has given a template file
54 if(it != parameters.end()){
55 path = hasPath(it->second);
56 //if the user has not given a path then, add inputdir. else leave path alone.
57 if (path == "") { parameters["hard"] = inputDir + it->second; }
61 //check for required parameters
62 fasta = validParameter.validFile(parameters, "fasta", false);
63 if (fasta == "not found") { m->mothurOut("fasta is a required parameter for the filter.seqs command."); m->mothurOutEndLine(); abort = true; }
65 splitAtDash(fasta, fastafileNames);
67 //go through files and make sure they are good, if not, then disregard them
68 for (int i = 0; i < fastafileNames.size(); i++) {
70 string path = hasPath(fastafileNames[i]);
71 //if the user has not given a path then, add inputdir. else leave path alone.
72 if (path == "") { fastafileNames[i] = inputDir + fastafileNames[i]; }
77 ableToOpen = openInputFile(fastafileNames[i], in);
78 if (ableToOpen == 1) {
79 m->mothurOut(fastafileNames[i] + " will be disregarded."); m->mothurOutEndLine();
80 //erase from file list
81 fastafileNames.erase(fastafileNames.begin()+i);
84 string simpleName = getSimpleName(fastafileNames[i]);
85 filterFileName += simpleName.substr(0, simpleName.find_first_of('.'));
90 //make sure there is at least one valid file left
91 if (fastafileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; }
95 //if the user changes the output directory command factory will send this info to us in the output parameter
96 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
98 outputDir += hasPath(fastafileNames[0]); //if user entered a file with a path then preserve it
101 //check for optional parameter and set defaults
102 // ...at some point should added some additional type checking...
105 temp = validParameter.validFile(parameters, "trump", false); if (temp == "not found") { temp = "*"; }
108 temp = validParameter.validFile(parameters, "soft", false); if (temp == "not found") { soft = 0; }
109 else { soft = (float)atoi(temp.c_str()) / 100.0; }
111 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; }
112 convert(temp, processors);
114 hard = validParameter.validFile(parameters, "hard", true); if (hard == "not found") { hard = ""; }
115 else if (hard == "not open") { abort = true; }
117 vertical = validParameter.validFile(parameters, "vertical", false); if (vertical == "not found") { vertical = "T"; }
124 catch(exception& e) {
125 m->errorOut(e, "FilterSeqsCommand", "FilterSeqsCommand");
130 //**********************************************************************************************************************
132 void FilterSeqsCommand::help(){
134 m->mothurOut("The filter.seqs command reads a file containing sequences and creates a .filter and .filter.fasta file.\n");
135 m->mothurOut("The filter.seqs command parameters are fasta, trump, soft, hard and vertical. \n");
136 m->mothurOut("The fasta parameter is required. You may enter several fasta files to build the filter from and filter, by separating their names with -'s.\n");
137 m->mothurOut("For example: fasta=abrecovery.fasta-amazon.fasta \n");
138 m->mothurOut("The trump parameter .... The default is ...\n");
139 m->mothurOut("The soft parameter .... The default is ....\n");
140 m->mothurOut("The hard parameter .... The default is ....\n");
141 m->mothurOut("The vertical parameter .... The default is T.\n");
142 m->mothurOut("The filter.seqs command should be in the following format: \n");
143 m->mothurOut("filter.seqs(fasta=yourFastaFile, trump=yourTrump, soft=yourSoft, hard=yourHard, vertical=yourVertical) \n");
144 m->mothurOut("Example filter.seqs(fasta=abrecovery.fasta, trump=..., soft=..., hard=..., vertical=T).\n");
145 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n");
148 catch(exception& e) {
149 m->errorOut(e, "FilterSeqsCommand", "help");
154 /**************************************************************************************/
156 int FilterSeqsCommand::execute() {
159 if (abort == true) { return 0; }
160 vector<string> outputNames;
163 openInputFile(fastafileNames[0], inFASTA);
165 Sequence testSeq(inFASTA);
166 alignmentLength = testSeq.getAlignLength();
169 ////////////create filter/////////////////
171 filter = createFilter();
175 string filterFile = outputDir + filterFileName + ".filter";
176 openOutputFile(filterFile, outFilter);
177 outFilter << filter << endl;
179 outputNames.push_back(filterFile);
182 ////////////run filter/////////////////
185 for (int i = 0; i < fastafileNames.size(); i++) {
187 openInputFile(fastafileNames[i], in);
188 string filteredFasta = outputDir + getRootName(getSimpleName(fastafileNames[i])) + "filter.fasta";
190 openOutputFile(filteredFasta, outFASTA);
191 outputNames.push_back(filteredFasta);
195 if (m->control_pressed) { in.close(); outFASTA.close(); for(int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
198 if (seq.getName() != "") {
199 string align = seq.getAligned();
200 string filterSeq = "";
202 for(int j=0;j<alignmentLength;j++){
203 if(filter[j] == '1'){
204 filterSeq += align[j];
208 outFASTA << '>' << seq.getName() << endl << filterSeq << endl;
217 int filteredLength = 0;
218 for(int i=0;i<alignmentLength;i++){
219 if(filter[i] == '1'){ filteredLength++; }
222 if (m->control_pressed) { for(int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
225 m->mothurOutEndLine();
226 m->mothurOut("Length of filtered alignment: " + toString(filteredLength)); m->mothurOutEndLine();
227 m->mothurOut("Number of columns removed: " + toString((alignmentLength-filteredLength))); m->mothurOutEndLine();
228 m->mothurOut("Length of the original alignment: " + toString(alignmentLength)); m->mothurOutEndLine();
229 m->mothurOut("Number of sequences used to construct filter: " + toString(numSeqs)); m->mothurOutEndLine();
232 m->mothurOutEndLine();
233 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
234 for(int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
235 m->mothurOutEndLine();
240 catch(exception& e) {
241 m->errorOut(e, "FilterSeqsCommand", "execute");
245 /**************************************************************************************/
246 string FilterSeqsCommand::createFilter() {
248 string filterString = "";
252 if (soft != 0) { F.setSoft(soft); }
253 if (trump != '*') { F.setTrump(trump); }
255 F.setLength(alignmentLength);
257 if(soft != 0 || isTrue(vertical)){
261 if(hard.compare("") != 0) { F.doHard(hard); }
262 else { F.setFilter(string(alignmentLength, '1')); }
266 if(trump != '*' || isTrue(vertical) || soft != 0){
267 for (int s = 0; s < fastafileNames.size(); s++) {
269 for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
271 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
274 openInputFile(fastafileNames[s], inFASTA);
275 int numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
278 numSeqs += numFastaSeqs;
280 lines.push_back(new linePair(0, numFastaSeqs));
282 driverCreateFilter(F, fastafileNames[s], lines[0]);
284 vector<int> positions;
287 openInputFile(fastafileNames[s], inFASTA);
290 while(!inFASTA.eof()){
291 input = getline(inFASTA);
292 if (input.length() != 0) {
293 if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); }
298 int numFastaSeqs = positions.size();
300 numSeqs += numFastaSeqs;
302 int numSeqsPerProcessor = numFastaSeqs / processors;
304 for (int i = 0; i < processors; i++) {
305 long int startPos = positions[ i * numSeqsPerProcessor ];
306 if(i == processors - 1){
307 numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
309 lines.push_back(new linePair(startPos, numSeqsPerProcessor));
312 createProcessesCreateFilter(F, fastafileNames[s]);
316 openInputFile(fastafileNames[s], inFASTA);
317 int numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
320 numSeqs += numFastaSeqs;
322 lines.push_back(new linePair(0, numFastaSeqs));
324 driverCreateFilter(F, lines[0], fastafileNames[s]);
331 F.setNumSeqs(numSeqs);
333 if(isTrue(vertical) == 1) { F.doVertical(); }
334 if(soft != 0) { F.doSoft(); }
336 filterString = F.getFilter();
340 catch(exception& e) {
341 m->errorOut(e, "FilterSeqsCommand", "createFilter");
345 /**************************************************************************************/
346 int FilterSeqsCommand::driverCreateFilter(Filters& F, string filename, linePair* line) {
350 openInputFile(filename, in);
352 in.seekg(line->start);
354 for(int i=0;i<line->numSeqs;i++){
356 if (m->control_pressed) { in.close(); return 1; }
359 if (seq.getName() != "") {
360 if(trump != '*'){ F.doTrump(seq); }
361 if(isTrue(vertical) || soft != 0){ F.getFreqs(seq); }
370 catch(exception& e) {
371 m->errorOut(e, "FilterSeqsCommand", "driverCreateFilter");
375 /**************************************************************************************************/
377 int FilterSeqsCommand::createProcessesCreateFilter(Filters& F, string filename) {
379 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
382 vector<int> processIDS;
384 //loop through and create all the processes you want
385 while (process != processors) {
389 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
392 driverCreateFilter(F, filename, lines[process]);
394 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
397 //force parent to wait until all the processes are done
398 for (int i=0;i<processors;i++) {
399 int temp = processIDS[i];
406 catch(exception& e) {
407 m->errorOut(e, "FilterSeqsCommand", "createProcessesCreateFilter");
411 /**************************************************************************************/