5 * Created by westcott on 5/10/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "chopseqscommand.h"
11 #include "sequence.hpp"
13 //**********************************************************************************************************************
14 vector<string> ChopSeqsCommand::setParameters(){
16 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,true,true); parameters.push_back(pfasta);
17 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
18 CommandParameter pnumbases("numbases", "Number", "", "0", "", "", "","",false,true,true); parameters.push_back(pnumbases);
19 CommandParameter pcountgaps("countgaps", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pcountgaps);
20 CommandParameter pshort("short", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pshort);
21 CommandParameter pkeep("keep", "Multiple", "front-back", "front", "", "", "","",false,false); parameters.push_back(pkeep);
22 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
23 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
25 vector<string> myArray;
26 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
30 m->errorOut(e, "ChopSeqsCommand", "setParameters");
34 //**********************************************************************************************************************
35 string ChopSeqsCommand::getHelpString(){
37 string helpString = "";
38 helpString += "The chop.seqs command reads a fasta file and outputs a .chop.fasta containing the trimmed sequences. Note: If a sequence is completely 'chopped', an accnos file will be created with the names of the sequences removed. \n";
39 helpString += "The chop.seqs command parameters are fasta, numbases, countgaps and keep. fasta is required unless you have a valid current fasta file. numbases is required.\n";
40 helpString += "The chop.seqs command should be in the following format: chop.seqs(fasta=yourFasta, numbases=yourNum, keep=yourKeep).\n";
41 helpString += "The numbases parameter allows you to specify the number of bases you want to keep.\n";
42 helpString += "The keep parameter allows you to specify whether you want to keep the front or the back of your sequence, default=front.\n";
43 helpString += "The countgaps parameter allows you to specify whether you want to count gaps as bases, default=false.\n";
44 helpString += "The short parameter allows you to specify you want to keep sequences that are too short to chop, default=false.\n";
45 helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n";
46 helpString += "For example, if you ran chop.seqs with numbases=200 and short=t, if a sequence had 100 bases mothur would keep the sequence rather than eliminate it.\n";
47 helpString += "Example chop.seqs(fasta=amazon.fasta, numbases=200, keep=front).\n";
48 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
52 m->errorOut(e, "ChopSeqsCommand", "getHelpString");
56 //**********************************************************************************************************************
57 string ChopSeqsCommand::getOutputPattern(string type) {
61 if (type == "fasta") { pattern = "[filename],chop.fasta"; }
62 else if (type == "accnos") { pattern = "[filename],chop.accnos"; }
63 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
68 m->errorOut(e, "ChopSeqsCommand", "getOutputPattern");
72 //**********************************************************************************************************************
73 ChopSeqsCommand::ChopSeqsCommand(){
75 abort = true; calledHelp = true;
77 vector<string> tempOutNames;
78 outputTypes["fasta"] = tempOutNames;
79 outputTypes["accnos"] = tempOutNames;
82 m->errorOut(e, "ChopSeqsCommand", "ChopSeqsCommand");
86 //**********************************************************************************************************************
87 ChopSeqsCommand::ChopSeqsCommand(string option) {
89 abort = false; calledHelp = false;
91 //allow user to run help
92 if(option == "help") { help(); abort = true; calledHelp = true; }
93 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
96 vector<string> myArray = setParameters();
98 OptionParser parser(option);
99 map<string,string> parameters = parser.getParameters();
101 ValidParameters validParameter;
102 map<string,string>::iterator it;
104 //check to make sure all parameters are valid for command
105 for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
106 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
109 //initialize outputTypes
110 vector<string> tempOutNames;
111 outputTypes["fasta"] = tempOutNames;
112 outputTypes["accnos"] = tempOutNames;
114 //if the user changes the input directory command factory will send this info to us in the output parameter
115 string inputDir = validParameter.validFile(parameters, "inputdir", false);
116 if (inputDir == "not found"){ inputDir = ""; }
119 it = parameters.find("fasta");
120 //user has given a template file
121 if(it != parameters.end()){
122 path = m->hasPath(it->second);
123 //if the user has not given a path then, add inputdir. else leave path alone.
124 if (path == "") { parameters["fasta"] = inputDir + it->second; }
128 //check for required parameters
129 fastafile = validParameter.validFile(parameters, "fasta", true);
130 if (fastafile == "not open") { abort = true; }
131 else if (fastafile == "not found") { //if there is a current fasta file, use it
132 fastafile = m->getFastaFile();
133 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
134 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
135 }else { m->setFastaFile(fastafile); }
137 //if the user changes the output directory command factory will send this info to us in the output parameter
138 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(fastafile); }
140 string temp = validParameter.validFile(parameters, "numbases", false); if (temp == "not found") { temp = "0"; }
141 m->mothurConvert(temp, numbases);
143 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
144 m->setProcessors(temp);
145 m->mothurConvert(temp, processors);
147 temp = validParameter.validFile(parameters, "countgaps", false); if (temp == "not found") { temp = "f"; }
148 countGaps = m->isTrue(temp);
150 temp = validParameter.validFile(parameters, "short", false); if (temp == "not found") { temp = "f"; }
151 Short = m->isTrue(temp);
153 keep = validParameter.validFile(parameters, "keep", false); if (keep == "not found") { keep = "front"; }
155 if (numbases == 0) { m->mothurOut("You must provide the number of bases you want to keep for the chops.seqs command."); m->mothurOutEndLine(); abort = true; }
159 catch(exception& e) {
160 m->errorOut(e, "ChopSeqsCommand", "ChopSeqsCommand");
164 //**********************************************************************************************************************
166 int ChopSeqsCommand::execute(){
169 if (abort == true) { if (calledHelp) { return 0; } return 2; }
171 map<string, string> variables;
172 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastafile));
173 string outputFileName = getOutputFileName("fasta", variables);
174 string outputFileNameAccnos = getOutputFileName("accnos", variables);
176 vector<unsigned long long> positions;
177 vector<linePair> lines;
178 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
179 positions = m->divideFile(fastafile, processors);
180 for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
183 positions = m->setFilePosFasta(fastafile, numSeqs);
184 if (positions.size() < processors) { processors = positions.size(); }
186 //figure out how many sequences you have to process
187 int numSeqsPerProcessor = numSeqs / processors;
188 for (int i = 0; i < processors; i++) {
189 int startIndex = i * numSeqsPerProcessor;
190 if(i == (processors - 1)){ numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor; }
191 lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
195 bool wroteAccnos = false;
196 if(processors == 1) { wroteAccnos = driver(lines[0], fastafile, outputFileName, outputFileNameAccnos); }
197 else { wroteAccnos = createProcesses(lines, fastafile, outputFileName, outputFileNameAccnos); }
199 if (m->control_pressed) { return 0; }
201 m->mothurOutEndLine();
202 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
203 m->mothurOut(outputFileName); m->mothurOutEndLine(); outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
205 if (wroteAccnos) { m->mothurOut(outputFileNameAccnos); m->mothurOutEndLine(); outputNames.push_back(outputFileNameAccnos); outputTypes["accnos"].push_back(outputFileNameAccnos); }
206 else { m->mothurRemove(outputFileNameAccnos); }
208 m->mothurOutEndLine();
210 //set fasta file as new current fastafile
212 itTypes = outputTypes.find("fasta");
213 if (itTypes != outputTypes.end()) {
214 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
217 if (wroteAccnos) { //set accnos file as new current accnosfile
218 itTypes = outputTypes.find("accnos");
219 if (itTypes != outputTypes.end()) {
220 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); }
228 catch(exception& e) {
229 m->errorOut(e, "ChopSeqsCommand", "execute");
233 /**************************************************************************************************/
234 bool ChopSeqsCommand::createProcesses(vector<linePair> lines, string filename, string outFasta, string outAccnos) {
237 bool wroteAccnos = false;
238 vector<int> processIDS;
239 vector<string> nonBlankAccnosFiles;
241 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
243 //loop through and create all the processes you want
244 while (process != processors) {
248 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
251 wroteAccnos = driver(lines[process], filename, outFasta + toString(getpid()) + ".temp", outAccnos + toString(getpid()) + ".temp");
253 //pass numSeqs to parent
255 string tempFile = fastafile + toString(getpid()) + ".bool.temp";
256 m->openOutputFile(tempFile, out);
257 out << wroteAccnos << endl;
262 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
263 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
269 wroteAccnos = driver(lines[0], filename, outFasta, outAccnos);
271 //force parent to wait until all the processes are done
272 for (int i=0;i<processIDS.size();i++) {
273 int temp = processIDS[i];
278 if (wroteAccnos) { nonBlankAccnosFiles.push_back(outAccnos); }
279 else { m->mothurRemove(outAccnos); } //remove so other files can be renamed to it
281 //parent reads in and combine Filter info
282 for (int i = 0; i < processIDS.size(); i++) {
283 string tempFilename = fastafile + toString(processIDS[i]) + ".bool.temp";
285 m->openInputFile(tempFilename, in);
288 in >> temp; m->gobble(in);
289 if (temp) { wroteAccnos = temp; nonBlankAccnosFiles.push_back(outAccnos + toString(processIDS[i]) + ".temp"); }
290 else { m->mothurRemove((outAccnos + toString(processIDS[i]) + ".temp")); }
293 m->mothurRemove(tempFilename);
296 //////////////////////////////////////////////////////////////////////////////////////////////////////
297 //Windows version shared memory, so be careful when passing variables through the seqSumData struct.
298 //Above fork() will clone, so memory is separate, but that's not the case with windows,
299 //Taking advantage of shared memory to allow both threads to add info to vectors.
300 //////////////////////////////////////////////////////////////////////////////////////////////////////
302 vector<chopData*> pDataArray;
303 DWORD dwThreadIdArray[processors-1];
304 HANDLE hThreadArray[processors-1];
306 //Create processor worker threads.
307 for( int i=0; i<processors-1; i++ ){
309 string extension = "";
310 if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); }
311 // Allocate memory for thread data.
312 chopData* tempChop = new chopData(filename, (outFasta+extension), (outAccnos+extension), m, lines[i].start, lines[i].end, keep, countGaps, numbases, Short);
313 pDataArray.push_back(tempChop);
315 //MyChopThreadFunction is in header. It must be global or static to work with the threads.
316 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
317 hThreadArray[i] = CreateThread(NULL, 0, MyChopThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
321 wroteAccnos = driver(lines[processors-1], filename, (outFasta + toString(processors-1) + ".temp"), (outAccnos + toString(processors-1) + ".temp"));
322 processIDS.push_back(processors-1);
324 //Wait until all threads have terminated.
325 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
327 if (wroteAccnos) { nonBlankAccnosFiles.push_back(outAccnos); }
328 else { m->mothurRemove(outAccnos); } //remove so other files can be renamed to it
330 //Close all thread handles and free memory allocations.
331 for(int i=0; i < pDataArray.size(); i++){
332 if (pDataArray[i]->wroteAccnos) { wroteAccnos = pDataArray[i]->wroteAccnos; nonBlankAccnosFiles.push_back(outAccnos + toString(processIDS[i]) + ".temp"); }
333 else { m->mothurRemove((outAccnos + toString(processIDS[i]) + ".temp")); }
334 CloseHandle(hThreadArray[i]);
335 delete pDataArray[i];
339 for (int i = 0; i < processIDS.size(); i++) {
340 m->appendFiles((outFasta + toString(processIDS[i]) + ".temp"), outFasta);
341 m->mothurRemove((outFasta + toString(processIDS[i]) + ".temp"));
344 if (nonBlankAccnosFiles.size() != 0) {
345 m->renameFile(nonBlankAccnosFiles[0], outAccnos);
347 for (int h=1; h < nonBlankAccnosFiles.size(); h++) {
348 m->appendFiles(nonBlankAccnosFiles[h], outAccnos);
349 m->mothurRemove(nonBlankAccnosFiles[h]);
351 }else { //recreate the accnosfile if needed
353 m->openOutputFile(outAccnos, out);
359 catch(exception& e) {
360 m->errorOut(e, "ChopSeqsCommand", "createProcesses");
364 /**************************************************************************************/
365 bool ChopSeqsCommand::driver(linePair filePos, string filename, string outFasta, string outAccnos) {
369 m->openOutputFile(outFasta, out);
372 m->openOutputFile(outAccnos, outAcc);
375 m->openInputFile(filename, in);
377 in.seekg(filePos.start);
380 bool wroteAccnos = false;
385 if (m->control_pressed) { in.close(); out.close(); return 1; }
387 Sequence seq(in); m->gobble(in);
389 if (m->control_pressed) { in.close(); out.close(); outAcc.close(); m->mothurRemove(outFasta); m->mothurRemove(outAccnos); return 0; }
391 if (seq.getName() != "") {
392 string newSeqString = getChopped(seq);
394 //output trimmed sequence
395 if (newSeqString != "") {
396 out << ">" << seq.getName() << endl << newSeqString << endl;
398 outAcc << seq.getName() << endl;
404 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
405 unsigned long long pos = in.tellg();
406 if ((pos == -1) || (pos >= filePos.end)) { break; }
408 if (in.eof()) { break; }
411 if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
415 if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
424 catch(exception& e) {
425 m->errorOut(e, "ChopSeqsCommand", "driver");
429 //**********************************************************************************************************************
430 string ChopSeqsCommand::getChopped(Sequence seq) {
432 string temp = seq.getAligned();
433 string tempUnaligned = seq.getUnaligned();
436 //if needed trim sequence
437 if (keep == "front") {//you want to keep the beginning
438 int tempLength = temp.length();
440 if (tempLength > numbases) { //you have enough bases to remove some
443 int numBasesCounted = 0;
445 for (int i = 0; i < temp.length(); i++) {
447 if (toupper(temp[i]) == 'N') { temp[i] = '.'; }
451 if (numBasesCounted >= numbases) { stopSpot = i; break; }
454 if (stopSpot == 0) { temp = ""; }
455 else { temp = temp.substr(0, stopSpot+1); }
458 if (!Short) { temp = ""; } //sequence too short
460 }else { //you are keeping the back
461 int tempLength = temp.length();
462 if (tempLength > numbases) { //you have enough bases to remove some
465 int numBasesCounted = 0;
467 for (int i = (temp.length()-1); i >= 0; i--) {
469 if (toupper(temp[i]) == 'N') { temp[i] = '.'; }
473 if (numBasesCounted >= numbases) { stopSpot = i; break; }
476 if (stopSpot == 0) { temp = ""; }
477 else { temp = temp.substr(stopSpot+1); }
479 if (!Short) { temp = ""; } //sequence too short
485 //if needed trim sequence
486 if (keep == "front") {//you want to keep the beginning
487 int tempLength = tempUnaligned.length();
489 if (tempLength > numbases) { //you have enough bases to remove some
492 int numBasesCounted = 0;
494 for (int i = 0; i < temp.length(); i++) {
496 if (toupper(temp[i]) == 'N') {
499 if (tempLength < numbases) { stopSpot = 0; break; }
502 if(isalpha(temp[i])) { numBasesCounted++; }
504 if (numBasesCounted >= numbases) { stopSpot = i; break; }
507 if (stopSpot == 0) { temp = ""; }
508 else { temp = temp.substr(0, stopSpot+1); }
511 if (!Short) { temp = ""; } //sequence too short
513 }else { //you are keeping the back
514 int tempLength = tempUnaligned.length();
515 if (tempLength > numbases) { //you have enough bases to remove some
518 int numBasesCounted = 0;
520 for (int i = (temp.length()-1); i >= 0; i--) {
522 if (toupper(temp[i]) == 'N') {
525 if (tempLength < numbases) { stopSpot = 0; break; }
528 if(isalpha(temp[i])) { numBasesCounted++; }
530 if (numBasesCounted >= numbases) { stopSpot = i; break; }
533 if (stopSpot == 0) { temp = ""; }
534 else { temp = temp.substr(stopSpot); }
536 if (!Short) { temp = ""; } //sequence too short
543 catch(exception& e) {
544 m->errorOut(e, "ChopSeqsCommand", "getChopped");
548 //**********************************************************************************************************************