]> git.donarmstrong.com Git - mothur.git/blob - chopseqscommand.cpp
Merge remote-tracking branch 'origin/master'
[mothur.git] / chopseqscommand.cpp
1 /*
2  *  chopseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 5/10/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "chopseqscommand.h"
11 #include "sequence.hpp"
12
13 //**********************************************************************************************************************
14 vector<string> ChopSeqsCommand::setParameters(){        
15         try {
16                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,true,true); parameters.push_back(pfasta);
17                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
18         CommandParameter pnumbases("numbases", "Number", "", "0", "", "", "","",false,true,true); parameters.push_back(pnumbases);
19                 CommandParameter pcountgaps("countgaps", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pcountgaps);
20                 CommandParameter pshort("short", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pshort);
21                 CommandParameter pkeep("keep", "Multiple", "front-back", "front", "", "", "","",false,false); parameters.push_back(pkeep);
22                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
23                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
24                 
25                 vector<string> myArray;
26                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
27                 return myArray;
28         }
29         catch(exception& e) {
30                 m->errorOut(e, "ChopSeqsCommand", "setParameters");
31                 exit(1);
32         }
33 }
34 //**********************************************************************************************************************
35 string ChopSeqsCommand::getHelpString(){        
36         try {
37                 string helpString = "";
38                 helpString += "The chop.seqs command reads a fasta file and outputs a .chop.fasta containing the trimmed sequences. Note: If a sequence is completely 'chopped', an accnos file will be created with the names of the sequences removed. \n";
39                 helpString += "The chop.seqs command parameters are fasta, numbases, countgaps and keep. fasta is required unless you have a valid current fasta file. numbases is required.\n";
40                 helpString += "The chop.seqs command should be in the following format: chop.seqs(fasta=yourFasta, numbases=yourNum, keep=yourKeep).\n";
41                 helpString += "The numbases parameter allows you to specify the number of bases you want to keep.\n";
42                 helpString += "The keep parameter allows you to specify whether you want to keep the front or the back of your sequence, default=front.\n";
43                 helpString += "The countgaps parameter allows you to specify whether you want to count gaps as bases, default=false.\n";
44                 helpString += "The short parameter allows you to specify you want to keep sequences that are too short to chop, default=false.\n";
45                 helpString += "The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n";
46         helpString += "For example, if you ran chop.seqs with numbases=200 and short=t, if a sequence had 100 bases mothur would keep the sequence rather than eliminate it.\n";
47                 helpString += "Example chop.seqs(fasta=amazon.fasta, numbases=200, keep=front).\n";
48                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
49                 return helpString;
50         }
51         catch(exception& e) {
52                 m->errorOut(e, "ChopSeqsCommand", "getHelpString");
53                 exit(1);
54         }
55 }
56 //**********************************************************************************************************************
57 string ChopSeqsCommand::getOutputPattern(string type) {
58     try {
59         string pattern = "";
60         
61         if (type == "fasta") {  pattern = "[filename],chop.fasta"; } 
62         else if (type == "accnos") {  pattern = "[filename],chop.accnos"; } 
63         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
64         
65         return pattern;
66     }
67     catch(exception& e) {
68         m->errorOut(e, "ChopSeqsCommand", "getOutputPattern");
69         exit(1);
70     }
71 }
72 //**********************************************************************************************************************
73 ChopSeqsCommand::ChopSeqsCommand(){     
74         try {
75                 abort = true; calledHelp = true; 
76                 setParameters();
77                 vector<string> tempOutNames;
78                 outputTypes["fasta"] = tempOutNames;
79                 outputTypes["accnos"] = tempOutNames;
80         }
81         catch(exception& e) {
82                 m->errorOut(e, "ChopSeqsCommand", "ChopSeqsCommand");
83                 exit(1);
84         }
85 }
86 //**********************************************************************************************************************
87 ChopSeqsCommand::ChopSeqsCommand(string option)  {
88         try {
89                 abort = false; calledHelp = false;   
90                 
91                 //allow user to run help
92                 if(option == "help") { help(); abort = true; calledHelp = true; }
93                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
94                 
95                 else {
96                         vector<string> myArray = setParameters();
97                         
98                         OptionParser parser(option);
99                         map<string,string> parameters = parser.getParameters();
100                         
101                         ValidParameters validParameter;
102                         map<string,string>::iterator it;
103                         
104                         //check to make sure all parameters are valid for command
105                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
106                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
107                         }
108                         
109                         //initialize outputTypes
110                         vector<string> tempOutNames;
111                         outputTypes["fasta"] = tempOutNames;
112                         outputTypes["accnos"] = tempOutNames;
113                 
114                         //if the user changes the input directory command factory will send this info to us in the output parameter 
115                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
116                         if (inputDir == "not found"){   inputDir = "";          }
117                         else {
118                                 string path;
119                                 it = parameters.find("fasta");
120                                 //user has given a template file
121                                 if(it != parameters.end()){ 
122                                         path = m->hasPath(it->second);
123                                         //if the user has not given a path then, add inputdir. else leave path alone.
124                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
125                                 }
126                         }
127
128                         //check for required parameters
129                         fastafile = validParameter.validFile(parameters, "fasta", true);
130                         if (fastafile == "not open") { abort = true; }
131                         else if (fastafile == "not found") {                            //if there is a current fasta file, use it
132                                 fastafile = m->getFastaFile(); 
133                                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
134                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
135                         }else { m->setFastaFile(fastafile); }   
136                         
137                         //if the user changes the output directory command factory will send this info to us in the output parameter 
138                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(fastafile);      }
139                         
140                         string temp = validParameter.validFile(parameters, "numbases", false);  if (temp == "not found") { temp = "0"; } 
141                         m->mothurConvert(temp, numbases);   
142                         
143             temp = validParameter.validFile(parameters, "processors", false);   if (temp == "not found"){       temp = m->getProcessors();      }
144                         m->setProcessors(temp);
145                         m->mothurConvert(temp, processors);
146             
147                         temp = validParameter.validFile(parameters, "countgaps", false);        if (temp == "not found") { temp = "f"; } 
148                         countGaps = m->isTrue(temp);  
149                         
150                         temp = validParameter.validFile(parameters, "short", false);    if (temp == "not found") { temp = "f"; } 
151                         Short = m->isTrue(temp);   
152                 
153                         keep = validParameter.validFile(parameters, "keep", false);             if (keep == "not found") { keep = "front"; } 
154                                 
155                         if (numbases == 0)  { m->mothurOut("You must provide the number of bases you want to keep for the chops.seqs command."); m->mothurOutEndLine(); abort = true;  }
156                 }
157
158         }
159         catch(exception& e) {
160                 m->errorOut(e, "ChopSeqsCommand", "ChopSeqsCommand");
161                 exit(1);
162         }
163 }
164 //**********************************************************************************************************************
165
166 int ChopSeqsCommand::execute(){
167         try {
168                 
169                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
170                 
171         map<string, string> variables; 
172         variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastafile));
173         string outputFileName = getOutputFileName("fasta", variables);
174         string outputFileNameAccnos = getOutputFileName("accnos", variables);        
175         
176         vector<unsigned long long> positions; 
177         vector<linePair> lines;
178 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
179         positions = m->divideFile(fastafile, processors);
180         for (int i = 0; i < (positions.size()-1); i++) {        lines.push_back(linePair(positions[i], positions[(i+1)]));      }
181 #else
182         int numSeqs = 0;
183         positions = m->setFilePosFasta(fastafile, numSeqs); 
184         if (positions.size() < processors) { processors = positions.size(); }
185                 
186         //figure out how many sequences you have to process
187         int numSeqsPerProcessor = numSeqs / processors;
188         for (int i = 0; i < processors; i++) {
189             int startIndex =  i * numSeqsPerProcessor;
190             if(i == (processors - 1)){  numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor;        }
191             lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
192         }
193 #endif
194         
195         bool wroteAccnos = false;
196         if(processors == 1) {   wroteAccnos = driver(lines[0], fastafile, outputFileName, outputFileNameAccnos);        }
197         else                {   wroteAccnos = createProcesses(lines, fastafile, outputFileName, outputFileNameAccnos);  }
198         
199         if (m->control_pressed) {  return 0; }
200                 
201                 m->mothurOutEndLine();
202                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
203                 m->mothurOut(outputFileName); m->mothurOutEndLine();    outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
204                 
205                 if (wroteAccnos) { m->mothurOut(outputFileNameAccnos); m->mothurOutEndLine(); outputNames.push_back(outputFileNameAccnos); outputTypes["accnos"].push_back(outputFileNameAccnos); }
206                 else {  m->mothurRemove(outputFileNameAccnos);  }
207                 
208                 m->mothurOutEndLine();
209                 
210                 //set fasta file as new current fastafile
211                 string current = "";
212                 itTypes = outputTypes.find("fasta");
213                 if (itTypes != outputTypes.end()) {
214                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
215                 }
216                 
217                 if (wroteAccnos) { //set accnos file as new current accnosfile
218                         itTypes = outputTypes.find("accnos");
219                         if (itTypes != outputTypes.end()) {
220                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); }
221                         }
222                 }
223                 
224                 
225                 return 0;               
226         }
227
228         catch(exception& e) {
229                 m->errorOut(e, "ChopSeqsCommand", "execute");
230                 exit(1);
231         }
232 }
233 /**************************************************************************************************/
234 bool ChopSeqsCommand::createProcesses(vector<linePair> lines, string filename, string outFasta, string outAccnos) {
235         try {
236                 int process = 1;
237                 bool wroteAccnos = false;
238                 vector<int> processIDS;
239         vector<string> nonBlankAccnosFiles;
240                 
241 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
242                 
243                 //loop through and create all the processes you want
244                 while (process != processors) {
245                         int pid = fork();
246                         
247                         if (pid > 0) {
248                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
249                                 process++;
250                         }else if (pid == 0){
251                                 wroteAccnos = driver(lines[process], filename, outFasta + toString(getpid()) + ".temp", outAccnos + toString(getpid()) + ".temp");
252                                 
253                                 //pass numSeqs to parent
254                                 ofstream out;
255                                 string tempFile = fastafile + toString(getpid()) + ".bool.temp";
256                                 m->openOutputFile(tempFile, out);
257                                 out << wroteAccnos << endl;                             
258                                 out.close();
259                                 
260                                 exit(0);
261                         }else { 
262                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
263                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
264                                 exit(0);
265                         }
266                 }
267                 
268                 //do your part
269                 wroteAccnos = driver(lines[0], filename, outFasta, outAccnos);
270         
271                 //force parent to wait until all the processes are done
272                 for (int i=0;i<processIDS.size();i++) { 
273                         int temp = processIDS[i];
274                         wait(&temp);
275                 }
276                 
277         
278                 if (wroteAccnos) { nonBlankAccnosFiles.push_back(outAccnos); }
279                 else { m->mothurRemove(outAccnos); } //remove so other files can be renamed to it
280         
281                 //parent reads in and combine Filter info
282                 for (int i = 0; i < processIDS.size(); i++) {
283                         string tempFilename = fastafile + toString(processIDS[i]) + ".bool.temp";
284                         ifstream in;
285                         m->openInputFile(tempFilename, in);
286                         
287                         bool temp;
288                         in >> temp; m->gobble(in); 
289             if (temp) { wroteAccnos = temp; nonBlankAccnosFiles.push_back(outAccnos + toString(processIDS[i]) + ".temp");  }
290                         else { m->mothurRemove((outAccnos + toString(processIDS[i]) + ".temp"));  }
291             
292                         in.close();
293                         m->mothurRemove(tempFilename);
294                 }
295 #else
296                 //////////////////////////////////////////////////////////////////////////////////////////////////////
297                 //Windows version shared memory, so be careful when passing variables through the seqSumData struct. 
298                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
299                 //Taking advantage of shared memory to allow both threads to add info to vectors.
300                 //////////////////////////////////////////////////////////////////////////////////////////////////////
301                 
302                 vector<chopData*> pDataArray; 
303                 DWORD   dwThreadIdArray[processors-1];
304                 HANDLE  hThreadArray[processors-1]; 
305                 
306                 //Create processor worker threads.
307                 for( int i=0; i<processors-1; i++ ){
308             
309             string extension = "";
310             if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); }
311                         // Allocate memory for thread data.
312                         chopData* tempChop = new chopData(filename, (outFasta+extension), (outAccnos+extension), m, lines[i].start, lines[i].end, keep, countGaps, numbases, Short);
313                         pDataArray.push_back(tempChop);
314                         
315                         //MyChopThreadFunction is in header. It must be global or static to work with the threads.
316                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
317                         hThreadArray[i] = CreateThread(NULL, 0, MyChopThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
318                 }
319                 
320         //do your part
321                 wroteAccnos = driver(lines[processors-1], filename, (outFasta + toString(processors-1) + ".temp"), (outAccnos + toString(processors-1) + ".temp"));
322         processIDS.push_back(processors-1);
323         
324                 //Wait until all threads have terminated.
325                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
326                 
327         if (wroteAccnos) { nonBlankAccnosFiles.push_back(outAccnos); }
328                 else { m->mothurRemove(outAccnos); } //remove so other files can be renamed to it
329
330                 //Close all thread handles and free memory allocations.
331                 for(int i=0; i < pDataArray.size(); i++){
332             if (pDataArray[i]->wroteAccnos) { wroteAccnos = pDataArray[i]->wroteAccnos; nonBlankAccnosFiles.push_back(outAccnos + toString(processIDS[i]) + ".temp");  }
333                         else { m->mothurRemove((outAccnos + toString(processIDS[i]) + ".temp"));  }
334                         CloseHandle(hThreadArray[i]);
335                         delete pDataArray[i];
336                 }
337 #endif          
338                 
339                 for (int i = 0; i < processIDS.size(); i++) {
340                         m->appendFiles((outFasta + toString(processIDS[i]) + ".temp"), outFasta);
341                         m->mothurRemove((outFasta + toString(processIDS[i]) + ".temp"));
342                 }
343                 
344         if (nonBlankAccnosFiles.size() != 0) { 
345                         m->renameFile(nonBlankAccnosFiles[0], outAccnos);
346                         
347                         for (int h=1; h < nonBlankAccnosFiles.size(); h++) {
348                                 m->appendFiles(nonBlankAccnosFiles[h], outAccnos);
349                                 m->mothurRemove(nonBlankAccnosFiles[h]);
350                         }
351                 }else { //recreate the accnosfile if needed
352                         ofstream out;
353                         m->openOutputFile(outAccnos, out);
354                         out.close();
355                 }
356
357                 return wroteAccnos;
358         }
359         catch(exception& e) {
360                 m->errorOut(e, "ChopSeqsCommand", "createProcesses");
361                 exit(1);
362         }
363 }
364 /**************************************************************************************/
365 bool ChopSeqsCommand::driver(linePair filePos, string filename, string outFasta, string outAccnos) {    
366         try {
367                 
368                 ofstream out;
369                 m->openOutputFile(outFasta, out);
370         
371         ofstream outAcc;
372                 m->openOutputFile(outAccnos, outAcc);
373         
374                 ifstream in;
375                 m->openInputFile(filename, in);
376         
377                 in.seekg(filePos.start);
378         
379                 bool done = false;
380         bool wroteAccnos = false;
381                 int count = 0;
382         
383                 while (!done) {
384             
385                         if (m->control_pressed) { in.close(); out.close(); return 1; }
386             
387                         Sequence seq(in); m->gobble(in);
388                         
389                         if (m->control_pressed) {  in.close(); out.close(); outAcc.close(); m->mothurRemove(outFasta); m->mothurRemove(outAccnos); return 0;  }
390                         
391                         if (seq.getName() != "") {
392                                 string newSeqString = getChopped(seq);
393                                 
394                                 //output trimmed sequence
395                                 if (newSeqString != "") {
396                                         out << ">" << seq.getName() << endl << newSeqString << endl;
397                                 }else{
398                                         outAcc << seq.getName() << endl;
399                                         wroteAccnos = true;
400                                 }
401                 count++;
402                         }
403                         
404 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
405             unsigned long long pos = in.tellg();
406             if ((pos == -1) || (pos >= filePos.end)) { break; }
407 #else
408             if (in.eof()) { break; }
409 #endif
410             //report progress
411                         if((count) % 1000 == 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
412                         
413                 }
414                 //report progress
415                 if((count) % 1000 != 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
416
417                 
418                 in.close();
419         out.close();
420         outAcc.close();
421                 
422                 return wroteAccnos;
423         }
424         catch(exception& e) {
425                 m->errorOut(e, "ChopSeqsCommand", "driver");
426                 exit(1);
427         }
428 }
429 //**********************************************************************************************************************
430 string ChopSeqsCommand::getChopped(Sequence seq) {
431         try {
432                 string temp = seq.getAligned();
433                 string tempUnaligned = seq.getUnaligned();
434                 
435                 if (countGaps) {
436                         //if needed trim sequence
437                         if (keep == "front") {//you want to keep the beginning
438                                 int tempLength = temp.length();
439
440                                 if (tempLength > numbases) { //you have enough bases to remove some
441                                 
442                                         int stopSpot = 0;
443                                         int numBasesCounted = 0;
444                                         
445                                         for (int i = 0; i < temp.length(); i++) {
446                                                 //eliminate N's
447                                                 if (toupper(temp[i]) == 'N') { temp[i] = '.'; }
448                                                 
449                                                 numBasesCounted++; 
450                                                 
451                                                 if (numBasesCounted >= numbases) { stopSpot = i; break; }
452                                         }
453                                         
454                                         if (stopSpot == 0) { temp = ""; }
455                                         else {  temp = temp.substr(0, stopSpot+1);  }
456                                                         
457                                 }else { 
458                                         if (!Short) { temp = ""; } //sequence too short
459                                 }
460                         }else { //you are keeping the back
461                                 int tempLength = temp.length();
462                                 if (tempLength > numbases) { //you have enough bases to remove some
463                                         
464                                         int stopSpot = 0;
465                                         int numBasesCounted = 0;
466                                         
467                                         for (int i = (temp.length()-1); i >= 0; i--) {
468                                                 //eliminate N's
469                                                 if (toupper(temp[i]) == 'N') { temp[i] = '.'; }
470                                                 
471                                                 numBasesCounted++; 
472
473                                                 if (numBasesCounted >= numbases) { stopSpot = i; break; }
474                                         }
475                                 
476                                         if (stopSpot == 0) { temp = ""; }
477                                         else {  temp = temp.substr(stopSpot+1);  }
478                                 }else { 
479                                         if (!Short) { temp = ""; } //sequence too short
480                                 }
481                         }
482
483                 }else{
484                                 
485                         //if needed trim sequence
486                         if (keep == "front") {//you want to keep the beginning
487                                 int tempLength = tempUnaligned.length();
488
489                                 if (tempLength > numbases) { //you have enough bases to remove some
490                                         
491                                         int stopSpot = 0;
492                                         int numBasesCounted = 0;
493                                         
494                                         for (int i = 0; i < temp.length(); i++) {
495                                                 //eliminate N's
496                                                 if (toupper(temp[i]) == 'N') { 
497                                                         temp[i] = '.'; 
498                                                         tempLength--;
499                                                         if (tempLength < numbases) { stopSpot = 0; break; }
500                                                 }
501                                                 
502                                                 if(isalpha(temp[i])) { numBasesCounted++; }
503                                                 
504                                                 if (numBasesCounted >= numbases) { stopSpot = i; break; }
505                                         }
506                                         
507                                         if (stopSpot == 0) { temp = ""; }
508                                         else {  temp = temp.substr(0, stopSpot+1);  }
509                                                         
510                                 }else { 
511                                         if (!Short) { temp = ""; } //sequence too short
512                                 }                               
513                         }else { //you are keeping the back
514                                 int tempLength = tempUnaligned.length();
515                                 if (tempLength > numbases) { //you have enough bases to remove some
516                                         
517                                         int stopSpot = 0;
518                                         int numBasesCounted = 0;
519                                         
520                                         for (int i = (temp.length()-1); i >= 0; i--) {
521                                                 //eliminate N's
522                                                 if (toupper(temp[i]) == 'N') { 
523                                                         temp[i] = '.'; 
524                                                         tempLength--;
525                                                         if (tempLength < numbases) { stopSpot = 0; break; }
526                                                 }
527                                                 
528                                                 if(isalpha(temp[i])) { numBasesCounted++; }
529
530                                                 if (numBasesCounted >= numbases) { stopSpot = i; break; }
531                                         }
532                                 
533                                         if (stopSpot == 0) { temp = ""; }
534                                         else {  temp = temp.substr(stopSpot);  }
535                                 }else { 
536                                         if (!Short) { temp = ""; } //sequence too short
537                                 }
538                         }
539                 }
540                 
541                 return temp;
542         }
543         catch(exception& e) {
544                 m->errorOut(e, "ChopSeqsCommand", "getChopped");
545                 exit(1);
546         }
547 }
548 //**********************************************************************************************************************
549
550