]> git.donarmstrong.com Git - mothur.git/blob - chimeracheckcommand.cpp
some changes while testing 1.9
[mothur.git] / chimeracheckcommand.cpp
1 /*
2  *  chimeracheckcommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 3/31/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "chimeracheckcommand.h"
11 #include "chimeracheckrdp.h"
12
13 //***************************************************************************************************************
14
15 ChimeraCheckCommand::ChimeraCheckCommand(string option)  {
16         try {
17                 abort = false;
18                 
19                 //allow user to run help
20                 if(option == "help") { help(); abort = true; }
21                 
22                 else {
23                         //valid paramters for this command
24                         string Array[] =  {"fasta","processors","increment","template","ksize","svg", "name","outputdir","inputdir" };
25                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
26                         
27                         OptionParser parser(option);
28                         map<string,string> parameters = parser.getParameters();
29                         
30                         ValidParameters validParameter;
31                         map<string,string>::iterator it;
32                         
33                         //check to make sure all parameters are valid for command
34                         for (it = parameters.begin(); it != parameters.end(); it++) { 
35                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
36                         }
37                         
38                         //if the user changes the input directory command factory will send this info to us in the output parameter 
39                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
40                         if (inputDir == "not found"){   inputDir = "";          }
41                         else {
42                                 string path;
43                                 it = parameters.find("fasta");
44                                 //user has given a template file
45                                 if(it != parameters.end()){ 
46                                         path = hasPath(it->second);
47                                         //if the user has not given a path then, add inputdir. else leave path alone.
48                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
49                                 }
50                                 
51                                 it = parameters.find("template");
52                                 //user has given a template file
53                                 if(it != parameters.end()){ 
54                                         path = hasPath(it->second);
55                                         //if the user has not given a path then, add inputdir. else leave path alone.
56                                         if (path == "") {       parameters["template"] = inputDir + it->second;         }
57                                 }
58                                 
59                                 it = parameters.find("name");
60                                 //user has given a template file
61                                 if(it != parameters.end()){ 
62                                         path = hasPath(it->second);
63                                         //if the user has not given a path then, add inputdir. else leave path alone.
64                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
65                                 }
66                         }
67
68                         
69                         //check for required parameters
70                         fastafile = validParameter.validFile(parameters, "fasta", true);
71                         if (fastafile == "not open") { abort = true; }
72                         else if (fastafile == "not found") { fastafile = ""; m->mothurOut("fasta is a required parameter for the chimera.check command."); m->mothurOutEndLine(); abort = true;  }      
73                         
74                         //if the user changes the output directory command factory will send this info to us in the output parameter 
75                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
76                                 outputDir = ""; 
77                                 outputDir += hasPath(fastafile); //if user entered a file with a path then preserve it  
78                         }
79
80                         templatefile = validParameter.validFile(parameters, "template", true);
81                         if (templatefile == "not open") { abort = true; }
82                         else if (templatefile == "not found") { templatefile = "";  m->mothurOut("template is a required parameter for the chimera.check command."); m->mothurOutEndLine(); abort = true;  }    
83                         
84                         namefile = validParameter.validFile(parameters, "name", true);
85                         if (namefile == "not open") { abort = true; }
86                         else if (namefile == "not found") { namefile = "";  }
87
88                         string temp = validParameter.validFile(parameters, "processors", false);                if (temp == "not found") { temp = "1"; }
89                         convert(temp, processors);
90                         
91                         temp = validParameter.validFile(parameters, "ksize", false);                    if (temp == "not found") { temp = "7"; }
92                         convert(temp, ksize);
93                         
94                         temp = validParameter.validFile(parameters, "svg", false);                              if (temp == "not found") { temp = "F"; }
95                         svg = isTrue(temp);
96                         
97                         temp = validParameter.validFile(parameters, "increment", false);                if (temp == "not found") { temp = "10"; }
98                         convert(temp, increment);                       
99                 }
100         }
101         catch(exception& e) {
102                 m->errorOut(e, "ChimeraCheckCommand", "ChimeraCheckCommand");
103                 exit(1);
104         }
105 }
106 //**********************************************************************************************************************
107
108 void ChimeraCheckCommand::help(){
109         try {
110         
111                 m->mothurOut("The chimera.check command reads a fastafile and templatefile and outputs potentially chimeric sequences.\n");
112                 m->mothurOut("This command was created using the algorythms described in CHIMERA_CHECK version 2.7 written by Niels Larsen. \n");
113                 m->mothurOut("The chimera.check command parameters are fasta, template, processors, ksize, increment, svg and name.\n");
114                 m->mothurOut("The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required. \n");
115                 m->mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences, and is required. \n");
116                 m->mothurOut("The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n");
117                 #ifdef USE_MPI
118                 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
119                 #endif
120                 m->mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences, default is 10.\n");
121                 m->mothurOut("The ksize parameter allows you to input kmersize, default is 7. \n");
122                 m->mothurOut("The svg parameter allows you to specify whether or not you would like a svg file outputted for each query sequence, default is False.\n");
123                 m->mothurOut("The name parameter allows you to enter a file containing names of sequences you would like .svg files for.\n");
124                 m->mothurOut("The chimera.check command should be in the following format: \n");
125                 m->mothurOut("chimera.check(fasta=yourFastaFile, template=yourTemplateFile, processors=yourProcessors, ksize=yourKmerSize) \n");
126                 m->mothurOut("Example: chimera.check(fasta=AD.fasta, template=core_set_aligned,imputed.fasta, processors=4, ksize=8) \n");
127                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");     
128         }
129         catch(exception& e) {
130                 m->errorOut(e, "ChimeraCheckCommand", "help");
131                 exit(1);
132         }
133 }
134
135 //***************************************************************************************************************
136
137 ChimeraCheckCommand::~ChimeraCheckCommand(){    /*      do nothing      */      }
138
139 //***************************************************************************************************************
140
141 int ChimeraCheckCommand::execute(){
142         try{
143                 
144                 if (abort == true) { return 0; }
145                 
146                 int start = time(NULL); 
147                 
148                 chimera = new ChimeraCheckRDP(fastafile, templatefile, namefile, svg, increment, ksize, outputDir);                     
149
150                 if (m->control_pressed) { delete chimera;       return 0;       }
151                 
152                 string outputFileName = outputDir + getRootName(getSimpleName(fastafile))  + "chimeracheck.chimeras";
153                 
154         #ifdef USE_MPI
155         
156                         int pid, end, numSeqsPerProcessor; 
157                         int tag = 2001;
158                         vector<long> MPIPos;
159                         
160                         MPI_Status status; 
161                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
162                         MPI_Comm_size(MPI_COMM_WORLD, &processors); 
163
164                         MPI_File inMPI;
165                         MPI_File outMPI;
166                                                 
167                         int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
168                         int inMode=MPI_MODE_RDONLY; 
169                                                         
170                         char outFilename[outputFileName.length()];
171                         strcpy(outFilename, outputFileName.c_str());
172                         
173                         char inFileName[fastafile.length()];
174                         strcpy(inFileName, fastafile.c_str());
175
176                         MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
177                         MPI_File_open(MPI_COMM_WORLD, outFilename, outMode, MPI_INFO_NULL, &outMPI);
178                         
179                         if (m->control_pressed) {  MPI_File_close(&inMPI);  MPI_File_close(&outMPI);  delete chimera; return 0;  }
180                         
181                         if (pid == 0) { //you are the root process 
182                                 MPIPos = setFilePosFasta(fastafile, numSeqs); //fills MPIPos, returns numSeqs
183                                 
184                                 //send file positions to all processes
185                                 MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs
186                                 MPI_Bcast(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos        
187                                 
188                                 //figure out how many sequences you have to align
189                                 numSeqsPerProcessor = numSeqs / processors;
190                                 if(pid == (processors - 1)){    numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor;      }
191                                 int startIndex =  pid * numSeqsPerProcessor;
192                         
193                                 //align your part
194                                 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, MPIPos);
195                                 
196                                 if (m->control_pressed) {  MPI_File_close(&inMPI);  MPI_File_close(&outMPI);  remove(outputFileName.c_str());  delete chimera; return 0;  }
197                                 
198                                 //wait on chidren
199                                 for(int i = 1; i < processors; i++) { 
200                                         char buf[4];
201                                         MPI_Recv(buf, 4, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
202                                 }
203                         }else{ //you are a child process
204                                 MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
205                                 MPIPos.resize(numSeqs+1);
206                                 MPI_Bcast(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
207                                 
208                                 //figure out how many sequences you have to align
209                                 numSeqsPerProcessor = numSeqs / processors;
210                                 if(pid == (processors - 1)){    numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor;      }
211                                 int startIndex =  pid * numSeqsPerProcessor;
212                                 
213                                 //align your part
214                                 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, MPIPos);
215                                 
216                                 if (m->control_pressed) {  MPI_File_close(&inMPI);  MPI_File_close(&outMPI);   delete chimera; return 0;  }
217                                 
218                                 //tell parent you are done.
219                                 char buf[4];
220                                 strcpy(buf, "done"); 
221                                 MPI_Send(buf, 4, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
222                         }
223                         
224                         //close files 
225                         MPI_File_close(&inMPI);
226                         MPI_File_close(&outMPI);
227         #else
228                 
229                 //break up file
230                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
231                         if(processors == 1){
232                                 ifstream inFASTA;
233                                 openInputFile(fastafile, inFASTA);
234                                 numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
235                                 inFASTA.close();
236                                 
237                                 lines.push_back(new linePair(0, numSeqs));
238                                 
239                                 driver(lines[0], outputFileName, fastafile);
240                                 
241                                 if (m->control_pressed) { 
242                                         remove(outputFileName.c_str()); 
243                                         for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
244                                         delete chimera;
245                                         return 0;
246                                 }
247                                                                 
248                         }else{
249                                 vector<int> positions;
250                                 processIDS.resize(0);
251                                 
252                                 ifstream inFASTA;
253                                 openInputFile(fastafile, inFASTA);
254                                 
255                                 string input;
256                                 while(!inFASTA.eof()){
257                                         input = getline(inFASTA);
258                                         if (input.length() != 0) {
259                                                 if(input[0] == '>'){    long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1);  }
260                                         }
261                                 }
262                                 inFASTA.close();
263                                 
264                                 numSeqs = positions.size();
265                                 
266                                 int numSeqsPerProcessor = numSeqs / processors;
267                                 
268                                 for (int i = 0; i < processors; i++) {
269                                         long int startPos = positions[ i * numSeqsPerProcessor ];
270                                         if(i == processors - 1){
271                                                 numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor;
272                                         }
273                                         lines.push_back(new linePair(startPos, numSeqsPerProcessor));
274                                 }
275                                 
276                                 
277                                 createProcesses(outputFileName, fastafile); 
278                         
279                                 rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str());
280                                         
281                                 //append output files
282                                 for(int i=1;i<processors;i++){
283                                         appendFiles((outputFileName + toString(processIDS[i]) + ".temp"), outputFileName);
284                                         remove((outputFileName + toString(processIDS[i]) + ".temp").c_str());
285                                 }
286                                 
287                                 if (m->control_pressed) { 
288                                         remove(outputFileName.c_str()); 
289                                         for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
290                                         delete chimera;
291                                         return 0;
292                                 }
293                         }
294
295                 #else
296                         ifstream inFASTA;
297                         openInputFile(fastafile, inFASTA);
298                         numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
299                         inFASTA.close();
300                         lines.push_back(new linePair(0, numSeqs));
301                         
302                         driver(lines[0], outputFileName, fastafile);
303                         
304                         if (m->control_pressed) { 
305                                         remove(outputFileName.c_str()); 
306                                         for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
307                                         delete chimera;
308                                         return 0;
309                         }
310                 #endif
311         #endif          
312                 delete chimera;
313                 for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
314                 
315                 m->mothurOutEndLine(); m->mothurOut("This method does not determine if a sequence is chimeric, but allows you to make that determination based on the IS values."); m->mothurOutEndLine(); 
316                 
317                 m->mothurOutEndLine();
318                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
319                 m->mothurOut(outputFileName); m->mothurOutEndLine();    
320                 m->mothurOutEndLine();
321                 m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); m->mothurOutEndLine();
322
323                 return 0;
324                 
325         }
326         catch(exception& e) {
327                 m->errorOut(e, "ChimeraCheckCommand", "execute");
328                 exit(1);
329         }
330 }
331 //**********************************************************************************************************************
332
333 int ChimeraCheckCommand::driver(linePair* line, string outputFName, string filename){
334         try {
335                 ofstream out;
336                 openOutputFile(outputFName, out);
337                 
338                 ofstream out2;
339                 
340                 ifstream inFASTA;
341                 openInputFile(filename, inFASTA);
342
343                 inFASTA.seekg(line->start);
344                 
345                 for(int i=0;i<line->numSeqs;i++){
346                 
347                         if (m->control_pressed) {       return 1;       }
348                 
349                         Sequence* candidateSeq = new Sequence(inFASTA);  gobble(inFASTA);
350                                 
351                         if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
352                                 //find chimeras
353                                 chimera->getChimeras(candidateSeq);
354                                 
355                                 if (m->control_pressed) {       delete candidateSeq; return 1;  }
356         
357                                 //print results
358                                 chimera->print(out, out2);
359                         }
360                         delete candidateSeq;
361                         
362                         //report progress
363                         if((i+1) % 100 == 0){   m->mothurOut("Processing sequence: " + toString(i+1)); m->mothurOutEndLine();           }
364                 }
365                 //report progress
366                 if((line->numSeqs) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(line->numSeqs)); m->mothurOutEndLine();         }
367                 
368                 out.close();
369                 inFASTA.close();
370                                 
371                 return 0;
372         }
373         catch(exception& e) {
374                 m->errorOut(e, "ChimeraCheckCommand", "driver");
375                 exit(1);
376         }
377 }
378 //**********************************************************************************************************************
379 #ifdef USE_MPI
380 int ChimeraCheckCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, vector<long>& MPIPos){
381         try {
382                 MPI_File outAccMPI;
383                 MPI_Status status; 
384                 int pid;
385                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
386                 
387                 for(int i=0;i<num;i++){
388                         
389                         if (m->control_pressed) { return 0; }
390                         
391                         //read next sequence
392                         int length = MPIPos[start+i+1] - MPIPos[start+i];
393         
394                         char buf4[length];
395                         MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
396                         
397                         string tempBuf = buf4;
398                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length);  }
399                         istringstream iss (tempBuf,istringstream::in);
400
401                         Sequence* candidateSeq = new Sequence(iss);  gobble(iss);
402                                 
403                         if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
404                                 //find chimeras
405                                 chimera->getChimeras(candidateSeq);
406                                         
407                                 //print results
408                                 chimera->print(outMPI, outAccMPI);
409                         }
410                         delete candidateSeq;
411                         
412                         //report progress
413                         if((i+1) % 100 == 0){  cout << "Processing sequence: " << (i+1) << endl;        m->mothurOutJustToLog("Processing sequence: " + toString(i+1) + "\n");          }
414                 }
415                 //report progress
416                 if(num % 100 != 0){             cout << "Processing sequence: " << num << endl; m->mothurOutJustToLog("Processing sequence: " + toString(num) + "\n");  }
417                 
418                 return 0;
419         }
420         catch(exception& e) {
421                 m->errorOut(e, "ChimeraCheckCommand", "driverMPI");
422                 exit(1);
423         }
424 }
425 #endif
426
427 /**************************************************************************************************/
428
429 int ChimeraCheckCommand::createProcesses(string outputFileName, string filename) {
430         try {
431 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
432                 int process = 0;
433                 //              processIDS.resize(0);
434                 
435                 //loop through and create all the processes you want
436                 while (process != processors) {
437                         int pid = fork();
438                         
439                         if (pid > 0) {
440                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
441                                 process++;
442                         }else if (pid == 0){
443                                 driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename);
444                                 exit(0);
445                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
446                 }
447                 
448                 //force parent to wait until all the processes are done
449                 for (int i=0;i<processors;i++) { 
450                         int temp = processIDS[i];
451                         wait(&temp);
452                 }
453                 
454                 return 0;
455 #endif          
456         }
457         catch(exception& e) {
458                 m->errorOut(e, "ChimeraCheckCommand", "createProcesses");
459                 exit(1);
460         }
461 }
462 /**************************************************************************************************/
463
464