]> git.donarmstrong.com Git - mothur.git/blob - chimeracheckcommand.cpp
4947dc97939b6b147718968bb70e544ca262cc00
[mothur.git] / chimeracheckcommand.cpp
1 /*
2  *  chimeracheckcommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 3/31/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "chimeracheckcommand.h"
11 #include "chimeracheckrdp.h"
12
13 //***************************************************************************************************************
14
15 ChimeraCheckCommand::ChimeraCheckCommand(string option)  {
16         try {
17                 abort = false;
18                 
19                 //allow user to run help
20                 if(option == "help") { help(); abort = true; }
21                 
22                 else {
23                         //valid paramters for this command
24                         string Array[] =  {"fasta","processors","increment","template","ksize","svg", "name","outputdir","inputdir" };
25                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
26                         
27                         OptionParser parser(option);
28                         map<string,string> parameters = parser.getParameters();
29                         
30                         ValidParameters validParameter;
31                         map<string,string>::iterator it;
32                         
33                         //check to make sure all parameters are valid for command
34                         for (it = parameters.begin(); it != parameters.end(); it++) { 
35                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
36                         }
37                         
38                         //if the user changes the input directory command factory will send this info to us in the output parameter 
39                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
40                         if (inputDir == "not found"){   inputDir = "";          }
41                         else {
42                                 string path;
43                                 it = parameters.find("fasta");
44                                 //user has given a template file
45                                 if(it != parameters.end()){ 
46                                         path = hasPath(it->second);
47                                         //if the user has not given a path then, add inputdir. else leave path alone.
48                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
49                                 }
50                                 
51                                 it = parameters.find("template");
52                                 //user has given a template file
53                                 if(it != parameters.end()){ 
54                                         path = hasPath(it->second);
55                                         //if the user has not given a path then, add inputdir. else leave path alone.
56                                         if (path == "") {       parameters["template"] = inputDir + it->second;         }
57                                 }
58                                 
59                                 it = parameters.find("name");
60                                 //user has given a template file
61                                 if(it != parameters.end()){ 
62                                         path = hasPath(it->second);
63                                         //if the user has not given a path then, add inputdir. else leave path alone.
64                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
65                                 }
66                         }
67
68                         
69                         //check for required parameters
70                         fastafile = validParameter.validFile(parameters, "fasta", true);
71                         if (fastafile == "not open") { abort = true; }
72                         else if (fastafile == "not found") { fastafile = ""; m->mothurOut("fasta is a required parameter for the chimera.check command."); m->mothurOutEndLine(); abort = true;  }      
73                         
74                         //if the user changes the output directory command factory will send this info to us in the output parameter 
75                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
76                                 outputDir = ""; 
77                                 outputDir += hasPath(fastafile); //if user entered a file with a path then preserve it  
78                         }
79
80                         templatefile = validParameter.validFile(parameters, "template", true);
81                         if (templatefile == "not open") { abort = true; }
82                         else if (templatefile == "not found") { templatefile = "";  m->mothurOut("template is a required parameter for the chimera.check command."); m->mothurOutEndLine(); abort = true;  }    
83                         
84                         namefile = validParameter.validFile(parameters, "name", true);
85                         if (namefile == "not open") { abort = true; }
86                         else if (namefile == "not found") { namefile = "";  }
87
88                         string temp = validParameter.validFile(parameters, "processors", false);                if (temp == "not found") { temp = "1"; }
89                         convert(temp, processors);
90                         
91                         temp = validParameter.validFile(parameters, "ksize", false);                    if (temp == "not found") { temp = "7"; }
92                         convert(temp, ksize);
93                         
94                         temp = validParameter.validFile(parameters, "svg", false);                              if (temp == "not found") { temp = "F"; }
95                         svg = isTrue(temp);
96                         
97                         temp = validParameter.validFile(parameters, "increment", false);                if (temp == "not found") { temp = "10"; }
98                         convert(temp, increment);                       
99                 }
100         }
101         catch(exception& e) {
102                 m->errorOut(e, "ChimeraCheckCommand", "ChimeraCheckCommand");
103                 exit(1);
104         }
105 }
106 //**********************************************************************************************************************
107
108 void ChimeraCheckCommand::help(){
109         try {
110         
111                 m->mothurOut("The chimera.check command reads a fastafile and templatefile and outputs potentially chimeric sequences.\n");
112                 m->mothurOut("This command was created using the algorythms described in CHIMERA_CHECK version 2.7 written by Niels Larsen. \n");
113                 m->mothurOut("The chimera.check command parameters are fasta, template, processors, ksize, increment, svg and name.\n");
114                 m->mothurOut("The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required. \n");
115                 m->mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences, and is required. \n");
116                 m->mothurOut("The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n");
117                 #ifdef USE_MPI
118                 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
119                 #endif
120                 m->mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences, default is 10.\n");
121                 m->mothurOut("The ksize parameter allows you to input kmersize, default is 7. \n");
122                 m->mothurOut("The svg parameter allows you to specify whether or not you would like a svg file outputted for each query sequence, default is False.\n");
123                 m->mothurOut("The name parameter allows you to enter a file containing names of sequences you would like .svg files for.\n");
124                 m->mothurOut("The chimera.check command should be in the following format: \n");
125                 m->mothurOut("chimera.check(fasta=yourFastaFile, template=yourTemplateFile, processors=yourProcessors, ksize=yourKmerSize) \n");
126                 m->mothurOut("Example: chimera.check(fasta=AD.fasta, template=core_set_aligned,imputed.fasta, processors=4, ksize=8) \n");
127                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");     
128         }
129         catch(exception& e) {
130                 m->errorOut(e, "ChimeraCheckCommand", "help");
131                 exit(1);
132         }
133 }
134
135 //***************************************************************************************************************
136
137 ChimeraCheckCommand::~ChimeraCheckCommand(){    /*      do nothing      */      }
138
139 //***************************************************************************************************************
140
141 int ChimeraCheckCommand::execute(){
142         try{
143                 
144                 if (abort == true) { return 0; }
145                 
146                 int start = time(NULL); 
147                 
148                 chimera = new ChimeraCheckRDP(fastafile, templatefile, namefile, svg, increment, ksize, outputDir);                     
149
150                 if (m->control_pressed) { delete chimera;       return 0;       }
151                 
152                 string outputFileName = outputDir + getRootName(getSimpleName(fastafile))  + "chimeracheck.chimeras";
153                 
154         #ifdef USE_MPI
155         
156                         int pid, end, numSeqsPerProcessor; 
157                         int tag = 2001;
158                         vector<long> MPIPos;
159                         
160                         MPI_Status status; 
161                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
162                         MPI_Comm_size(MPI_COMM_WORLD, &processors); 
163
164                         MPI_File inMPI;
165                         MPI_File outMPI;
166                                                 
167                         int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
168                         int inMode=MPI_MODE_RDONLY; 
169                         
170                         //char* outFilename = new char[outputFileName.length()];
171                         //memcpy(outFilename, outputFileName.c_str(), outputFileName.length());
172                         
173                         char outFilename[1024];
174                         strcpy(outFilename, outputFileName.c_str());
175
176                         //char* inFileName = new char[fastafile.length()];
177                         //memcpy(inFileName, fastafile.c_str(), fastafile.length());
178                         
179                         char inFileName[1024];
180                         strcpy(inFileName, fastafile.c_str());
181
182                         MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
183                         MPI_File_open(MPI_COMM_WORLD, outFilename, outMode, MPI_INFO_NULL, &outMPI);
184                         
185                         //delete outFilename;
186                         //delete inFileName;
187
188                         if (m->control_pressed) {  MPI_File_close(&inMPI);  MPI_File_close(&outMPI);  delete chimera; return 0;  }
189                         
190                         if (pid == 0) { //you are the root process 
191                                 MPIPos = setFilePosFasta(fastafile, numSeqs); //fills MPIPos, returns numSeqs
192                                 
193                                 //send file positions to all processes
194                                 for(int i = 1; i < processors; i++) { 
195                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
196                                         MPI_Send(&MPIPos[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
197                                 }       
198                                 
199                                 //figure out how many sequences you have to align
200                                 numSeqsPerProcessor = numSeqs / processors;
201                                 int startIndex =  pid * numSeqsPerProcessor;
202                                 if(pid == (processors - 1)){    numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor;      }
203                                 
204                         
205                                 //align your part
206                                 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, MPIPos);
207                                 
208                                 if (m->control_pressed) {  MPI_File_close(&inMPI);  MPI_File_close(&outMPI);  remove(outputFileName.c_str());  delete chimera; return 0;  }
209                                 
210                                 //wait on chidren
211                                 for(int i = 1; i < processors; i++) { 
212                                         char buf[4];
213                                         MPI_Recv(buf, 4, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
214                                 }
215                         }else{ //you are a child process
216                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
217                                 MPIPos.resize(numSeqs+1);
218                                 MPI_Recv(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
219                                 
220                                 //figure out how many sequences you have to align
221                                 numSeqsPerProcessor = numSeqs / processors;
222                                 int startIndex =  pid * numSeqsPerProcessor;
223                                 if(pid == (processors - 1)){    numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor;      }
224                                 
225                                 
226                                 //align your part
227                                 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, MPIPos);
228                                 
229                                 if (m->control_pressed) {  MPI_File_close(&inMPI);  MPI_File_close(&outMPI);   delete chimera; return 0;  }
230                                 
231                                 //tell parent you are done.
232                                 char buf[4];
233                                 strcpy(buf, "done"); 
234                                 MPI_Send(buf, 4, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
235                         }
236                         
237                         //close files 
238                         MPI_File_close(&inMPI);
239                         MPI_File_close(&outMPI);
240                         MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
241         #else
242                 
243                 //break up file
244                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
245                         if(processors == 1){
246                                 ifstream inFASTA;
247                                 openInputFile(fastafile, inFASTA);
248                                 getNumSeqs(inFASTA, numSeqs);
249                                 inFASTA.close();
250                                 
251                                 lines.push_back(new linePair(0, numSeqs));
252                                 
253                                 driver(lines[0], outputFileName, fastafile);
254                                 
255                                 if (m->control_pressed) { 
256                                         remove(outputFileName.c_str()); 
257                                         for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
258                                         delete chimera;
259                                         return 0;
260                                 }
261                                                                 
262                         }else{
263                                 vector<int> positions;
264                                 processIDS.resize(0);
265                                 
266                                 ifstream inFASTA;
267                                 openInputFile(fastafile, inFASTA);
268                                 
269                                 string input;
270                                 while(!inFASTA.eof()){
271                                         input = getline(inFASTA);
272                                         if (input.length() != 0) {
273                                                 if(input[0] == '>'){    long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1);  }
274                                         }
275                                 }
276                                 inFASTA.close();
277                                 
278                                 numSeqs = positions.size();
279                                 
280                                 int numSeqsPerProcessor = numSeqs / processors;
281                                 
282                                 for (int i = 0; i < processors; i++) {
283                                         long int startPos = positions[ i * numSeqsPerProcessor ];
284                                         if(i == processors - 1){
285                                                 numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor;
286                                         }
287                                         lines.push_back(new linePair(startPos, numSeqsPerProcessor));
288                                 }
289                                 
290                                 
291                                 createProcesses(outputFileName, fastafile); 
292                         
293                                 rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str());
294                                         
295                                 //append output files
296                                 for(int i=1;i<processors;i++){
297                                         appendFiles((outputFileName + toString(processIDS[i]) + ".temp"), outputFileName);
298                                         remove((outputFileName + toString(processIDS[i]) + ".temp").c_str());
299                                 }
300                                 
301                                 if (m->control_pressed) { 
302                                         remove(outputFileName.c_str()); 
303                                         for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
304                                         delete chimera;
305                                         return 0;
306                                 }
307                         }
308
309                 #else
310                         ifstream inFASTA;
311                         openInputFile(fastafile, inFASTA);
312                         getNumSeqs(inFASTA, numSeqs);
313                         inFASTA.close();
314                         lines.push_back(new linePair(0, numSeqs));
315                         
316                         driver(lines[0], outputFileName, fastafile);
317                         
318                         if (m->control_pressed) { 
319                                         remove(outputFileName.c_str()); 
320                                         for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
321                                         delete chimera;
322                                         return 0;
323                         }
324                 #endif
325         #endif          
326                 delete chimera;
327                 for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
328                 
329                 m->mothurOutEndLine(); m->mothurOut("This method does not determine if a sequence is chimeric, but allows you to make that determination based on the IS values."); m->mothurOutEndLine(); 
330                 
331                 m->mothurOutEndLine();
332                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
333                 m->mothurOut(outputFileName); m->mothurOutEndLine();    
334                 m->mothurOutEndLine();
335                 m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); m->mothurOutEndLine();
336
337                 return 0;
338                 
339         }
340         catch(exception& e) {
341                 m->errorOut(e, "ChimeraCheckCommand", "execute");
342                 exit(1);
343         }
344 }
345 //**********************************************************************************************************************
346
347 int ChimeraCheckCommand::driver(linePair* line, string outputFName, string filename){
348         try {
349                 ofstream out;
350                 openOutputFile(outputFName, out);
351                 
352                 ofstream out2;
353                 
354                 ifstream inFASTA;
355                 openInputFile(filename, inFASTA);
356
357                 inFASTA.seekg(line->start);
358                 
359                 for(int i=0;i<line->numSeqs;i++){
360                 
361                         if (m->control_pressed) {       return 1;       }
362                 
363                         Sequence* candidateSeq = new Sequence(inFASTA);  gobble(inFASTA);
364                                 
365                         if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
366                                 //find chimeras
367                                 chimera->getChimeras(candidateSeq);
368                                 
369                                 if (m->control_pressed) {       delete candidateSeq; return 1;  }
370         
371                                 //print results
372                                 chimera->print(out, out2);
373                         }
374                         delete candidateSeq;
375                         
376                         //report progress
377                         if((i+1) % 100 == 0){   m->mothurOut("Processing sequence: " + toString(i+1)); m->mothurOutEndLine();           }
378                 }
379                 //report progress
380                 if((line->numSeqs) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(line->numSeqs)); m->mothurOutEndLine();         }
381                 
382                 out.close();
383                 inFASTA.close();
384                                 
385                 return 0;
386         }
387         catch(exception& e) {
388                 m->errorOut(e, "ChimeraCheckCommand", "driver");
389                 exit(1);
390         }
391 }
392 //**********************************************************************************************************************
393 #ifdef USE_MPI
394 int ChimeraCheckCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, vector<long>& MPIPos){
395         try {
396                 MPI_File outAccMPI;
397                 MPI_Status status; 
398                 int pid;
399                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
400                 
401                 for(int i=0;i<num;i++){
402                         
403                         if (m->control_pressed) { return 0; }
404                         
405                         //read next sequence
406                         int length = MPIPos[start+i+1] - MPIPos[start+i];
407         
408                         char* buf4 = new char[length];
409                         MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
410                         
411                         string tempBuf = buf4;
412                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length);  }
413                         istringstream iss (tempBuf,istringstream::in);
414                         delete buf4;
415
416                         Sequence* candidateSeq = new Sequence(iss);  gobble(iss);
417                                 
418                         if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
419                                 //find chimeras
420                                 chimera->getChimeras(candidateSeq);
421                                         
422                                 //print results
423                                 chimera->print(outMPI, outAccMPI);
424                         }
425                         delete candidateSeq;
426                         
427                         //report progress
428                         if((i+1) % 100 == 0){  cout << "Processing sequence: " << (i+1) << endl;        m->mothurOutJustToLog("Processing sequence: " + toString(i+1) + "\n");          }
429                 }
430                 //report progress
431                 if(num % 100 != 0){             cout << "Processing sequence: " << num << endl; m->mothurOutJustToLog("Processing sequence: " + toString(num) + "\n");  }
432                 
433                 return 0;
434         }
435         catch(exception& e) {
436                 m->errorOut(e, "ChimeraCheckCommand", "driverMPI");
437                 exit(1);
438         }
439 }
440 #endif
441
442 /**************************************************************************************************/
443
444 int ChimeraCheckCommand::createProcesses(string outputFileName, string filename) {
445         try {
446 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
447                 int process = 0;
448                 //              processIDS.resize(0);
449                 
450                 //loop through and create all the processes you want
451                 while (process != processors) {
452                         int pid = fork();
453                         
454                         if (pid > 0) {
455                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
456                                 process++;
457                         }else if (pid == 0){
458                                 driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename);
459                                 exit(0);
460                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
461                 }
462                 
463                 //force parent to wait until all the processes are done
464                 for (int i=0;i<processors;i++) { 
465                         int temp = processIDS[i];
466                         wait(&temp);
467                 }
468                 
469                 return 0;
470 #endif          
471         }
472         catch(exception& e) {
473                 m->errorOut(e, "ChimeraCheckCommand", "createProcesses");
474                 exit(1);
475         }
476 }
477 /**************************************************************************************************/
478
479