]> git.donarmstrong.com Git - mothur.git/blob - chimeracheckcommand.cpp
fe919dea41de925a5d27227f3805de3e156f9128
[mothur.git] / chimeracheckcommand.cpp
1 /*
2  *  chimeracheckcommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 3/31/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "chimeracheckcommand.h"
11 #include "chimeracheckrdp.h"
12
13 //***************************************************************************************************************
14
15 ChimeraCheckCommand::ChimeraCheckCommand(string option)  {
16         try {
17                 abort = false;
18                 
19                 //allow user to run help
20                 if(option == "help") { help(); abort = true; }
21                 
22                 else {
23                         //valid paramters for this command
24                         string Array[] =  {"fasta","processors","increment","template","ksize","svg", "name","outputdir","inputdir" };
25                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
26                         
27                         OptionParser parser(option);
28                         map<string,string> parameters = parser.getParameters();
29                         
30                         ValidParameters validParameter;
31                         map<string,string>::iterator it;
32                         
33                         //check to make sure all parameters are valid for command
34                         for (it = parameters.begin(); it != parameters.end(); it++) { 
35                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
36                         }
37                         
38                         //if the user changes the input directory command factory will send this info to us in the output parameter 
39                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
40                         if (inputDir == "not found"){   inputDir = "";          }
41                         else {
42                                 string path;
43                                 it = parameters.find("fasta");
44                                 //user has given a template file
45                                 if(it != parameters.end()){ 
46                                         path = hasPath(it->second);
47                                         //if the user has not given a path then, add inputdir. else leave path alone.
48                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
49                                 }
50                                 
51                                 it = parameters.find("template");
52                                 //user has given a template file
53                                 if(it != parameters.end()){ 
54                                         path = hasPath(it->second);
55                                         //if the user has not given a path then, add inputdir. else leave path alone.
56                                         if (path == "") {       parameters["template"] = inputDir + it->second;         }
57                                 }
58                                 
59                                 it = parameters.find("name");
60                                 //user has given a template file
61                                 if(it != parameters.end()){ 
62                                         path = hasPath(it->second);
63                                         //if the user has not given a path then, add inputdir. else leave path alone.
64                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
65                                 }
66                         }
67
68                         
69                         //check for required parameters
70                         fastafile = validParameter.validFile(parameters, "fasta", true);
71                         if (fastafile == "not open") { abort = true; }
72                         else if (fastafile == "not found") { fastafile = ""; m->mothurOut("fasta is a required parameter for the chimera.check command."); m->mothurOutEndLine(); abort = true;  }      
73                         
74                         //if the user changes the output directory command factory will send this info to us in the output parameter 
75                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
76                                 outputDir = ""; 
77                                 outputDir += hasPath(fastafile); //if user entered a file with a path then preserve it  
78                         }
79
80                         templatefile = validParameter.validFile(parameters, "template", true);
81                         if (templatefile == "not open") { abort = true; }
82                         else if (templatefile == "not found") { templatefile = "";  m->mothurOut("template is a required parameter for the chimera.check command."); m->mothurOutEndLine(); abort = true;  }    
83                         
84                         namefile = validParameter.validFile(parameters, "name", true);
85                         if (namefile == "not open") { abort = true; }
86                         else if (namefile == "not found") { namefile = "";  }
87
88                         string temp = validParameter.validFile(parameters, "processors", false);                if (temp == "not found") { temp = "1"; }
89                         convert(temp, processors);
90                         
91                         temp = validParameter.validFile(parameters, "ksize", false);                    if (temp == "not found") { temp = "7"; }
92                         convert(temp, ksize);
93                         
94                         temp = validParameter.validFile(parameters, "svg", false);                              if (temp == "not found") { temp = "F"; }
95                         svg = isTrue(temp);
96                         
97                         temp = validParameter.validFile(parameters, "increment", false);                if (temp == "not found") { temp = "10"; }
98                         convert(temp, increment);                       
99                 }
100         }
101         catch(exception& e) {
102                 m->errorOut(e, "ChimeraCheckCommand", "ChimeraCheckCommand");
103                 exit(1);
104         }
105 }
106 //**********************************************************************************************************************
107
108 void ChimeraCheckCommand::help(){
109         try {
110         
111                 m->mothurOut("The chimera.check command reads a fastafile and templatefile and outputs potentially chimeric sequences.\n");
112                 m->mothurOut("This command was created using the algorythms described in CHIMERA_CHECK version 2.7 written by Niels Larsen. \n");
113                 m->mothurOut("The chimera.check command parameters are fasta, template, processors, ksize, increment, svg and name.\n");
114                 m->mothurOut("The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required. \n");
115                 m->mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences, and is required. \n");
116                 m->mothurOut("The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n");
117                 #ifdef USE_MPI
118                 m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
119                 #endif
120                 m->mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences, default is 10.\n");
121                 m->mothurOut("The ksize parameter allows you to input kmersize, default is 7. \n");
122                 m->mothurOut("The svg parameter allows you to specify whether or not you would like a svg file outputted for each query sequence, default is False.\n");
123                 m->mothurOut("The name parameter allows you to enter a file containing names of sequences you would like .svg files for.\n");
124                 m->mothurOut("The chimera.check command should be in the following format: \n");
125                 m->mothurOut("chimera.check(fasta=yourFastaFile, template=yourTemplateFile, processors=yourProcessors, ksize=yourKmerSize) \n");
126                 m->mothurOut("Example: chimera.check(fasta=AD.fasta, template=core_set_aligned,imputed.fasta, processors=4, ksize=8) \n");
127                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");     
128         }
129         catch(exception& e) {
130                 m->errorOut(e, "ChimeraCheckCommand", "help");
131                 exit(1);
132         }
133 }
134
135 //***************************************************************************************************************
136
137 ChimeraCheckCommand::~ChimeraCheckCommand(){    /*      do nothing      */      }
138
139 //***************************************************************************************************************
140
141 int ChimeraCheckCommand::execute(){
142         try{
143                 
144                 if (abort == true) { return 0; }
145                 
146                 int start = time(NULL); 
147                 
148                 chimera = new ChimeraCheckRDP(fastafile, templatefile, namefile, svg, increment, ksize, outputDir);                     
149
150                 if (m->control_pressed) { delete chimera;       return 0;       }
151                 
152                 string outputFileName = outputDir + getRootName(getSimpleName(fastafile))  + "chimeracheck.chimeras";
153                 
154         #ifdef USE_MPI
155         
156                         int pid, end, numSeqsPerProcessor; 
157                         int tag = 2001;
158                         vector<long> MPIPos;
159                         
160                         MPI_Status status; 
161                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
162                         MPI_Comm_size(MPI_COMM_WORLD, &processors); 
163
164                         MPI_File inMPI;
165                         MPI_File outMPI;
166                                                 
167                         int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
168                         int inMode=MPI_MODE_RDONLY; 
169                         
170                         char* outFilename = new char[outputFileName.length()];\r
171                         memcpy(outFilename, outputFileName.c_str(), outputFileName.length());
172
173                         char* inFileName = new char[fastafile.length()];\r
174                         memcpy(inFileName, fastafile.c_str(), fastafile.length());
175
176                         MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
177                         MPI_File_open(MPI_COMM_WORLD, outFilename, outMode, MPI_INFO_NULL, &outMPI);
178                         
179                         delete outFilename;
180                         delete inFileName;
181
182                         if (m->control_pressed) {  MPI_File_close(&inMPI);  MPI_File_close(&outMPI);  delete chimera; return 0;  }
183                         
184                         if (pid == 0) { //you are the root process 
185                                 MPIPos = setFilePosFasta(fastafile, numSeqs); //fills MPIPos, returns numSeqs
186                                 
187                                 //send file positions to all processes
188                                 MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs
189                                 MPI_Bcast(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos        
190                                 
191                                 //figure out how many sequences you have to align
192                                 numSeqsPerProcessor = numSeqs / processors;
193                                 if(pid == (processors - 1)){    numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor;      }
194                                 int startIndex =  pid * numSeqsPerProcessor;
195                         
196                                 //align your part
197                                 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, MPIPos);
198                                 
199                                 if (m->control_pressed) {  MPI_File_close(&inMPI);  MPI_File_close(&outMPI);  remove(outputFileName.c_str());  delete chimera; return 0;  }
200                                 
201                                 //wait on chidren
202                                 for(int i = 1; i < processors; i++) { 
203                                         char buf[4];
204                                         MPI_Recv(buf, 4, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); 
205                                 }
206                         }else{ //you are a child process
207                                 MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
208                                 MPIPos.resize(numSeqs+1);
209                                 MPI_Bcast(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
210                                 
211                                 //figure out how many sequences you have to align
212                                 numSeqsPerProcessor = numSeqs / processors;
213                                 if(pid == (processors - 1)){    numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor;      }
214                                 int startIndex =  pid * numSeqsPerProcessor;
215                                 
216                                 //align your part
217                                 driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, MPIPos);
218                                 
219                                 if (m->control_pressed) {  MPI_File_close(&inMPI);  MPI_File_close(&outMPI);   delete chimera; return 0;  }
220                                 
221                                 //tell parent you are done.
222                                 char buf[4];
223                                 strcpy(buf, "done"); 
224                                 MPI_Send(buf, 4, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
225                         }
226                         
227                         //close files 
228                         MPI_File_close(&inMPI);
229                         MPI_File_close(&outMPI);
230         #else
231                 
232                 //break up file
233                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
234                         if(processors == 1){
235                                 ifstream inFASTA;
236                                 openInputFile(fastafile, inFASTA);
237                                 numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
238                                 inFASTA.close();
239                                 
240                                 lines.push_back(new linePair(0, numSeqs));
241                                 
242                                 driver(lines[0], outputFileName, fastafile);
243                                 
244                                 if (m->control_pressed) { 
245                                         remove(outputFileName.c_str()); 
246                                         for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
247                                         delete chimera;
248                                         return 0;
249                                 }
250                                                                 
251                         }else{
252                                 vector<int> positions;
253                                 processIDS.resize(0);
254                                 
255                                 ifstream inFASTA;
256                                 openInputFile(fastafile, inFASTA);
257                                 
258                                 string input;
259                                 while(!inFASTA.eof()){
260                                         input = getline(inFASTA);
261                                         if (input.length() != 0) {
262                                                 if(input[0] == '>'){    long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1);  }
263                                         }
264                                 }
265                                 inFASTA.close();
266                                 
267                                 numSeqs = positions.size();
268                                 
269                                 int numSeqsPerProcessor = numSeqs / processors;
270                                 
271                                 for (int i = 0; i < processors; i++) {
272                                         long int startPos = positions[ i * numSeqsPerProcessor ];
273                                         if(i == processors - 1){
274                                                 numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor;
275                                         }
276                                         lines.push_back(new linePair(startPos, numSeqsPerProcessor));
277                                 }
278                                 
279                                 
280                                 createProcesses(outputFileName, fastafile); 
281                         
282                                 rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str());
283                                         
284                                 //append output files
285                                 for(int i=1;i<processors;i++){
286                                         appendFiles((outputFileName + toString(processIDS[i]) + ".temp"), outputFileName);
287                                         remove((outputFileName + toString(processIDS[i]) + ".temp").c_str());
288                                 }
289                                 
290                                 if (m->control_pressed) { 
291                                         remove(outputFileName.c_str()); 
292                                         for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
293                                         delete chimera;
294                                         return 0;
295                                 }
296                         }
297
298                 #else
299                         ifstream inFASTA;
300                         openInputFile(fastafile, inFASTA);
301                         numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
302                         inFASTA.close();
303                         lines.push_back(new linePair(0, numSeqs));
304                         
305                         driver(lines[0], outputFileName, fastafile);
306                         
307                         if (m->control_pressed) { 
308                                         remove(outputFileName.c_str()); 
309                                         for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
310                                         delete chimera;
311                                         return 0;
312                         }
313                 #endif
314         #endif          
315                 delete chimera;
316                 for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
317                 
318                 m->mothurOutEndLine(); m->mothurOut("This method does not determine if a sequence is chimeric, but allows you to make that determination based on the IS values."); m->mothurOutEndLine(); 
319                 
320                 m->mothurOutEndLine();
321                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
322                 m->mothurOut(outputFileName); m->mothurOutEndLine();    
323                 m->mothurOutEndLine();
324                 m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); m->mothurOutEndLine();
325
326                 return 0;
327                 
328         }
329         catch(exception& e) {
330                 m->errorOut(e, "ChimeraCheckCommand", "execute");
331                 exit(1);
332         }
333 }
334 //**********************************************************************************************************************
335
336 int ChimeraCheckCommand::driver(linePair* line, string outputFName, string filename){
337         try {
338                 ofstream out;
339                 openOutputFile(outputFName, out);
340                 
341                 ofstream out2;
342                 
343                 ifstream inFASTA;
344                 openInputFile(filename, inFASTA);
345
346                 inFASTA.seekg(line->start);
347                 
348                 for(int i=0;i<line->numSeqs;i++){
349                 
350                         if (m->control_pressed) {       return 1;       }
351                 
352                         Sequence* candidateSeq = new Sequence(inFASTA);  gobble(inFASTA);
353                                 
354                         if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
355                                 //find chimeras
356                                 chimera->getChimeras(candidateSeq);
357                                 
358                                 if (m->control_pressed) {       delete candidateSeq; return 1;  }
359         
360                                 //print results
361                                 chimera->print(out, out2);
362                         }
363                         delete candidateSeq;
364                         
365                         //report progress
366                         if((i+1) % 100 == 0){   m->mothurOut("Processing sequence: " + toString(i+1)); m->mothurOutEndLine();           }
367                 }
368                 //report progress
369                 if((line->numSeqs) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(line->numSeqs)); m->mothurOutEndLine();         }
370                 
371                 out.close();
372                 inFASTA.close();
373                                 
374                 return 0;
375         }
376         catch(exception& e) {
377                 m->errorOut(e, "ChimeraCheckCommand", "driver");
378                 exit(1);
379         }
380 }
381 //**********************************************************************************************************************
382 #ifdef USE_MPI
383 int ChimeraCheckCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, vector<long>& MPIPos){
384         try {
385                 MPI_File outAccMPI;
386                 MPI_Status status; 
387                 int pid;
388                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
389                 
390                 for(int i=0;i<num;i++){
391                         
392                         if (m->control_pressed) { return 0; }
393                         
394                         //read next sequence
395                         int length = MPIPos[start+i+1] - MPIPos[start+i];
396         
397                         char* buf4 = new char[length];
398                         MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
399                         
400                         string tempBuf = buf4;
401                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length);  }
402                         istringstream iss (tempBuf,istringstream::in);
403                         delete buf4;
404
405                         Sequence* candidateSeq = new Sequence(iss);  gobble(iss);
406                                 
407                         if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
408                                 //find chimeras
409                                 chimera->getChimeras(candidateSeq);
410                                         
411                                 //print results
412                                 chimera->print(outMPI, outAccMPI);
413                         }
414                         delete candidateSeq;
415                         
416                         //report progress
417                         if((i+1) % 100 == 0){  cout << "Processing sequence: " << (i+1) << endl;        m->mothurOutJustToLog("Processing sequence: " + toString(i+1) + "\n");          }
418                 }
419                 //report progress
420                 if(num % 100 != 0){             cout << "Processing sequence: " << num << endl; m->mothurOutJustToLog("Processing sequence: " + toString(num) + "\n");  }
421                 
422                 return 0;
423         }
424         catch(exception& e) {
425                 m->errorOut(e, "ChimeraCheckCommand", "driverMPI");
426                 exit(1);
427         }
428 }
429 #endif
430
431 /**************************************************************************************************/
432
433 int ChimeraCheckCommand::createProcesses(string outputFileName, string filename) {
434         try {
435 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
436                 int process = 0;
437                 //              processIDS.resize(0);
438                 
439                 //loop through and create all the processes you want
440                 while (process != processors) {
441                         int pid = fork();
442                         
443                         if (pid > 0) {
444                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
445                                 process++;
446                         }else if (pid == 0){
447                                 driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename);
448                                 exit(0);
449                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
450                 }
451                 
452                 //force parent to wait until all the processes are done
453                 for (int i=0;i<processors;i++) { 
454                         int temp = processIDS[i];
455                         wait(&temp);
456                 }
457                 
458                 return 0;
459 #endif          
460         }
461         catch(exception& e) {
462                 m->errorOut(e, "ChimeraCheckCommand", "createProcesses");
463                 exit(1);
464         }
465 }
466 /**************************************************************************************************/
467
468