]> git.donarmstrong.com Git - mothur.git/blob - classifyseqscommand.cpp
finished work on classify.seqs bayesian method and various bug fixes
[mothur.git] / classifyseqscommand.cpp
1 /*
2  *  classifyseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 11/2/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "classifyseqscommand.h"
11 #include "sequence.hpp"
12 #include "bayesian.h"
13 #include "doTaxonomy.h"
14 #include "knn.h"
15
16 //**********************************************************************************************************************
17
18 ClassifySeqsCommand::ClassifySeqsCommand(string option){
19         try {
20                 //              globaldata = GlobalData::getInstance();
21                 abort = false;
22                 
23                 //allow user to run help
24                 if(option == "help") { help(); abort = true; }
25                 
26                 else {
27                         
28                         //valid paramters for this command
29                         string AlignArray[] =  {"template","fasta","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff"};
30                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
31                         
32                         OptionParser parser(option);
33                         map<string, string> parameters = parser.getParameters(); 
34                         
35                         ValidParameters validParameter;
36                         
37                         //check to make sure all parameters are valid for command
38                         for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
39                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
40                         }
41                         
42                         //check for required parameters
43                         templateFileName = validParameter.validFile(parameters, "template", true);
44                         if (templateFileName == "not found") { 
45                                 mothurOut("template is a required parameter for the classify.seqs command."); 
46                                 mothurOutEndLine();
47                                 abort = true; 
48                         }
49                         else if (templateFileName == "not open") { abort = true; }      
50                         
51                         fastaFileName = validParameter.validFile(parameters, "fasta", true);
52                         if (fastaFileName == "not found") { 
53                                 mothurOut("fasta is a required parameter for the classify.seqs command."); 
54                                 mothurOutEndLine();
55                                 abort = true; 
56                         }
57                         else if (fastaFileName == "not open") { abort = true; } 
58                         
59                         taxonomyFileName = validParameter.validFile(parameters, "taxonomy", true);
60                         if (taxonomyFileName == "not found") { 
61                                 mothurOut("taxonomy is a required parameter for the classify.seqs command."); 
62                                 mothurOutEndLine();
63                                 abort = true; 
64                         }
65                         else if (taxonomyFileName == "not open") { abort = true; }      
66
67                         
68                         //check for optional parameter and set defaults
69                         // ...at some point should added some additional type checking...
70                         string temp;
71                         temp = validParameter.validFile(parameters, "ksize", false);            if (temp == "not found"){       temp = "8";                             }
72                         convert(temp, kmerSize); 
73                         
74                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
75                         convert(temp, processors); 
76                         
77                         search = validParameter.validFile(parameters, "search", false);         if (search == "not found"){     search = "kmer";                }
78                         
79                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found"){     method = "bayesian";    }
80                         
81                         temp = validParameter.validFile(parameters, "match", false);            if (temp == "not found"){       temp = "1.0";                   }
82                         convert(temp, match);  
83                         
84                         temp = validParameter.validFile(parameters, "mismatch", false);         if (temp == "not found"){       temp = "-1.0";                  }
85                         convert(temp, misMatch);  
86                         
87                         temp = validParameter.validFile(parameters, "gapopen", false);          if (temp == "not found"){       temp = "-2.0";                  }
88                         convert(temp, gapOpen);  
89                         
90                         temp = validParameter.validFile(parameters, "gapextend", false);        if (temp == "not found"){       temp = "-1.0";                  }
91                         convert(temp, gapExtend); 
92                         
93                         temp = validParameter.validFile(parameters, "numwanted", false);        if (temp == "not found"){       temp = "10";                    }
94                         convert(temp, numWanted);
95                         
96                         temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found"){       temp = "0";                             }
97                         convert(temp, cutoff);
98
99                         
100                         if ((method == "bayesian") && (search != "kmer"))  { 
101                                 mothurOut("The bayesian method requires the kmer search." + search + "will be disregarded." ); mothurOutEndLine();
102                                 search = "kmer";
103                         }
104                 }
105                 
106         }
107         catch(exception& e) {
108                 errorOut(e, "ClassifySeqsCommand", "ClassifySeqsCommand");
109                 exit(1);
110         }
111 }
112
113 //**********************************************************************************************************************
114
115 ClassifySeqsCommand::~ClassifySeqsCommand(){    
116
117         if (abort == false) {
118                 for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
119         }
120 }
121
122 //**********************************************************************************************************************
123
124 void ClassifySeqsCommand::help(){
125         try {
126                 mothurOut("The classify.seqs command reads a fasta file containing sequences and creates a .taxonomy file and a .tax.summary file.\n");
127                 mothurOut("The classify.seqs command parameters are template, fasta, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend and numwanted.\n");
128                 mothurOut("The template, fasta and taxonomy parameters are required.\n");
129                 mothurOut("The search parameter allows you to specify the method to find most similar template.  Your options are: suffix, kmer and blast. The default is kmer.\n");
130                 mothurOut("The method parameter allows you to specify classification method to use.  Your options are: bayesian and knn. The default is bayesian.\n");
131                 mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate.  The default is 8.\n");
132                 mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
133                 mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n");
134                 mothurOut("The mistmatch parameter allows you to specify the penalty for having different bases.  The default is -1.0.\n");
135                 mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -1.0.\n");
136                 mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment.  The default is -2.0.\n");
137                 mothurOut("The numwanted parameter allows you to specify the number of sequence matches you want with the knn method.  The default is 10.\n");
138                 mothurOut("The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy.  The default is 0.\n");
139                 mothurOut("The classify.seqs command should be in the following format: \n");
140                 mothurOut("classify.seqs(template=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n");
141                 mothurOut("Example classify.seqs(fasta=amazon.fasta, template=core.filtered, method=knn, search=gotoh, ksize=8, processors=2)\n");
142                 mothurOut("The .taxonomy file consists of 2 columns: 1 = your sequence name, 2 = the taxonomy for your sequence. \n");
143                 mothurOut("The .tax.summary is a summary of the different taxonomies represented in your fasta file. \n");
144                 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
145         }
146         catch(exception& e) {
147                 errorOut(e, "ClassifySeqsCommand", "help");
148                 exit(1);
149         }
150 }
151
152
153 //**********************************************************************************************************************
154
155 int ClassifySeqsCommand::execute(){
156         try {
157                 if (abort == true) {    return 0;       }
158                 
159                 
160                 
161                 if(method == "bayesian")                        {       classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff);          }
162                 else if(method == "knn")                        {       classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted);                               }
163                 else {
164                         mothurOut(search + " is not a valid method option. I will run the command using bayesian.");
165                         mothurOutEndLine();
166                         classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff);  
167                 }
168
169                 int numFastaSeqs = 0;
170                 
171                 string newTaxonomyFile = getRootName(fastaFileName) + "taxonomy";
172                 string tempTaxonomyFile = getRootName(fastaFileName) + "taxonomy.temp";
173                 string taxSummary = getRootName(fastaFileName) + "tax.summary";
174                 
175                 int start = time(NULL);
176 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
177                 if(processors == 1){
178                         ifstream inFASTA;
179                         openInputFile(fastaFileName, inFASTA);
180                         numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
181                         inFASTA.close();
182                         
183                         lines.push_back(new linePair(0, numFastaSeqs));
184                         
185                         driver(lines[0], newTaxonomyFile, tempTaxonomyFile);
186                         
187                 }
188                 else{
189                         vector<int> positions;
190                         processIDS.resize(0);
191                         
192                         ifstream inFASTA;
193                         openInputFile(fastaFileName, inFASTA);
194                         
195                         string input;
196                         while(!inFASTA.eof()){
197                                 input = getline(inFASTA);
198                                 if (input.length() != 0) {
199                                         if(input[0] == '>'){    int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1);       }
200                                 }
201                         }
202                         inFASTA.close();
203                         
204                         numFastaSeqs = positions.size();
205                         
206                         int numSeqsPerProcessor = numFastaSeqs / processors;
207                         
208                         for (int i = 0; i < processors; i++) {
209                                 int startPos = positions[ i * numSeqsPerProcessor ];
210                                 if(i == processors - 1){
211                                         numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
212                                 }
213                                 lines.push_back(new linePair(startPos, numSeqsPerProcessor));
214                         }
215                         createProcesses(newTaxonomyFile, tempTaxonomyFile); 
216                         
217                         rename((newTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), newTaxonomyFile.c_str());
218                         rename((tempTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), tempTaxonomyFile.c_str());
219                         
220                         for(int i=1;i<processors;i++){
221                                 appendTaxFiles((newTaxonomyFile + toString(processIDS[i]) + ".temp"), newTaxonomyFile);
222                                 appendTaxFiles((tempTaxonomyFile + toString(processIDS[i]) + ".temp"), tempTaxonomyFile);
223                                 remove((newTaxonomyFile + toString(processIDS[i]) + ".temp").c_str());
224                                 remove((tempTaxonomyFile + toString(processIDS[i]) + ".temp").c_str());
225                         }
226                         
227                 }
228 #else
229                 ifstream inFASTA;
230                 openInputFile(candidateFileName, inFASTA);
231                 numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
232                 inFASTA.close();
233                 
234                 lines.push_back(new linePair(0, numFastaSeqs));
235                 
236                 driver(lines[0], newTaxonomyFile, tempTaxonomyFile);
237 #endif
238                 
239                 delete classify;
240                 
241                 //make taxonomy tree from new taxonomy file 
242                 ifstream inTaxonomy;
243                 openInputFile(tempTaxonomyFile, inTaxonomy);
244                 
245                 string accession, taxaList;
246                 PhyloTree taxaBrowser;
247                 
248                 //read in users taxonomy file and add sequences to tree
249                 while(!inTaxonomy.eof()){
250                         inTaxonomy >> accession >> taxaList;
251                         
252                         taxaBrowser.addSeqToTree(accession, taxaList);
253                         
254                         gobble(inTaxonomy);
255                 }
256                 inTaxonomy.close();
257                 remove(tempTaxonomyFile.c_str());
258                 
259                 taxaBrowser.assignHeirarchyIDs(0);
260                 
261                 ofstream outTaxTree;
262                 openOutputFile(taxSummary, outTaxTree);
263                 
264                 taxaBrowser.print(outTaxTree);
265                 
266                 mothurOutEndLine();
267                 mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences.");
268                 mothurOutEndLine();
269                 mothurOutEndLine();
270                 
271                 return 0;
272         }
273         catch(exception& e) {
274                 errorOut(e, "ClassifySeqsCommand", "execute");
275                 exit(1);
276         }
277 }
278 /**************************************************************************************************/
279
280 void ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile) {
281         try {
282 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
283                 int process = 0;
284                 //              processIDS.resize(0);
285                 
286                 //loop through and create all the processes you want
287                 while (process != processors) {
288                         int pid = fork();
289                         
290                         if (pid > 0) {
291                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
292                                 process++;
293                         }else if (pid == 0){
294                                 driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp");
295                                 exit(0);
296                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
297                 }
298                 
299                 //force parent to wait until all the processes are done
300                 for (int i=0;i<processors;i++) { 
301                         int temp = processIDS[i];
302                         wait(&temp);
303                 }
304 #endif          
305         }
306         catch(exception& e) {
307                 errorOut(e, "ClassifySeqsCommand", "createProcesses");
308                 exit(1);
309         }
310 }
311 /**************************************************************************************************/
312
313 void ClassifySeqsCommand::appendTaxFiles(string temp, string filename) {
314         try{
315                 
316                 ofstream output;
317                 ifstream input;
318                 openOutputFileAppend(filename, output);
319                 openInputFile(temp, input);
320                 
321                 while(char c = input.get()){
322                         if(input.eof())         {       break;                  }
323                         else                            {       output << c;    }
324                 }
325                 
326                 input.close();
327                 output.close();
328         }
329         catch(exception& e) {
330                 errorOut(e, "ClassifySeqsCommand", "appendTaxFiles");
331                 exit(1);
332         }
333 }
334
335 //**********************************************************************************************************************
336
337 int ClassifySeqsCommand::driver(linePair* line, string taxFName, string tempTFName){
338         try {
339                 ofstream outTax;
340                 openOutputFile(taxFName, outTax);
341                 
342                 ofstream outTaxSimple;
343                 openOutputFile(tempTFName, outTaxSimple);
344         
345                 ifstream inFASTA;
346                 openInputFile(fastaFileName, inFASTA);
347
348                 inFASTA.seekg(line->start);
349                 
350                 string taxonomy;
351
352                 for(int i=0;i<line->numSeqs;i++){
353                         
354                         Sequence* candidateSeq = new Sequence(inFASTA);
355
356                         taxonomy = classify->getTaxonomy(candidateSeq);
357                         
358                         if (taxonomy != "bad seq") {
359                                 //if (method != "bayesian") {
360                                         outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
361                                         outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
362                                 //}else{
363                                         //vector<string> pTax = classify->parseTax(taxonomy);
364                                         //map<string, int> confidence = classify->getConfidenceScores();
365                                         
366                                         //outTax << candidateSeq->getName() << '\t';
367                                         //for (int j = 0; j < pTax.size(); j++) {
368                                                 //if (confidence[pTax[j]] > cutoff) {
369                                                 //      outTax << pTax[j] << "(" << confidence[pTax[j]] << ");";
370                                                 //}else{ break; }
371                                         //}
372                                         //outTax << endl;
373                                 //}
374                         }
375                                                         
376                         delete candidateSeq;
377                         
378                         if(i % 100 == 0){
379                                 mothurOut("Classifying sequence " + toString(i)); mothurOutEndLine();
380                         }
381                 }
382                 
383                 inFASTA.close();
384                 outTax.close();
385                 outTaxSimple.close();
386                 
387                 return 1;
388         }
389         catch(exception& e) {
390                 errorOut(e, "ClassifySeqsCommand", "driver");
391                 exit(1);
392         }
393 }
394
395 /**************************************************************************************************/