]> git.donarmstrong.com Git - mothur.git/blob - classifyseqscommand.cpp
started work on classify.seqs command. changed the database class so that it does...
[mothur.git] / classifyseqscommand.cpp
1 /*
2  *  classifyseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 11/2/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "classifyseqscommand.h"
11 #include "sequence.hpp"
12 #include "phylotype.h"
13 #include "bayesian.h"
14 #include "doTaxonomy.h"
15 #include "knn.h"
16
17 //**********************************************************************************************************************
18
19 ClassifySeqsCommand::ClassifySeqsCommand(string option){
20         try {
21                 //              globaldata = GlobalData::getInstance();
22                 abort = false;
23                 
24                 //allow user to run help
25                 if(option == "help") { help(); abort = true; }
26                 
27                 else {
28                         
29                         //valid paramters for this command
30                         string AlignArray[] =  {"template","fasta","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted"};
31                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
32                         
33                         OptionParser parser(option);
34                         map<string, string> parameters = parser.getParameters(); 
35                         
36                         ValidParameters validParameter;
37                         
38                         //check to make sure all parameters are valid for command
39                         for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
40                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
41                         }
42                         
43                         //check for required parameters
44                         templateFileName = validParameter.validFile(parameters, "template", true);
45                         if (templateFileName == "not found") { 
46                                 mothurOut("template is a required parameter for the classify.seqs command."); 
47                                 mothurOutEndLine();
48                                 abort = true; 
49                         }
50                         else if (templateFileName == "not open") { abort = true; }      
51                         
52                         fastaFileName = validParameter.validFile(parameters, "fasta", true);
53                         if (fastaFileName == "not found") { 
54                                 mothurOut("fasta is a required parameter for the classify.seqs command."); 
55                                 mothurOutEndLine();
56                                 abort = true; 
57                         }
58                         else if (fastaFileName == "not open") { abort = true; } 
59                         
60                         taxonomyFileName = validParameter.validFile(parameters, "taxonomy", true);
61                         if (taxonomyFileName == "not found") { 
62                                 mothurOut("taxonomy is a required parameter for the classify.seqs command."); 
63                                 mothurOutEndLine();
64                                 abort = true; 
65                         }
66                         else if (taxonomyFileName == "not open") { abort = true; }      
67
68                         
69                         //check for optional parameter and set defaults
70                         // ...at some point should added some additional type checking...
71                         string temp;
72                         temp = validParameter.validFile(parameters, "ksize", false);            if (temp == "not found"){       temp = "8";                             }
73                         convert(temp, kmerSize); 
74                         
75                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
76                         convert(temp, processors); 
77                         
78                         search = validParameter.validFile(parameters, "search", false);         if (search == "not found"){     search = "kmer";                }
79                         
80                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found"){     method = "phylotype";   }
81                         
82                         temp = validParameter.validFile(parameters, "match", false);            if (temp == "not found"){       temp = "1.0";                   }
83                         convert(temp, match);  
84                         
85                         temp = validParameter.validFile(parameters, "mismatch", false);         if (temp == "not found"){       temp = "-1.0";                  }
86                         convert(temp, misMatch);  
87                         
88                         temp = validParameter.validFile(parameters, "gapopen", false);          if (temp == "not found"){       temp = "-2.0";                  }
89                         convert(temp, gapOpen);  
90                         
91                         temp = validParameter.validFile(parameters, "gapextend", false);        if (temp == "not found"){       temp = "-1.0";                  }
92                         convert(temp, gapExtend); 
93                         
94                         temp = validParameter.validFile(parameters, "numwanted", false);        if (temp == "not found"){       temp = "10";                    }
95                         convert(temp, numWanted);
96
97                 }
98                 
99         }
100         catch(exception& e) {
101                 errorOut(e, "ClassifySeqsCommand", "ClassifySeqsCommand");
102                 exit(1);
103         }
104 }
105
106 //**********************************************************************************************************************
107
108 ClassifySeqsCommand::~ClassifySeqsCommand(){    
109
110         if (abort == false) {
111                 for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
112         }
113 }
114
115 //**********************************************************************************************************************
116
117 void ClassifySeqsCommand::help(){
118         try {
119                 mothurOut("The classify.seqs command reads a fasta file containing sequences and creates a .taxonomy file and a .tax.summary file.\n");
120                 mothurOut("The classify.seqs command parameters are template, fasta, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend and numwanted.\n");
121                 mothurOut("The template, fasta and taxonomy parameters are required.\n");
122                 mothurOut("The search parameter allows you to specify the method to find most similar template.  Your options are: suffix, kmer and blast. The default is kmer.\n");
123                 mothurOut("The method parameter allows you to specify classification method to use.  Your options are: phylotype, bayesian and knn. The default is phylotype.\n");
124                 mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate.  The default is 8.\n");
125                 mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
126                 mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n");
127                 mothurOut("The mistmatch parameter allows you to specify the penalty for having different bases.  The default is -1.0.\n");
128                 mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -1.0.\n");
129                 mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment.  The default is -2.0.\n");
130                 mothurOut("The numwanted parameter allows you to specify the number of sequence matches you want with the knn method.  The default is 10.\n");
131                 mothurOut("The classify.seqs command should be in the following format: \n");
132                 mothurOut("classify.seqs(template=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n");
133                 mothurOut("Example classify.seqs(fasta=amazon.fasta, template=core.filtered, method=phylotype, search=gotoh, ksize=8, processors=2)\n");
134                 mothurOut("The .taxonomy file consists of 2 columns: 1 = your sequence name, 2 = the taxonomy for your sequence. \n");
135                 mothurOut("The .tax.summary is a summary of the different taxonomies represented in your fasta file. \n");
136                 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
137         }
138         catch(exception& e) {
139                 errorOut(e, "ClassifySeqsCommand", "help");
140                 exit(1);
141         }
142 }
143
144
145 //**********************************************************************************************************************
146
147 int ClassifySeqsCommand::execute(){
148         try {
149                 if (abort == true) {    return 0;       }
150                 
151                 int start = time(NULL);
152                 
153                 if(method == "phylotype"){      classify = new PhyloType(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch);                                    }
154                 //else if(method == "bayesian")         {       classify = new Bayesian(taxonomyFileName, templateFileName, search);            }
155                 else if(method == "knn")                        {       classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted);                               }
156                 else {
157                         mothurOut(search + " is not a valid method option. I will run the command using phylotype.");
158                         mothurOutEndLine();
159                         classify = new PhyloType(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch);
160                 }
161
162                 int numFastaSeqs = 0;
163                 
164                 string newTaxonomyFile = getRootName(fastaFileName) + "taxonomy";
165                 string taxSummary = getRootName(fastaFileName) + "tax.summary";
166                 
167 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
168                 if(processors == 1){
169                         ifstream inFASTA;
170                         openInputFile(fastaFileName, inFASTA);
171                         numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
172                         inFASTA.close();
173                         
174                         lines.push_back(new linePair(0, numFastaSeqs));
175                         
176                         driver(lines[0], newTaxonomyFile);
177                         
178                 }
179                 else{
180                         vector<int> positions;
181                         processIDS.resize(0);
182                         
183                         ifstream inFASTA;
184                         openInputFile(fastaFileName, inFASTA);
185                         
186                         string input;
187                         while(!inFASTA.eof()){
188                                 input = getline(inFASTA);
189                                 if (input.length() != 0) {
190                                         if(input[0] == '>'){    int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1);       }
191                                 }
192                         }
193                         inFASTA.close();
194                         
195                         numFastaSeqs = positions.size();
196                         
197                         int numSeqsPerProcessor = numFastaSeqs / processors;
198                         
199                         for (int i = 0; i < processors; i++) {
200                                 int startPos = positions[ i * numSeqsPerProcessor ];
201                                 if(i == processors - 1){
202                                         numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
203                                 }
204                                 lines.push_back(new linePair(startPos, numSeqsPerProcessor));
205                         }
206                         createProcesses(newTaxonomyFile); 
207                         
208                         rename((newTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), newTaxonomyFile.c_str());
209                         
210                         for(int i=1;i<processors;i++){
211                                 appendTaxFiles((newTaxonomyFile + toString(processIDS[i]) + ".temp"), newTaxonomyFile);
212                                 remove((newTaxonomyFile + toString(processIDS[i]) + ".temp").c_str());
213                         }
214                         
215                 }
216 #else
217                 ifstream inFASTA;
218                 openInputFile(candidateFileName, inFASTA);
219                 numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
220                 inFASTA.close();
221                 
222                 lines.push_back(new linePair(0, numFastaSeqs));
223                 
224                 driver(lines[0], newTaxonomyFile);
225 #endif
226                 
227                 delete classify;
228                 
229                 //make taxonomy tree from new taxonomy file 
230                 ifstream inTaxonomy;
231                 openInputFile(newTaxonomyFile, inTaxonomy);
232                 
233                 string accession, taxaList;
234                 PhyloTree taxaBrowser;
235                 
236                 //read in users taxonomy file and add sequences to tree
237                 while(!inTaxonomy.eof()){
238                         inTaxonomy >> accession >> taxaList;
239                         
240                         taxaBrowser.addSeqToTree(accession, taxaList);
241                         
242                         gobble(inTaxonomy);
243                 }
244                 inTaxonomy.close();
245                 
246                 taxaBrowser.assignHeirarchyIDs(0);
247                 
248                 ofstream outTaxTree;
249                 openOutputFile(taxSummary, outTaxTree);
250                 
251                 taxaBrowser.print(outTaxTree);
252                 
253                 mothurOutEndLine();
254                 mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences.");
255                 mothurOutEndLine();
256                 mothurOutEndLine();
257                 
258                 return 0;
259         }
260         catch(exception& e) {
261                 errorOut(e, "ClassifySeqsCommand", "execute");
262                 exit(1);
263         }
264 }
265 /**************************************************************************************************/
266
267 void ClassifySeqsCommand::createProcesses(string taxFileName) {
268         try {
269 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
270                 int process = 0;
271                 //              processIDS.resize(0);
272                 
273                 //loop through and create all the processes you want
274                 while (process != processors) {
275                         int pid = fork();
276                         
277                         if (pid > 0) {
278                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
279                                 process++;
280                         }else if (pid == 0){
281                                 driver(lines[process], taxFileName + toString(getpid()) + ".temp");
282                                 exit(0);
283                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
284                 }
285                 
286                 //force parent to wait until all the processes are done
287                 for (int i=0;i<processors;i++) { 
288                         int temp = processIDS[i];
289                         wait(&temp);
290                 }
291 #endif          
292         }
293         catch(exception& e) {
294                 errorOut(e, "ClassifySeqsCommand", "createProcesses");
295                 exit(1);
296         }
297 }
298 /**************************************************************************************************/
299
300 void ClassifySeqsCommand::appendTaxFiles(string temp, string filename) {
301         try{
302                 
303                 ofstream output;
304                 ifstream input;
305                 openOutputFileAppend(filename, output);
306                 openInputFile(temp, input);
307                 
308                 while(char c = input.get()){
309                         if(input.eof())         {       break;                  }
310                         else                            {       output << c;    }
311                 }
312                 
313                 input.close();
314                 output.close();
315         }
316         catch(exception& e) {
317                 errorOut(e, "ClassifySeqsCommand", "appendTaxFiles");
318                 exit(1);
319         }
320 }
321
322 //**********************************************************************************************************************
323
324 int ClassifySeqsCommand::driver(linePair* line, string taxFName){
325         try {
326                 ofstream outTax;
327                 openOutputFile(taxFName, outTax);
328         
329                 ifstream inFASTA;
330                 openInputFile(fastaFileName, inFASTA);
331
332                 inFASTA.seekg(line->start);
333                 
334                 string taxonomy;
335
336                 for(int i=0;i<line->numSeqs;i++){
337                         
338                         Sequence* candidateSeq = new Sequence(inFASTA);
339
340                         taxonomy = classify->getTaxonomy(candidateSeq);
341                         
342                         if (taxonomy != "bad seq") {
343                                 outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
344                         }
345                                                         
346                         delete candidateSeq;
347                 }
348                 
349                 inFASTA.close();
350                 outTax.close();
351                 
352                 return 1;
353         }
354         catch(exception& e) {
355                 errorOut(e, "ClassifySeqsCommand", "driver");
356                 exit(1);
357         }
358 }
359
360 /**************************************************************************************************/