]> git.donarmstrong.com Git - mothur.git/blob - classifyseqscommand.h
parallelized chimera.uchime for windows for both by group and with a template.
[mothur.git] / classifyseqscommand.h
1 #ifndef CLASSIFYSEQSCOMMAND_H
2 #define CLASSIFYSEQSCOMMAND_H
3
4 /*
5  *  classifyseqscommand.h
6  *  Mothur
7  *
8  *  Created by westcott on 11/2/09.
9  *  Copyright 2009 Schloss Lab. All rights reserved.
10  *
11  */
12
13 #include "mothur.h"
14 #include "command.hpp"
15 #include "classify.h"
16 #include "referencedb.h"
17 #include "sequence.hpp"
18 #include "bayesian.h"
19 #include "phylotree.h"
20 #include "phylosummary.h"
21 #include "knn.h"
22
23
24 //KNN and Bayesian methods modeled from algorithms in
25 //Naive Bayesian Classifier for Rapid Assignment of rRNA Sequences 
26 //into the New Bacterial Taxonomy 
27 //Qiong Wang,1 George M. Garrity,1,2 James M. Tiedje,1,2 and James R. Cole1* 
28 //Center for Microbial Ecology1 and Department of Microbiology and Molecular Genetics,2 Michigan State University, 
29 //East Lansing, Michigan 48824 
30 //Received 10 January 2007/Accepted 18 June 2007 
31
32
33
34 class ClassifySeqsCommand : public Command {
35         
36 public:
37         ClassifySeqsCommand(string);
38         ClassifySeqsCommand();
39         ~ClassifySeqsCommand();
40         
41         vector<string> setParameters();
42         string getCommandName()                 { return "classify.seqs";               }
43         string getCommandCategory()             { return "Phylotype Analysis";  }
44         string getHelpString(); 
45         string getCitation() { return "Wang Q, Garrity GM, Tiedje JM, Cole JR (2007). Naive Bayesian classifier for rapid assignment of rRNA sequences into the new bacterial taxonomy. Appl Environ Microbiol 73: 5261-7. [ for Bayesian classifier ] \nAltschul SF, Madden TL, Schaffer AA, Zhang J, Zhang Z, Miller W, Lipman DJ (1997). Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. Nucleic Acids Res 25: 3389-402. [ for BLAST ] \nDeSantis TZ, Hugenholtz P, Larsen N, Rojas M, Brodie EL, Keller K, Huber T, Dalevi D, Hu P, Andersen GL (2006). Greengenes, a chimera-checked 16S rRNA gene database and workbench compatible with ARB. Appl Environ Microbiol 72: 5069-72. [ for kmer ] \nhttp://www.mothur.org/wiki/Classify.seqs"; }
46         string getDescription()         { return "classify sequences"; }
47         
48         int execute(); 
49         void help() { m->mothurOut(getHelpString()); }  
50         
51         
52         
53 private:
54         struct linePair {
55                 unsigned long long start;
56                 unsigned long long end;
57                 linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
58         };
59
60         vector<int> processIDS;   //processid
61         vector<linePair*> lines;
62         vector<string> fastaFileNames;
63         vector<string> namefileNames;
64         vector<string> groupfileNames;
65         vector<string> outputNames;
66         map<string, vector<string> > nameMap;
67         map<string,  vector<string> >::iterator itNames;
68         
69         Classify* classify;
70         ReferenceDB* rdb;
71         
72         string fastaFileName, templateFileName, distanceFileName, namefile, search, method, taxonomyFileName, outputDir, groupfile;
73         int processors, kmerSize, numWanted, cutoff, iters;
74         float match, misMatch, gapOpen, gapExtend;
75         bool abort, probs, save;
76         
77         int driver(linePair*, string, string, string);
78         void appendTaxFiles(string, string);
79         int createProcesses(string, string, string); 
80         string addUnclassifieds(string, int);
81         
82         int MPIReadNamesFile(string);
83         #ifdef USE_MPI
84         int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&);
85         #endif
86 };
87
88 /**************************************************************************************************/
89 //custom data structure for threads to use.
90 // This is passed by void pointer so it can be any data type
91 // that can be passed using a single void pointer (LPVOID).
92 struct classifyData {
93         string taxFName; 
94         string tempTFName; 
95         string filename;
96         string search, taxonomyFileName, templateFileName, method;
97         unsigned long long start;
98         unsigned long long end;
99         MothurOut* m;
100         float match, misMatch, gapOpen, gapExtend;
101         int count, kmerSize, threadID, cutoff, iters, numWanted;
102         bool probs;
103          
104         classifyData(){}
105         classifyData(bool p, string me, string te, string tx, string a, string r, string f, string se, int ks, int i, int numW, MothurOut* mout, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int cut, int tid) {
106                 taxonomyFileName = tx;
107                 templateFileName = te;
108                 taxFName = a;
109                 tempTFName = r;
110                 filename = f;
111                 search = se;
112                 method = me;
113                 m = mout;
114                 start = st;
115                 end = en;
116                 match = ma; 
117                 misMatch = misMa;
118                 gapOpen = gapO; 
119                 gapExtend = gapE; 
120                 kmerSize = ks;
121                 cutoff = cut;
122                 iters = i;
123                 numWanted = numW;
124                 threadID = tid;
125                 probs = p;
126                 count = 0;
127         }
128 };
129
130 /**************************************************************************************************/
131 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
132 #else
133 static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){ 
134         classifyData* pDataArray;
135         pDataArray = (classifyData*)lpParam;
136         
137         try {
138                 ofstream outTax;
139                 pDataArray->m->openOutputFile(pDataArray->taxFName, outTax);
140                 
141                 ofstream outTaxSimple;
142                 pDataArray->m->openOutputFile(pDataArray->tempTFName, outTaxSimple);
143                 
144                 ifstream inFASTA;
145                 pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
146                 
147                 string taxonomy;
148                                 
149                 //print header if you are process 0
150                 if ((pDataArray->start == 0) || (pDataArray->start == 1)) {
151                         inFASTA.seekg(0);
152                 }else { //this accounts for the difference in line endings. 
153                         inFASTA.seekg(pDataArray->start-1); pDataArray->m->gobble(inFASTA); 
154                 }
155                 
156                 pDataArray->count = pDataArray->end;
157                 
158                 //make classify
159                 Classify* myclassify;
160                 if(pDataArray->method == "bayesian"){   myclassify = new Bayesian("saved", "saved", pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID);             }
161                 else if(pDataArray->method == "knn"){   myclassify = new Knn("saved", "saved", pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID);                             }
162                 else {
163                         pDataArray->m->mothurOut(pDataArray->search + " is not a valid method option. I will run the command using bayesian.");
164                         pDataArray->m->mothurOutEndLine();
165                         myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID);   
166                 }
167                 
168                 if (pDataArray->m->control_pressed) { delete myclassify; return 0; }
169                 
170                 int count = 0;
171                 for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
172                         
173                         if (pDataArray->m->control_pressed) { delete myclassify; return 0; }
174                         
175                         Sequence* candidateSeq = new Sequence(inFASTA); pDataArray->m->gobble(inFASTA);
176                         
177                         if (candidateSeq->getName() != "") {
178                                 
179                                 taxonomy = myclassify->getTaxonomy(candidateSeq);
180                                 
181                                 if (pDataArray->m->control_pressed) { delete candidateSeq; return 0; }
182                                 
183                                 if (taxonomy != "bad seq") {
184                                         //output confidence scores or not
185                                         if (pDataArray->probs) {
186                                                 outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
187                                         }else{
188                                                 outTax << candidateSeq->getName() << '\t' << myclassify->getSimpleTax() << endl;
189                                         }
190                                         
191                                         outTaxSimple << candidateSeq->getName() << '\t' << myclassify->getSimpleTax() << endl;
192                                 }
193                                 count++;
194                         }
195                         delete candidateSeq;
196                         //report progress
197                         if((count) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(count)); pDataArray->m->mothurOutEndLine();         }
198                         
199                 }
200                 //report progress
201                 if((count) % 100 != 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(count)); pDataArray->m->mothurOutEndLine();         }
202                 
203                 delete myclassify;
204                 inFASTA.close();
205                 outTax.close();
206                 outTaxSimple.close();
207                 
208         }
209         catch(exception& e) {
210                 pDataArray->m->errorOut(e, "ClassifySeqsCommand", "MyClassThreadFunction");
211                 exit(1);
212         }
213
214 #endif
215
216
217
218
219 #endif
220