]> git.donarmstrong.com Git - mothur.git/blob - preclustercommand.h
fixed bug in phylo.diversity rooting. added filename patterns and create filename...
[mothur.git] / preclustercommand.h
1 #ifndef PRECLUSTERCOMMAND_H
2 #define PRECLUSTERCOMMAND_H
3
4
5 /*
6  *  preclustercommand.h
7  *  Mothur
8  *
9  *  Created by westcott on 12/21/09.
10  *  Copyright 2009 Schloss Lab. All rights reserved.
11  *
12  */
13
14
15 #include "command.hpp"
16 #include "sequence.hpp"
17 #include "sequenceparser.h"
18 #include "sequencecountparser.h"
19
20 /************************************************************/
21 struct seqPNode {
22         int numIdentical;
23         Sequence seq;
24         string names;
25         bool active;
26         int diffs;
27         seqPNode() {}
28         seqPNode(int n, Sequence s, string nm) : numIdentical(n), seq(s), names(nm), active(1) { diffs = 0; }
29         ~seqPNode() {}
30 };
31 /************************************************************/
32 inline bool comparePriority(seqPNode first, seqPNode second) {  
33     if (first.numIdentical > second.numIdentical) { return true;  }
34     else if (first.numIdentical == second.numIdentical) { 
35         if (first.seq.getName() > second.seq.getName()) { return true; }
36     }
37     return false; 
38 }
39 //************************************************************/
40
41 class PreClusterCommand : public Command {
42         
43 public:
44         PreClusterCommand(string);
45         PreClusterCommand();
46         ~PreClusterCommand(){}
47         
48         vector<string> setParameters();
49         string getCommandName()                 { return "pre.cluster";                         }
50         string getCommandCategory()             { return "Sequence Processing";         }
51         
52         string getHelpString(); 
53     string getOutputPattern(string);    
54         string getCitation() { return "Schloss PD, Gevers D, Westcott SL (2011).  Reducing the effects of PCR amplification and sequencing artifacts on 16S rRNA-based studies.  PLoS ONE.  6:e27310.\nhttp://www.mothur.org/wiki/Pre.cluster"; }
55         string getDescription()         { return "implements a pseudo-single linkage algorithm with the goal of removing sequences that are likely due to pyrosequencing errors"; }
56
57         
58         int execute(); 
59         void help() { m->mothurOut(getHelpString()); }  
60         
61 private:
62         
63         struct linePair {
64                 int start;
65                 int end;
66                 linePair(int i, int j) : start(i), end(j) {}
67         };
68         
69     SequenceParser* parser;
70     SequenceCountParser* cparser;
71     CountTable ct;
72     
73         int diffs, length, processors;
74         bool abort, bygroup;
75         string fastafile, namefile, outputDir, groupfile, countfile;
76         vector<seqPNode> alignSeqs; //maps the number of identical seqs to a sequence
77         map<string, string> names; //represents the names file first column maps to second column
78         map<string, int> sizes;  //this map a seq name to the number of identical seqs in the names file
79         map<string, int>::iterator itSize; 
80 //      map<string, bool> active; //maps sequence name to whether it has already been merged or not.
81         vector<string> outputNames;
82         map<string, vector<string> > outputTypes;
83         
84         int readFASTA();
85         void readNameFile();
86         //int readNamesFASTA();
87         int calcMisMatches(string, string);
88         void printData(string, string, string); //fasta filename, names file name
89         int process(string);
90         int loadSeqs(map<string, string>&, vector<Sequence>&, string);
91         int driverGroups(string, string, string, int, int, vector<string> groups);
92         int createProcessesGroups(string, string, string, vector<string>);
93     int mergeGroupCounts(string, string, string);
94 };
95
96 /**************************************************************************************************/
97 //custom data structure for threads to use.
98 // This is passed by void pointer so it can be any data type
99 // that can be passed using a single void pointer (LPVOID).
100 struct preClusterData {
101         string fastafile; 
102         string namefile; 
103         string groupfile, countfile;
104         string newFName, newNName, newMName;
105         MothurOut* m;
106         int start;
107         int end;
108         int diffs, threadID;
109         vector<string> groups;
110         vector<string> mapFileNames;
111         
112         preClusterData(){}
113         preClusterData(string f, string n, string g, string c, string nff,  string nnf, string nmf, vector<string> gr, MothurOut* mout, int st, int en, int d, int tid) {
114                 fastafile = f;
115                 namefile = n;
116                 groupfile = g;
117                 newFName = nff;
118                 newNName = nnf;
119                 newMName = nmf;
120                 m = mout;
121                 start = st;
122                 end = en;
123                 diffs = d;
124                 threadID = tid;
125                 groups = gr;
126         countfile = c;
127         }
128 };
129
130 /**************************************************************************************************/
131 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
132 #else
133 static DWORD WINAPI MyPreclusterThreadFunction(LPVOID lpParam){ 
134         preClusterData* pDataArray;
135         pDataArray = (preClusterData*)lpParam;
136         
137         try {
138                 
139                 //parse fasta and name file by group
140                 SequenceParser* parser;
141         SequenceCountParser* cparser;
142         if (pDataArray->countfile != "") {
143             cparser = new SequenceCountParser(pDataArray->countfile, pDataArray->fastafile);
144         }else {
145             if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile);  }
146             else                                { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile);                    }
147         }
148         
149                 int numSeqs = 0;
150                 vector<seqPNode> alignSeqs;
151                 //clear out old files
152                 ofstream outF; pDataArray->m->openOutputFile(pDataArray->newFName, outF); outF.close();
153                 ofstream outN; pDataArray->m->openOutputFile(pDataArray->newNName, outN);  outN.close();
154                 
155                 //precluster each group
156                 for (int k = pDataArray->start; k < pDataArray->end; k++) {
157                         
158                         int start = time(NULL);
159                         
160                         if (pDataArray->m->control_pressed) {  delete parser; return 0; }
161                         
162                         pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Processing group " + pDataArray->groups[k] + ":"); pDataArray->m->mothurOutEndLine();
163                         
164                         map<string, string> thisNameMap;
165             vector<Sequence> thisSeqs;
166                         if (pDataArray->groupfile != "") { 
167                 thisSeqs = parser->getSeqs(pDataArray->groups[k]);
168             }else if (pDataArray->countfile != "") {
169                 thisSeqs = cparser->getSeqs(pDataArray->groups[k]);
170             }
171                         if (pDataArray->namefile != "") {  thisNameMap = parser->getNameMap(pDataArray->groups[k]); }
172                         
173                         //fill alignSeqs with this groups info.
174                         ////////////////////////////////////////////////////
175                         //numSeqs = loadSeqs(thisNameMap, thisSeqs); same function below
176                         
177                         int length = 0;
178                         alignSeqs.clear();
179                         map<string, string>::iterator it;
180                         bool error = false;
181             map<string, int> thisCount;
182             if (pDataArray->countfile != "") { thisCount = cparser->getCountTable(pDataArray->groups[k]);  }
183
184                         
185                         for (int i = 0; i < thisSeqs.size(); i++) {
186                                 
187                                 if (pDataArray->m->control_pressed) { delete parser; return 0; }
188                                 
189                                 if (pDataArray->namefile != "") {
190                                         it = thisNameMap.find(thisSeqs[i].getName());
191                                         
192                                         //should never be true since parser checks for this
193                                         if (it == thisNameMap.end()) { pDataArray->m->mothurOut(thisSeqs[i].getName() + " is not in your names file, please correct."); pDataArray->m->mothurOutEndLine(); error = true; }
194                                         else{
195                                                 //get number of reps
196                                                 int numReps = 1;
197                                                 for(int j=0;j<(it->second).length();j++){
198                                                         if((it->second)[j] == ','){     numReps++;      }
199                                                 }
200                                                 
201                                                 seqPNode tempNode(numReps, thisSeqs[i], it->second);
202                                                 alignSeqs.push_back(tempNode);
203                                                 if (thisSeqs[i].getAligned().length() > length) {  length = thisSeqs[i].getAligned().length();  }
204                                         }       
205                                 }else { //no names file, you are identical to yourself 
206                                         int numRep = 1;
207                     if (pDataArray->countfile != "") { 
208                         map<string, int>::iterator it2 = thisCount.find(thisSeqs[i].getName());
209                         
210                         //should never be true since parser checks for this
211                         if (it2 == thisCount.end()) { pDataArray->m->mothurOut(thisSeqs[i].getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); error = true; }
212                         else { numRep = it2->second;  }
213                     }
214                     seqPNode tempNode(numRep, thisSeqs[i], thisSeqs[i].getName());
215                     alignSeqs.push_back(tempNode);
216                                         if (thisSeqs[i].getAligned().length() > length) {  length = thisSeqs[i].getAligned().length();  }
217                                 }
218                         }
219                         
220                         //sanity check
221                         if (error) { pDataArray->m->control_pressed = true; }
222                         
223                         thisSeqs.clear();
224                         numSeqs = alignSeqs.size();
225                         
226                         ////////////////////////////////////////////////////
227                         
228                         if (pDataArray->m->control_pressed) {   delete parser; return 0; }
229                         
230                         if (pDataArray->diffs > length) { pDataArray->m->mothurOut("Error: diffs is greater than your sequence length."); pDataArray->m->mothurOutEndLine(); pDataArray->m->control_pressed = true; return 0;  }
231                         
232                         ////////////////////////////////////////////////////
233                         //int count = process(); - same function below
234                         
235                         ofstream out;
236                         pDataArray->m->openOutputFile(pDataArray->newMName+pDataArray->groups[k]+".map", out);
237                         pDataArray->mapFileNames.push_back(pDataArray->newMName+pDataArray->groups[k]+".map");
238                         
239                         //sort seqs by number of identical seqs
240                         sort(alignSeqs.begin(), alignSeqs.end(), comparePriority);
241                         
242                         int count = 0;
243                         
244                         //think about running through twice...
245                         for (int i = 0; i < numSeqs; i++) {
246                                 
247                                 //are you active
248                                 //                      itActive = active.find(alignSeqs[i].seq.getName());
249                                 
250                                 if (alignSeqs[i].active) {  //this sequence has not been merged yet
251                                         
252                                         string chunk = alignSeqs[i].seq.getName() + "\t" + toString(alignSeqs[i].numIdentical) + "\t" + toString(0) + "\t" + alignSeqs[i].seq.getAligned() + "\n";
253
254                                         //try to merge it with all smaller seqs
255                                         for (int j = i+1; j < numSeqs; j++) {
256                                                 
257                                                 if (pDataArray->m->control_pressed) { delete parser; return 0; }
258                                                 
259                                                 if (alignSeqs[j].active) {  //this sequence has not been merged yet
260                                                         //are you within "diff" bases
261                                                         //int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned());
262                                                         int mismatch = 0;
263                                                         
264                                                         for (int k = 0; k < alignSeqs[i].seq.getAligned().length(); k++) {
265                                                                 //do they match
266                                                                 if (alignSeqs[i].seq.getAligned()[k] != alignSeqs[j].seq.getAligned()[k]) { mismatch++; }
267                                                                 if (mismatch > pDataArray->diffs) { mismatch = length; break; } //to far to cluster
268                                                         }
269                                                         
270                                                         if (mismatch <= pDataArray->diffs) {
271                                                                 //merge
272                                                                 alignSeqs[i].names += ',' + alignSeqs[j].names;
273                                                                 alignSeqs[i].numIdentical += alignSeqs[j].numIdentical;
274                                                                 
275                                                                 alignSeqs[j].active = 0;
276                                                                 alignSeqs[j].numIdentical = 0;
277                                                                 alignSeqs[j].diffs = mismatch;
278                                                                 count++;
279                                                                 chunk += alignSeqs[j].seq.getName() + "\t" + toString(alignSeqs[j].numIdentical) + "\t" + toString(mismatch) + "\t" + alignSeqs[j].seq.getAligned() + "\n";
280                                                         }
281                                                 }//end if j active
282                                         }//end for loop j
283                                         
284                                         //remove from active list 
285                                         alignSeqs[i].active = 0;
286                                         
287                                         out << "ideal_seq_" << (i+1) << '\t' << alignSeqs[i].numIdentical << endl << chunk << endl;
288                                         
289                                 }//end if active i
290                                 if(i % 100 == 0)        { pDataArray->m->mothurOut(toString(i) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); pDataArray->m->mothurOutEndLine(); }
291                         }
292                         out.close();
293                         if(numSeqs % 100 != 0)  { pDataArray->m->mothurOut(toString(numSeqs) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); pDataArray->m->mothurOutEndLine();   }       
294                         ////////////////////////////////////////////////////
295                         
296                         if (pDataArray->m->control_pressed) {  delete parser; return 0; }
297                         
298                         pDataArray->m->mothurOut("Total number of sequences before pre.cluster was " + toString(alignSeqs.size()) + ".");pDataArray-> m->mothurOutEndLine();
299                         pDataArray->m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOutEndLine(); 
300                         
301                         ////////////////////////////////////////////////////
302                         //printData(pDataArray->newFFile, pDataArray->newNFile); - same as below
303                         ofstream outFasta;
304                         ofstream outNames;
305                         
306                         pDataArray->m->openOutputFileAppend(pDataArray->newFName, outFasta);
307                         pDataArray->m->openOutputFileAppend(pDataArray->newNName, outNames);
308                                                 
309                         for (int i = 0; i < alignSeqs.size(); i++) {
310                                 if (alignSeqs[i].numIdentical != 0) {
311                                         alignSeqs[i].seq.printSequence(outFasta); 
312                                         if (pDataArray->countfile != "") {  outNames << pDataArray->groups[k] << '\t' << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; 
313                     }else {  outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl;  }
314
315                                 }
316                         }
317                         
318                         outFasta.close();
319                         outNames.close();
320                         ////////////////////////////////////////////////////
321                         
322                         pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); pDataArray->m->mothurOutEndLine(); 
323                         
324                 }
325                 
326                 return numSeqs;
327                 
328
329         }
330         catch(exception& e) {
331                 pDataArray->m->errorOut(e, "PreClusterCommand", "MyPreclusterThreadFunction");
332                 exit(1);
333         }
334
335 #endif
336
337 /**************************************************************************************************/
338
339
340 #endif
341
342