]> git.donarmstrong.com Git - mothur.git/blob - preclustercommand.h
added processors to pre.cluster for mac and windows. Fixed bug in reNameFile function...
[mothur.git] / preclustercommand.h
1 #ifndef PRECLUSTERCOMMAND_H
2 #define PRECLUSTERCOMMAND_H
3
4
5 /*
6  *  preclustercommand.h
7  *  Mothur
8  *
9  *  Created by westcott on 12/21/09.
10  *  Copyright 2009 Schloss Lab. All rights reserved.
11  *
12  */
13
14
15 #include "command.hpp"
16 #include "sequence.hpp"
17 #include "sequenceparser.h"
18
19 /************************************************************/
20 struct seqPNode {
21         int numIdentical;
22         Sequence seq;
23         string names;
24         bool active;
25         seqPNode() {}
26         seqPNode(int n, Sequence s, string nm) : numIdentical(n), seq(s), names(nm), active(1) {}
27         ~seqPNode() {}
28 };
29 /************************************************************/
30 inline bool comparePriority(seqPNode first, seqPNode second) {  return (first.numIdentical > second.numIdentical); }
31 //************************************************************/
32
33 class PreClusterCommand : public Command {
34         
35 public:
36         PreClusterCommand(string);
37         PreClusterCommand();
38         ~PreClusterCommand(){}
39         
40         vector<string> setParameters();
41         string getCommandName()                 { return "pre.cluster";                         }
42         string getCommandCategory()             { return "Sequence Processing";         }
43         string getHelpString(); 
44         string getCitation() { return "http://www.mothur.org/wiki/Pre.cluster"; }
45         string getDescription()         { return "implements a pseudo-single linkage algorithm with the goal of removing sequences that are likely due to pyrosequencing errors"; }
46
47         
48         int execute(); 
49         void help() { m->mothurOut(getHelpString()); }  
50         
51 private:
52         
53         struct linePair {
54                 int start;
55                 int end;
56                 linePair(int i, int j) : start(i), end(j) {}
57         };
58         
59         int diffs, length, processors;
60         bool abort, bygroup;
61         string fastafile, namefile, outputDir, groupfile;
62         vector<seqPNode> alignSeqs; //maps the number of identical seqs to a sequence
63         map<string, string> names; //represents the names file first column maps to second column
64         map<string, int> sizes;  //this map a seq name to the number of identical seqs in the names file
65         map<string, int>::iterator itSize; 
66 //      map<string, bool> active; //maps sequence name to whether it has already been merged or not.
67         vector<string> outputNames;
68         map<string, vector<string> > outputTypes;
69         
70         int readFASTA();
71         void readNameFile();
72         //int readNamesFASTA();
73         int calcMisMatches(string, string);
74         void printData(string, string); //fasta filename, names file name
75         int process();
76         int loadSeqs(map<string, string>&, vector<Sequence>&);
77         int driverGroups(SequenceParser*, string, string, int, int, vector<string> groups);
78         int createProcessesGroups(SequenceParser*, string, string, vector<string>);
79 };
80
81 /**************************************************************************************************/
82 //custom data structure for threads to use.
83 // This is passed by void pointer so it can be any data type
84 // that can be passed using a single void pointer (LPVOID).
85 typedef struct preClusterData {
86         string fastafile; 
87         string namefile; 
88         string groupfile;
89         string newFName, newNName;
90         MothurOut* m;
91         int start;
92         int end;
93         int diffs, threadID;
94         vector<string> groups;
95         
96         preClusterData(){}
97         preClusterData(string f, string n, string g, string nff,  string nnf, vector<string> gr, MothurOut* mout, int st, int en, int d, int tid) {
98                 fastafile = f;
99                 namefile = n;
100                 groupfile = g;
101                 newFName = nff;
102                 newNName = nnf;
103                 m = mout;
104                 start = st;
105                 end = en;
106                 diffs = d;
107                 threadID = tid;
108                 groups = gr;
109         }
110 };
111
112 /**************************************************************************************************/
113 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
114 #else
115 static DWORD WINAPI MyPreclusterThreadFunction(LPVOID lpParam){ 
116         preClusterData* pDataArray;
117         pDataArray = (preClusterData*)lpParam;
118         
119         try {
120                 
121                 //parse fasta and name file by group
122                 SequenceParser* parser;
123                 if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile);      }
124                 else                                                    { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile);                                            }
125                 
126                 int numSeqs = 0;
127                 vector<seqPNode> alignSeqs;
128                 //clear out old files
129                 ofstream outF; pDataArray->m->openOutputFile(pDataArray->newFName, outF); outF.close();
130                 ofstream outN; pDataArray->m->openOutputFile(pDataArray->newNName, outN);  outN.close();
131                 
132                 //precluster each group
133                 for (int k = pDataArray->start; k < pDataArray->end; k++) {
134                         
135                         int start = time(NULL);
136                         
137                         if (pDataArray->m->control_pressed) {  delete parser; return 0; }
138                         
139                         pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Processing group " + pDataArray->groups[k] + ":"); pDataArray->m->mothurOutEndLine();
140                         
141                         map<string, string> thisNameMap;
142                         if (pDataArray->namefile != "") { thisNameMap = parser->getNameMap(pDataArray->groups[k]); }
143                         vector<Sequence> thisSeqs = parser->getSeqs(pDataArray->groups[k]);
144                         
145                         //fill alignSeqs with this groups info.
146                         ////////////////////////////////////////////////////
147                         //numSeqs = loadSeqs(thisNameMap, thisSeqs); same function below
148                         
149                         int length = 0;
150                         alignSeqs.clear();
151                         map<string, string>::iterator it;
152                         bool error = false;
153                         
154                         for (int i = 0; i < thisSeqs.size(); i++) {
155                                 
156                                 if (pDataArray->m->control_pressed) { delete parser; return 0; }
157                                 
158                                 if (pDataArray->namefile != "") {
159                                         it = thisNameMap.find(thisSeqs[i].getName());
160                                         
161                                         //should never be true since parser checks for this
162                                         if (it == thisNameMap.end()) { pDataArray->m->mothurOut(thisSeqs[i].getName() + " is not in your names file, please correct."); pDataArray->m->mothurOutEndLine(); error = true; }
163                                         else{
164                                                 //get number of reps
165                                                 int numReps = 1;
166                                                 for(int j=0;j<(it->second).length();j++){
167                                                         if((it->second)[j] == ','){     numReps++;      }
168                                                 }
169                                                 
170                                                 seqPNode tempNode(numReps, thisSeqs[i], it->second);
171                                                 alignSeqs.push_back(tempNode);
172                                                 if (thisSeqs[i].getAligned().length() > length) {  length = thisSeqs[i].getAligned().length();  }
173                                         }       
174                                 }else { //no names file, you are identical to yourself 
175                                         seqPNode tempNode(1, thisSeqs[i], thisSeqs[i].getName());
176                                         alignSeqs.push_back(tempNode);
177                                         if (thisSeqs[i].getAligned().length() > length) {  length = thisSeqs[i].getAligned().length();  }
178                                 }
179                         }
180                         
181                         //sanity check
182                         if (error) { pDataArray->m->control_pressed = true; }
183                         
184                         thisSeqs.clear();
185                         numSeqs = alignSeqs.size();
186                         
187                         ////////////////////////////////////////////////////
188                         
189                         if (pDataArray->m->control_pressed) {   delete parser; return 0; }
190                         
191                         if (pDataArray->diffs > length) { pDataArray->m->mothurOut("Error: diffs is greater than your sequence length."); pDataArray->m->mothurOutEndLine(); pDataArray->m->control_pressed = true; return 0;  }
192                         
193                         ////////////////////////////////////////////////////
194                         //int count = process(); - same function below
195                         
196                         //sort seqs by number of identical seqs
197                         sort(alignSeqs.begin(), alignSeqs.end(), comparePriority);
198                         
199                         int count = 0;
200                         
201                         //think about running through twice...
202                         for (int i = 0; i < numSeqs; i++) {
203                                 
204                                 //are you active
205                                 //                      itActive = active.find(alignSeqs[i].seq.getName());
206                                 
207                                 if (alignSeqs[i].active) {  //this sequence has not been merged yet
208                                         
209                                         //try to merge it with all smaller seqs
210                                         for (int j = i+1; j < numSeqs; j++) {
211                                                 
212                                                 if (pDataArray->m->control_pressed) { delete parser; return 0; }
213                                                 
214                                                 if (alignSeqs[j].active) {  //this sequence has not been merged yet
215                                                         //are you within "diff" bases
216                                                         //int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned());
217                                                         int mismatch = 0;
218                                                         
219                                                         for (int k = 0; k < alignSeqs[i].seq.getAligned().length(); k++) {
220                                                                 //do they match
221                                                                 if (alignSeqs[i].seq.getAligned()[k] != alignSeqs[j].seq.getAligned()[k]) { mismatch++; }
222                                                                 if (mismatch > pDataArray->diffs) { mismatch = length; break; } //to far to cluster
223                                                         }
224                                                         
225                                                         if (mismatch <= pDataArray->diffs) {
226                                                                 //merge
227                                                                 alignSeqs[i].names += ',' + alignSeqs[j].names;
228                                                                 alignSeqs[i].numIdentical += alignSeqs[j].numIdentical;
229                                                                 
230                                                                 alignSeqs[j].active = 0;
231                                                                 alignSeqs[j].numIdentical = 0;
232                                                                 count++;
233                                                         }
234                                                 }//end if j active
235                                         }//end if i != j
236                                         
237                                         //remove from active list 
238                                         alignSeqs[i].active = 0;
239                                         
240                                 }//end if active i
241                                 if(i % 100 == 0)        { pDataArray->m->mothurOut(toString(i) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); pDataArray->m->mothurOutEndLine(); }
242                         }
243                         
244                         if(numSeqs % 100 != 0)  { pDataArray->m->mothurOut(toString(numSeqs) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); pDataArray->m->mothurOutEndLine();   }       
245                         ////////////////////////////////////////////////////
246                         
247                         if (pDataArray->m->control_pressed) {  delete parser; return 0; }
248                         
249                         pDataArray->m->mothurOut("Total number of sequences before pre.cluster was " + toString(alignSeqs.size()) + ".");pDataArray-> m->mothurOutEndLine();
250                         pDataArray->m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOutEndLine(); 
251                         
252                         ////////////////////////////////////////////////////
253                         //printData(pDataArray->newFFile, pDataArray->newNFile); - same as below
254                         ofstream outFasta;
255                         ofstream outNames;
256                         
257                         pDataArray->m->openOutputFileAppend(pDataArray->newFName, outFasta);
258                         pDataArray->m->openOutputFileAppend(pDataArray->newNName, outNames);
259                                                 
260                         for (int i = 0; i < alignSeqs.size(); i++) {
261                                 if (alignSeqs[i].numIdentical != 0) {
262                                         alignSeqs[i].seq.printSequence(outFasta); 
263                                         outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl;
264                                 }
265                         }
266                         
267                         outFasta.close();
268                         outNames.close();
269                         ////////////////////////////////////////////////////
270                         
271                         pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); pDataArray->m->mothurOutEndLine(); 
272                         
273                 }
274                 
275                 return numSeqs;
276                 
277
278         }
279         catch(exception& e) {
280                 pDataArray->m->errorOut(e, "AlignCommand", "MyPreclusterThreadFunction");
281                 exit(1);
282         }
283
284 #endif
285
286 /**************************************************************************************************/
287
288
289 #endif
290
291