]> git.donarmstrong.com Git - mothur.git/blob - shhhseqscommand.h
fixed bug with trim.flows that was adding flow files names to the .flow.files file...
[mothur.git] / shhhseqscommand.h
1 #ifndef SHHHSEQSCOMMAND_H
2 #define SHHHSEQSCOMMAND_H
3
4 /*
5  *  shhhseqscommand.h
6  *  Mothur
7  *
8  *  Created by westcott on 11/8/11.
9  *  Copyright 2011 Schloss Lab. All rights reserved.
10  *
11  */
12
13 #include "command.hpp"
14 #include "myseqdist.h"
15 #include "seqnoise.h"
16 #include "sequenceparser.h"
17 #include "deconvolutecommand.h"
18 #include "clustercommand.h"
19
20 //**********************************************************************************************************************
21
22 class ShhhSeqsCommand : public Command {
23         
24 public:
25         ShhhSeqsCommand(string);
26         ShhhSeqsCommand();
27         ~ShhhSeqsCommand() {}
28         
29         vector<string> setParameters();
30         string getCommandName()                 { return "shhh.seqs";   }
31         string getCommandCategory()             { return "Sequence Processing";         }
32         string getHelpString(); 
33         string getCitation() { return "http://www.mothur.org/wiki/Shhh.seqs"; }
34         string getDescription()         { return "shhh.seqs"; }
35         
36         
37         int execute(); 
38         void help() { m->mothurOut(getHelpString()); }          
39         
40 private:
41         
42         struct linePair {
43                 int start;
44                 int end;
45                 linePair(int i, int j) : start(i), end(j) {}
46         };
47         
48         bool abort;
49         string outputDir, fastafile, namefile, groupfile;
50         int processors;
51         double sigma;
52         vector<string> outputNames;
53         
54         int readData(correctDist*, seqNoise&, vector<string>&, vector<string>&, vector<string>&, vector<int>&);
55         int loadData(correctDist*, seqNoise&, vector<string>&, vector<string>&, vector<string>&, vector<int>&, map<string, string>&, vector<Sequence>&);
56
57         int driver(seqNoise&, vector<string>&, vector<string>&, vector<string>&, vector<int>&, string, string, string, string);
58         int driverGroups(SequenceParser&, string, string, string, int, int, vector<string>);
59         int createProcessesGroups(SequenceParser&, string, string, string, vector<string>);
60         int deconvoluteResults(string, string);
61
62
63
64 };
65
66 /**************************************************************************************************/
67 //custom data structure for threads to use.
68 // This is passed by void pointer so it can be any data type
69 // that can be passed using a single void pointer (LPVOID).
70 struct shhhseqsData {
71         string fastafile; 
72         string namefile; 
73         string groupfile;
74         string newFName, newNName, newMName;
75         MothurOut* m;
76         int start;
77         int end;
78         int sigma, threadID;
79         vector<string> groups;
80         
81         shhhseqsData(){}
82         shhhseqsData(string f, string n, string g, string nff,  string nnf, string nmf, vector<string> gr, MothurOut* mout, int st, int en, int s, int tid) {
83                 fastafile = f;
84                 namefile = n;
85                 groupfile = g;
86                 newFName = nff;
87                 newNName = nnf;
88                 newNName = nmf;
89                 m = mout;
90                 start = st;
91                 end = en;
92                 sigma = s;
93                 threadID = tid;
94                 groups = gr;
95         }
96 };
97
98 /**************************************************************************************************/
99 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
100 #else
101 static DWORD WINAPI MyShhhSeqsThreadFunction(LPVOID lpParam){ 
102         shhhseqsData* pDataArray;
103         pDataArray = (shhhseqsData*)lpParam;
104         
105         try {
106                 
107                 //parse fasta and name file by group
108                 SequenceParser parser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile);
109                                                 
110                 //precluster each group
111                 for (int k = pDataArray->start; k < pDataArray->end; k++) {
112                         
113                         int start = time(NULL);
114                         
115                         if (pDataArray->m->control_pressed) {  return 0; }
116                         
117                         pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Processing group " + pDataArray->groups[k] + ":"); pDataArray->m->mothurOutEndLine();
118                         
119                         map<string, string> thisNameMap;
120                         thisNameMap = parser.getNameMap(pDataArray->groups[k]); 
121                         vector<Sequence> thisSeqs = parser.getSeqs(pDataArray->groups[k]);
122                         
123                         if (pDataArray->m->control_pressed) {  return 0; }
124                         
125                         vector<string> sequences;
126                         vector<string> uniqueNames;
127                         vector<string> redundantNames;
128                         vector<int> seqFreq;
129                         
130                         seqNoise noise;
131                         correctDist* correct = new correctDist(1); //we use one processor since we already split up the work load.
132                         
133                         //load this groups info in order
134                         //loadData(correct, noise, sequences, uniqueNames, redundantNames, seqFreq, thisNameMap, thisSeqs);
135                         //////////////////////////////////////////////////////////////////////////////////////////////////
136                         bool error = false;
137                         map<string, string>::iterator it;
138                         
139                         for (int i = 0; i < thisSeqs.size(); i++) {
140                                 
141                                 if (pDataArray->m->control_pressed) { return 0; }
142                                 
143                                 if (thisSeqs[i].getName() != "") {
144                                         correct->addSeq(thisSeqs[i].getName(), thisSeqs[i].getAligned());
145                                         
146                                         it = thisNameMap.find(thisSeqs[i].getName());
147                                         if (it != thisNameMap.end()) {
148                                                 noise.addSeq(thisSeqs[i].getAligned(), sequences);
149                                                 noise.addRedundantName(it->first, it->second, uniqueNames, redundantNames, seqFreq);
150                                         }else {
151                                                 pDataArray->m->mothurOut("[ERROR]: " + thisSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct.");
152                                                 error = true;
153                                         }
154                                 }
155                         }
156                         
157                         if (error) { return 0; }
158                         //////////////////////////////////////////////////////////////////////////////////////////////////
159                         
160                         if (pDataArray->m->control_pressed) { return 0; }
161                         
162                         //calc distances for cluster
163                         string distFileName = pDataArray->m->getRootName(pDataArray->m->getSimpleName(pDataArray->fastafile)) + pDataArray->groups[k] + ".shhh.dist";
164                         correct->execute(distFileName);
165                         delete correct;
166                         
167                         if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(distFileName); return 0; }
168                         
169                         //driver(noise, sequences, uniqueNames, redundantNames, seqFreq, distFileName, newFFile+groups[i], newNFile+groups[i], newMFile+groups[i]+".map"); 
170                         ///////////////////////////////////////////////////////////////////////////////////////////////////
171                         double cutOff = 0.08;
172                         int minIter = 10;
173                         int maxIter = 1000;
174                         double minDelta = 1e-6;
175                         int numIters = 0;
176                         double maxDelta = 1e6;
177                         int numSeqs = sequences.size();
178                         
179                         //run cluster command
180                         string inputString = "phylip=" + distFileName + ", method=furthest, cutoff=0.08";
181                         pDataArray->m->mothurOut("/******************************************/"); pDataArray->m->mothurOutEndLine(); 
182                         pDataArray->m->mothurOut("Running command: cluster(" + inputString + ")"); pDataArray->m->mothurOutEndLine(); 
183                         
184                         Command* clusterCommand = new ClusterCommand(inputString);
185                         clusterCommand->execute();
186                         
187                         map<string, vector<string> > filenames = clusterCommand->getOutputFiles();
188                         string listFileName = filenames["list"][0];
189                         string rabundFileName = filenames["rabund"][0]; pDataArray->m->mothurRemove(rabundFileName);
190                         string sabundFileName = filenames["sabund"][0]; pDataArray->m->mothurRemove(sabundFileName);
191                         
192                         delete clusterCommand;
193                         pDataArray->m->mothurOut("/******************************************/"); pDataArray->m->mothurOutEndLine(); 
194                         
195                         if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(distFileName); pDataArray->m->mothurRemove(listFileName); return 0; }
196                         
197                         vector<double> distances(numSeqs * numSeqs);
198                         noise.getDistanceData(distFileName, distances);
199                         pDataArray->m->mothurRemove(distFileName); 
200                         if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(listFileName); return 0; }
201                         
202                         vector<int> otuData(numSeqs);
203                         vector<int> otuFreq;
204                         vector<vector<int> > otuBySeqLookUp;
205                         noise.getListData(listFileName, cutOff, otuData, otuFreq, otuBySeqLookUp);
206                         pDataArray->m->mothurRemove(listFileName);
207                         if (pDataArray->m->control_pressed) { return 0; }
208                         
209                         int numOTUs = otuFreq.size();
210                         
211                         vector<double> weights(numOTUs, 0);
212                         vector<int> change(numOTUs, 1);
213                         vector<int> centroids(numOTUs, -1);
214                         vector<int> cumCount(numOTUs, 0);
215                         
216                         vector<double> tau(numSeqs, 1);
217                         vector<int> anP(numSeqs, 0);
218                         vector<int> anI(numSeqs, 0);
219                         vector<int> anN(numSeqs, 0);
220                         vector<vector<int> > aanI = otuBySeqLookUp;
221                         
222                         while(numIters < minIter || ((maxDelta > minDelta) && (numIters < maxIter))){
223                                 
224                                 if (pDataArray->m->control_pressed) { return 0; }
225                                 
226                                 noise.updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount); if (pDataArray->m->control_pressed) { return 0; }
227                                 maxDelta = noise.calcNewWeights(weights, seqFreq, anI, cumCount, anP, otuFreq, tau);  if (pDataArray->m->control_pressed) { return 0; }
228                                 
229                                 noise.calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau); if (pDataArray->m->control_pressed) { return 0; }
230                                 noise.checkCentroids(weights, centroids); if (pDataArray->m->control_pressed) { return 0; }
231                                 
232                                 otuFreq.assign(numOTUs, 0);
233                                 
234                                 int total = 0;
235                                 
236                                 for(int i=0;i<numSeqs;i++){
237                                         if (pDataArray->m->control_pressed) { return 0; }
238                                         
239                                         double offset = 1e6;
240                                         double norm = 0.0000;
241                                         double minWeight = 0.1;
242                                         vector<double> currentTau(numOTUs);
243                                         
244                                         for(int j=0;j<numOTUs;j++){
245                                                 if (pDataArray->m->control_pressed) { return 0; }
246                                                 if(weights[j] > minWeight && distances[i * numSeqs+centroids[j]] < offset){
247                                                         offset = distances[i * numSeqs+centroids[j]];
248                                                 }
249                                         }
250                                         
251                                         for(int j=0;j<numOTUs;j++){
252                                                 if (pDataArray->m->control_pressed) { return 0; }
253                                                 if(weights[j] > minWeight){
254                                                         currentTau[j] = exp(pDataArray->sigma * (-distances[(i * numSeqs + centroids[j])] + offset)) * weights[j];
255                                                         norm += currentTau[j];
256                                                 }
257                                                 else{
258                                                         currentTau[j] = 0.0000;
259                                                 }
260                                         }                       
261                                         
262                                         for(int j=0;j<numOTUs;j++){
263                                                 if (pDataArray->m->control_pressed) { return 0; }
264                                                 currentTau[j] /= norm;
265                                         }
266                                         
267                                         for(int j=0;j<numOTUs;j++){
268                                                 if (pDataArray->m->control_pressed) { return 0; }
269                                                 
270                                                 if(currentTau[j] > 1.0e-4){
271                                                         int oldTotal = total;
272                                                         total++;
273                                                         
274                                                         tau.resize(oldTotal+1);
275                                                         tau[oldTotal] = currentTau[j];
276                                                         otuBySeqLookUp[j][otuFreq[j]] = oldTotal;
277                                                         aanI[j][otuFreq[j]] = i;
278                                                         otuFreq[j]++;
279                                                         
280                                                 }
281                                         }
282                                         
283                                         anP.resize(total);
284                                         anI.resize(total);
285                                 }
286                                 
287                                 numIters++;
288                         }
289                         
290                         noise.updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount);  if (pDataArray->m->control_pressed) { return 0; }
291                         
292                         vector<double> percentage(numSeqs);
293                         noise.setUpOTUData(otuData, percentage, cumCount, tau, otuFreq, anP, anI);  if (pDataArray->m->control_pressed) { return 0; }
294                         noise.finishOTUData(otuData, otuFreq, anP, anI, cumCount, otuBySeqLookUp, aanI, tau);  if (pDataArray->m->control_pressed) { return 0; }
295                         
296                         change.assign(numOTUs, 1);
297                         noise.calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau); if (pDataArray->m->control_pressed) { return 0; }
298                         
299                         
300                         vector<int> finalTau(numOTUs, 0);
301                         for(int i=0;i<numSeqs;i++){
302                                 if (pDataArray->m->control_pressed) { return 0; }
303                                 finalTau[otuData[i]] += int(seqFreq[i]);
304                         }
305                         
306                         noise.writeOutput(pDataArray->newFName+pDataArray->groups[k], pDataArray->newNName+pDataArray->groups[k], pDataArray->newMName+pDataArray->groups[k]+".map", finalTau, centroids, otuData, sequences, uniqueNames, redundantNames, seqFreq, distances);
307                         
308                         ///////////////////////////////////////////////////////////////////////////////////////////////////
309                         
310                         if (pDataArray->m->control_pressed) { return 0; }
311                         
312                         pDataArray->m->appendFiles(pDataArray->newFName+pDataArray->groups[k], pDataArray->newFName); pDataArray->m->mothurRemove(pDataArray->newFName+pDataArray->groups[k]);
313                         pDataArray->m->appendFiles(pDataArray->newNName+pDataArray->groups[k], pDataArray->newNName); pDataArray->m->mothurRemove(pDataArray->newNName+pDataArray->groups[k]);
314                         
315                         pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to process group " + pDataArray->groups[k] + "."); pDataArray->m->mothurOutEndLine(); 
316                         
317                                                         
318                 }
319                 
320                 return 0;
321                 
322                 
323         }
324         catch(exception& e) {
325                 pDataArray->m->errorOut(e, "PreClusterCommand", "MyPreclusterThreadFunction");
326                 exit(1);
327         }
328
329 #endif
330 /**************************************************************************************************/
331
332 #endif