]> git.donarmstrong.com Git - mothur.git/blob - shhhseqscommand.h
fixes while testing 1.33.0
[mothur.git] / shhhseqscommand.h
1 #ifndef SHHHSEQSCOMMAND_H
2 #define SHHHSEQSCOMMAND_H
3
4 /*
5  *  shhhseqscommand.h
6  *  Mothur
7  *
8  *  Created by westcott on 11/8/11.
9  *  Copyright 2011 Schloss Lab. All rights reserved.
10  *
11  */
12
13 #include "command.hpp"
14 #include "myseqdist.h"
15 #include "seqnoise.h"
16 #include "sequenceparser.h"
17 #include "deconvolutecommand.h"
18 #include "clustercommand.h"
19
20 //**********************************************************************************************************************
21
22 class ShhhSeqsCommand : public Command {
23         
24 public:
25         ShhhSeqsCommand(string);
26         ShhhSeqsCommand();
27         ~ShhhSeqsCommand() {}
28         
29         vector<string> setParameters();
30         string getCommandName()                 { return "shhh.seqs";   }
31         string getCommandCategory()             { return "Sequence Processing";         }
32         
33         string getHelpString(); 
34     string getOutputPattern(string);    
35         string getCitation() { return "Schloss PD, Gevers D, Westcott SL (2011).  Reducing the effects of PCR amplification and sequencing artifacts on 16S rRNA-based studies.  PLoS ONE.  6:e27310.\nQuince C, Lanzen A, Davenport RJ, Turnbaugh PJ (2011).  Removing noise from pyrosequenced amplicons.  BMC Bioinformatics  12:38.\nhttp://www.mothur.org/wiki/Shhh.seqs"; }
36         string getDescription()         { return "shhh.seqs"; }
37         
38         
39         int execute(); 
40         void help() { m->mothurOut(getHelpString()); }          
41         
42 private:
43         
44         struct linePair {
45                 int start;
46                 int end;
47                 linePair(int i, int j) : start(i), end(j) {}
48         };
49         
50         bool abort;
51         string outputDir, fastafile, namefile, groupfile;
52         int processors;
53         double sigma;
54         vector<string> outputNames;
55         
56         int readData(correctDist*, seqNoise&, vector<string>&, vector<string>&, vector<string>&, vector<int>&);
57         int loadData(correctDist*, seqNoise&, vector<string>&, vector<string>&, vector<string>&, vector<int>&, map<string, string>&, vector<Sequence>&);
58
59         int driver(seqNoise&, vector<string>&, vector<string>&, vector<string>&, vector<int>&, string, string, string, string);
60         vector<string> driverGroups(SequenceParser&, string, string, string, int, int, vector<string>);
61         vector<string> createProcessesGroups(SequenceParser&, string, string, string, vector<string>);
62         int deconvoluteResults(string, string);
63
64
65
66 };
67
68 /**************************************************************************************************/
69 //custom data structure for threads to use.
70 // This is passed by void pointer so it can be any data type
71 // that can be passed using a single void pointer (LPVOID).
72 struct shhhseqsData {
73         string fastafile; 
74         string namefile; 
75         string groupfile;
76         string newFName, newNName, newMName;
77         MothurOut* m;
78         int start;
79         int end;
80         int sigma, threadID, count;
81         vector<string> groups;
82         vector<string> mapfileNames;
83         
84         shhhseqsData(){}
85         shhhseqsData(string f, string n, string g, string nff,  string nnf, string nmf, vector<string> gr, MothurOut* mout, int st, int en, int s, int tid) {
86                 fastafile = f;
87                 namefile = n;
88                 groupfile = g;
89                 newFName = nff;
90                 newNName = nnf;
91                 newMName = nmf;
92                 m = mout;
93                 start = st;
94                 end = en;
95                 sigma = s;
96                 threadID = tid;
97                 groups = gr;
98         count=0;
99         }
100 };
101
102 /**************************************************************************************************/
103 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
104 #else
105 static DWORD WINAPI MyShhhSeqsThreadFunction(LPVOID lpParam){ 
106         shhhseqsData* pDataArray;
107         pDataArray = (shhhseqsData*)lpParam;
108         
109         try {
110                 
111                 //parse fasta and name file by group
112                 SequenceParser parser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile);
113                                                 
114                 //precluster each group
115                 for (int k = pDataArray->start; k < pDataArray->end; k++) {
116                         
117             pDataArray->count++;
118             
119                         int start = time(NULL);
120                         
121                         if (pDataArray->m->control_pressed) {  return 0; }
122                         
123                         pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Processing group " + pDataArray->groups[k] + ":"); pDataArray->m->mothurOutEndLine();
124
125                         map<string, string> thisNameMap;
126                         thisNameMap = parser.getNameMap(pDataArray->groups[k]); 
127                         vector<Sequence> thisSeqs = parser.getSeqs(pDataArray->groups[k]);
128                         
129                         if (pDataArray->m->control_pressed) {  return 0; }
130                         
131                         vector<string> sequences;
132                         vector<string> uniqueNames;
133                         vector<string> redundantNames;
134                         vector<int> seqFreq;
135                         
136                         seqNoise noise;
137                         correctDist* correct = new correctDist(1); //we use one processor since we already split up the work load.
138                         
139                         //load this groups info in order
140                         //loadData(correct, noise, sequences, uniqueNames, redundantNames, seqFreq, thisNameMap, thisSeqs);
141                         //////////////////////////////////////////////////////////////////////////////////////////////////
142                         bool error = false;
143                         map<string, string>::iterator it;
144                         
145                         for (int i = 0; i < thisSeqs.size(); i++) {
146                                 
147                                 if (pDataArray->m->control_pressed) { return 0; }
148                                 
149                                 if (thisSeqs[i].getName() != "") {
150                                         correct->addSeq(thisSeqs[i].getName(), thisSeqs[i].getAligned());
151                                         
152                                         it = thisNameMap.find(thisSeqs[i].getName());
153                                         if (it != thisNameMap.end()) {
154                                                 noise.addSeq(thisSeqs[i].getAligned(), sequences);
155                                                 noise.addRedundantName(it->first, it->second, uniqueNames, redundantNames, seqFreq);
156                                         }else {
157                                                 pDataArray->m->mothurOut("[ERROR]: " + thisSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct.");
158                                                 error = true;
159                                         }
160                                 }
161                         }
162                         
163                         if (error) { return 0; }
164                         //////////////////////////////////////////////////////////////////////////////////////////////////
165                         
166                         if (pDataArray->m->control_pressed) { return 0; }
167                         
168                         //calc distances for cluster
169                         string distFileName = pDataArray->m->getRootName(pDataArray->m->getSimpleName(pDataArray->fastafile)) + pDataArray->groups[k] + ".shhh.dist";
170                         correct->execute(distFileName);
171                         delete correct;
172                         
173                         if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(distFileName); return 0; }
174                         
175                         //driver(noise, sequences, uniqueNames, redundantNames, seqFreq, distFileName, newFFile+groups[i], newNFile+groups[i], newMFile+groups[i]+".map"); 
176                         ///////////////////////////////////////////////////////////////////////////////////////////////////
177                         double cutOff = 0.08;
178                         int minIter = 10;
179                         int maxIter = 1000;
180                         double minDelta = 1e-6;
181                         int numIters = 0;
182                         double maxDelta = 1e6;
183                         int numSeqs = sequences.size();
184                         
185                         //run cluster command
186                         string inputString = "phylip=" + distFileName + ", method=furthest, cutoff=0.08";
187                         pDataArray->m->mothurOut("/******************************************/"); pDataArray->m->mothurOutEndLine(); 
188                         pDataArray->m->mothurOut("Running command: cluster(" + inputString + ")"); pDataArray->m->mothurOutEndLine(); 
189                         
190                         Command* clusterCommand = new ClusterCommand(inputString);
191                         clusterCommand->execute();
192                         
193                         map<string, vector<string> > filenames = clusterCommand->getOutputFiles();
194                         string listFileName = filenames["list"][0];
195                         string rabundFileName = filenames["rabund"][0]; pDataArray->m->mothurRemove(rabundFileName);
196                         string sabundFileName = filenames["sabund"][0]; pDataArray->m->mothurRemove(sabundFileName);
197                         
198                         delete clusterCommand;
199                         pDataArray->m->mothurOut("/******************************************/"); pDataArray->m->mothurOutEndLine(); 
200                         
201                         if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(distFileName); pDataArray->m->mothurRemove(listFileName); return 0; }
202                         
203                         vector<double> distances(numSeqs * numSeqs);
204                         noise.getDistanceData(distFileName, distances);
205                         pDataArray->m->mothurRemove(distFileName); 
206                         if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(listFileName); return 0; }
207                         
208                         vector<int> otuData(numSeqs);
209                         vector<int> otuFreq;
210                         vector<vector<int> > otuBySeqLookUp;
211                         noise.getListData(listFileName, cutOff, otuData, otuFreq, otuBySeqLookUp);
212                         pDataArray->m->mothurRemove(listFileName);
213                         if (pDataArray->m->control_pressed) { return 0; }
214                         
215                         int numOTUs = otuFreq.size();
216                         
217                         vector<double> weights(numOTUs, 0);
218                         vector<int> change(numOTUs, 1);
219                         vector<int> centroids(numOTUs, -1);
220                         vector<int> cumCount(numOTUs, 0);
221                         
222                         vector<double> tau(numSeqs, 1);
223                         vector<int> anP(numSeqs, 0);
224                         vector<int> anI(numSeqs, 0);
225                         vector<int> anN(numSeqs, 0);
226                         vector<vector<int> > aanI = otuBySeqLookUp;
227                         
228                         while(numIters < minIter || ((maxDelta > minDelta) && (numIters < maxIter))){
229                                 
230                                 if (pDataArray->m->control_pressed) { return 0; }
231                                 
232                                 noise.updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount); if (pDataArray->m->control_pressed) { return 0; }
233                                 maxDelta = noise.calcNewWeights(weights, seqFreq, anI, cumCount, anP, otuFreq, tau);  if (pDataArray->m->control_pressed) { return 0; }
234                                 
235                                 noise.calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau); if (pDataArray->m->control_pressed) { return 0; }
236                                 noise.checkCentroids(weights, centroids); if (pDataArray->m->control_pressed) { return 0; }
237                                 
238                                 otuFreq.assign(numOTUs, 0);
239                                 
240                                 int total = 0;
241                                 
242                                 for(int i=0;i<numSeqs;i++){
243                                         if (pDataArray->m->control_pressed) { return 0; }
244                                         
245                                         double offset = 1e6;
246                                         double norm = 0.0000;
247                                         double minWeight = 0.1;
248                                         vector<double> currentTau(numOTUs);
249                                         
250                                         for(int j=0;j<numOTUs;j++){
251                                                 if (pDataArray->m->control_pressed) { return 0; }
252                                                 if(weights[j] > minWeight && distances[i * numSeqs+centroids[j]] < offset){
253                                                         offset = distances[i * numSeqs+centroids[j]];
254                                                 }
255                                         }
256                                         
257                                         for(int j=0;j<numOTUs;j++){
258                                                 if (pDataArray->m->control_pressed) { return 0; }
259                                                 if(weights[j] > minWeight){
260                                                         currentTau[j] = exp(pDataArray->sigma * (-distances[(i * numSeqs + centroids[j])] + offset)) * weights[j];
261                                                         norm += currentTau[j];
262                                                 }
263                                                 else{
264                                                         currentTau[j] = 0.0000;
265                                                 }
266                                         }                       
267                                         
268                                         for(int j=0;j<numOTUs;j++){
269                                                 if (pDataArray->m->control_pressed) { return 0; }
270                                                 currentTau[j] /= norm;
271                                         }
272                                         
273                                         for(int j=0;j<numOTUs;j++){
274                                                 if (pDataArray->m->control_pressed) { return 0; }
275                                                 
276                                                 if(currentTau[j] > 1.0e-4){
277                                                         int oldTotal = total;
278                                                         total++;
279                                                         
280                                                         tau.resize(oldTotal+1);
281                                                         tau[oldTotal] = currentTau[j];
282                                                         otuBySeqLookUp[j][otuFreq[j]] = oldTotal;
283                                                         aanI[j][otuFreq[j]] = i;
284                                                         otuFreq[j]++;
285                                                         
286                                                 }
287                                         }
288                                         
289                                         anP.resize(total);
290                                         anI.resize(total);
291                                 }
292                                 
293                                 numIters++;
294                         }
295                         
296                         noise.updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount);  if (pDataArray->m->control_pressed) { return 0; }
297                         
298                         vector<double> percentage(numSeqs);
299                         noise.setUpOTUData(otuData, percentage, cumCount, tau, otuFreq, anP, anI);  if (pDataArray->m->control_pressed) { return 0; }
300                         noise.finishOTUData(otuData, otuFreq, anP, anI, cumCount, otuBySeqLookUp, aanI, tau);  if (pDataArray->m->control_pressed) { return 0; }
301                         
302                         change.assign(numOTUs, 1);
303                         noise.calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau); if (pDataArray->m->control_pressed) { return 0; }
304                         
305                         
306                         vector<int> finalTau(numOTUs, 0);
307                         for(int i=0;i<numSeqs;i++){
308                                 if (pDataArray->m->control_pressed) { return 0; }
309                                 finalTau[otuData[i]] += int(seqFreq[i]);
310                         }
311                         
312                         noise.writeOutput(pDataArray->newFName+pDataArray->groups[k], pDataArray->newNName+pDataArray->groups[k], pDataArray->newMName+pDataArray->groups[k]+".map", finalTau, centroids, otuData, sequences, uniqueNames, redundantNames, seqFreq, distances);
313                         
314                         ///////////////////////////////////////////////////////////////////////////////////////////////////
315                         
316                         if (pDataArray->m->control_pressed) { return 0; }
317                         
318                         pDataArray->m->appendFiles(pDataArray->newFName+pDataArray->groups[k], pDataArray->newFName); pDataArray->m->mothurRemove(pDataArray->newFName+pDataArray->groups[k]);
319                         pDataArray->m->appendFiles(pDataArray->newNName+pDataArray->groups[k], pDataArray->newNName); pDataArray->m->mothurRemove(pDataArray->newNName+pDataArray->groups[k]);
320                         pDataArray->mapfileNames.push_back(pDataArray->newMName+pDataArray->groups[k]+".map");
321                         
322                         pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to process group " + pDataArray->groups[k] + "."); pDataArray->m->mothurOutEndLine(); 
323                 }
324                 
325                 return 0;
326                 
327                 
328         }
329         catch(exception& e) {
330                 pDataArray->m->errorOut(e, "ShhhSeqsCommand", "MyShhhSeqsThreadFunction");
331                 exit(1);
332         }
333
334 #endif
335 /**************************************************************************************************/
336
337 #endif