1 #ifndef SHHHSEQSCOMMAND_H
2 #define SHHHSEQSCOMMAND_H
8 * Created by westcott on 11/8/11.
9 * Copyright 2011 Schloss Lab. All rights reserved.
13 #include "command.hpp"
14 #include "myseqdist.h"
16 #include "sequenceparser.h"
17 #include "deconvolutecommand.h"
18 #include "clustercommand.h"
20 //**********************************************************************************************************************
22 class ShhhSeqsCommand : public Command {
25 ShhhSeqsCommand(string);
29 vector<string> setParameters();
30 string getCommandName() { return "shhh.seqs"; }
31 string getCommandCategory() { return "Sequence Processing"; }
33 string getHelpString();
34 string getOutputPattern(string);
35 string getCitation() { return "Schloss PD, Gevers D, Westcott SL (2011). Reducing the effects of PCR amplification and sequencing artifacts on 16S rRNA-based studies. PLoS ONE. 6:e27310.\nQuince C, Lanzen A, Davenport RJ, Turnbaugh PJ (2011). Removing noise from pyrosequenced amplicons. BMC Bioinformatics 12:38.\nhttp://www.mothur.org/wiki/Shhh.seqs"; }
36 string getDescription() { return "shhh.seqs"; }
40 void help() { m->mothurOut(getHelpString()); }
47 linePair(int i, int j) : start(i), end(j) {}
51 string outputDir, fastafile, namefile, groupfile;
54 vector<string> outputNames;
56 int readData(correctDist*, seqNoise&, vector<string>&, vector<string>&, vector<string>&, vector<int>&);
57 int loadData(correctDist*, seqNoise&, vector<string>&, vector<string>&, vector<string>&, vector<int>&, map<string, string>&, vector<Sequence>&);
59 int driver(seqNoise&, vector<string>&, vector<string>&, vector<string>&, vector<int>&, string, string, string, string);
60 vector<string> driverGroups(SequenceParser&, string, string, string, int, int, vector<string>);
61 vector<string> createProcessesGroups(SequenceParser&, string, string, string, vector<string>);
62 int deconvoluteResults(string, string);
68 /**************************************************************************************************/
69 //custom data structure for threads to use.
70 // This is passed by void pointer so it can be any data type
71 // that can be passed using a single void pointer (LPVOID).
76 string newFName, newNName, newMName;
80 int sigma, threadID, count;
81 vector<string> groups;
82 vector<string> mapfileNames;
85 shhhseqsData(string f, string n, string g, string nff, string nnf, string nmf, vector<string> gr, MothurOut* mout, int st, int en, int s, int tid) {
102 /**************************************************************************************************/
103 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
105 static DWORD WINAPI MyShhhSeqsThreadFunction(LPVOID lpParam){
106 shhhseqsData* pDataArray;
107 pDataArray = (shhhseqsData*)lpParam;
111 //parse fasta and name file by group
112 SequenceParser parser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile);
114 //precluster each group
115 for (int k = pDataArray->start; k < pDataArray->end; k++) {
119 int start = time(NULL);
121 if (pDataArray->m->control_pressed) { return 0; }
123 pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Processing group " + pDataArray->groups[k] + ":"); pDataArray->m->mothurOutEndLine();
125 map<string, string> thisNameMap;
126 thisNameMap = parser.getNameMap(pDataArray->groups[k]);
127 vector<Sequence> thisSeqs = parser.getSeqs(pDataArray->groups[k]);
129 if (pDataArray->m->control_pressed) { return 0; }
131 vector<string> sequences;
132 vector<string> uniqueNames;
133 vector<string> redundantNames;
137 correctDist* correct = new correctDist(1); //we use one processor since we already split up the work load.
139 //load this groups info in order
140 //loadData(correct, noise, sequences, uniqueNames, redundantNames, seqFreq, thisNameMap, thisSeqs);
141 //////////////////////////////////////////////////////////////////////////////////////////////////
143 map<string, string>::iterator it;
145 for (int i = 0; i < thisSeqs.size(); i++) {
147 if (pDataArray->m->control_pressed) { return 0; }
149 if (thisSeqs[i].getName() != "") {
150 correct->addSeq(thisSeqs[i].getName(), thisSeqs[i].getAligned());
152 it = thisNameMap.find(thisSeqs[i].getName());
153 if (it != thisNameMap.end()) {
154 noise.addSeq(thisSeqs[i].getAligned(), sequences);
155 noise.addRedundantName(it->first, it->second, uniqueNames, redundantNames, seqFreq);
157 pDataArray->m->mothurOut("[ERROR]: " + thisSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct.");
163 if (error) { return 0; }
164 //////////////////////////////////////////////////////////////////////////////////////////////////
166 if (pDataArray->m->control_pressed) { return 0; }
168 //calc distances for cluster
169 string distFileName = pDataArray->m->getRootName(pDataArray->m->getSimpleName(pDataArray->fastafile)) + pDataArray->groups[k] + ".shhh.dist";
170 correct->execute(distFileName);
173 if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(distFileName); return 0; }
175 //driver(noise, sequences, uniqueNames, redundantNames, seqFreq, distFileName, newFFile+groups[i], newNFile+groups[i], newMFile+groups[i]+".map");
176 ///////////////////////////////////////////////////////////////////////////////////////////////////
177 double cutOff = 0.08;
180 double minDelta = 1e-6;
182 double maxDelta = 1e6;
183 int numSeqs = sequences.size();
185 //run cluster command
186 string inputString = "phylip=" + distFileName + ", method=furthest, cutoff=0.08";
187 pDataArray->m->mothurOut("/******************************************/"); pDataArray->m->mothurOutEndLine();
188 pDataArray->m->mothurOut("Running command: cluster(" + inputString + ")"); pDataArray->m->mothurOutEndLine();
190 Command* clusterCommand = new ClusterCommand(inputString);
191 clusterCommand->execute();
193 map<string, vector<string> > filenames = clusterCommand->getOutputFiles();
194 string listFileName = filenames["list"][0];
195 string rabundFileName = filenames["rabund"][0]; pDataArray->m->mothurRemove(rabundFileName);
196 string sabundFileName = filenames["sabund"][0]; pDataArray->m->mothurRemove(sabundFileName);
198 delete clusterCommand;
199 pDataArray->m->mothurOut("/******************************************/"); pDataArray->m->mothurOutEndLine();
201 if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(distFileName); pDataArray->m->mothurRemove(listFileName); return 0; }
203 vector<double> distances(numSeqs * numSeqs);
204 noise.getDistanceData(distFileName, distances);
205 pDataArray->m->mothurRemove(distFileName);
206 if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(listFileName); return 0; }
208 vector<int> otuData(numSeqs);
210 vector<vector<int> > otuBySeqLookUp;
211 noise.getListData(listFileName, cutOff, otuData, otuFreq, otuBySeqLookUp);
212 pDataArray->m->mothurRemove(listFileName);
213 if (pDataArray->m->control_pressed) { return 0; }
215 int numOTUs = otuFreq.size();
217 vector<double> weights(numOTUs, 0);
218 vector<int> change(numOTUs, 1);
219 vector<int> centroids(numOTUs, -1);
220 vector<int> cumCount(numOTUs, 0);
222 vector<double> tau(numSeqs, 1);
223 vector<int> anP(numSeqs, 0);
224 vector<int> anI(numSeqs, 0);
225 vector<int> anN(numSeqs, 0);
226 vector<vector<int> > aanI = otuBySeqLookUp;
228 while(numIters < minIter || ((maxDelta > minDelta) && (numIters < maxIter))){
230 if (pDataArray->m->control_pressed) { return 0; }
232 noise.updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount); if (pDataArray->m->control_pressed) { return 0; }
233 maxDelta = noise.calcNewWeights(weights, seqFreq, anI, cumCount, anP, otuFreq, tau); if (pDataArray->m->control_pressed) { return 0; }
235 noise.calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau); if (pDataArray->m->control_pressed) { return 0; }
236 noise.checkCentroids(weights, centroids); if (pDataArray->m->control_pressed) { return 0; }
238 otuFreq.assign(numOTUs, 0);
242 for(int i=0;i<numSeqs;i++){
243 if (pDataArray->m->control_pressed) { return 0; }
246 double norm = 0.0000;
247 double minWeight = 0.1;
248 vector<double> currentTau(numOTUs);
250 for(int j=0;j<numOTUs;j++){
251 if (pDataArray->m->control_pressed) { return 0; }
252 if(weights[j] > minWeight && distances[i * numSeqs+centroids[j]] < offset){
253 offset = distances[i * numSeqs+centroids[j]];
257 for(int j=0;j<numOTUs;j++){
258 if (pDataArray->m->control_pressed) { return 0; }
259 if(weights[j] > minWeight){
260 currentTau[j] = exp(pDataArray->sigma * (-distances[(i * numSeqs + centroids[j])] + offset)) * weights[j];
261 norm += currentTau[j];
264 currentTau[j] = 0.0000;
268 for(int j=0;j<numOTUs;j++){
269 if (pDataArray->m->control_pressed) { return 0; }
270 currentTau[j] /= norm;
273 for(int j=0;j<numOTUs;j++){
274 if (pDataArray->m->control_pressed) { return 0; }
276 if(currentTau[j] > 1.0e-4){
277 int oldTotal = total;
280 tau.resize(oldTotal+1);
281 tau[oldTotal] = currentTau[j];
282 otuBySeqLookUp[j][otuFreq[j]] = oldTotal;
283 aanI[j][otuFreq[j]] = i;
296 noise.updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount); if (pDataArray->m->control_pressed) { return 0; }
298 vector<double> percentage(numSeqs);
299 noise.setUpOTUData(otuData, percentage, cumCount, tau, otuFreq, anP, anI); if (pDataArray->m->control_pressed) { return 0; }
300 noise.finishOTUData(otuData, otuFreq, anP, anI, cumCount, otuBySeqLookUp, aanI, tau); if (pDataArray->m->control_pressed) { return 0; }
302 change.assign(numOTUs, 1);
303 noise.calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau); if (pDataArray->m->control_pressed) { return 0; }
306 vector<int> finalTau(numOTUs, 0);
307 for(int i=0;i<numSeqs;i++){
308 if (pDataArray->m->control_pressed) { return 0; }
309 finalTau[otuData[i]] += int(seqFreq[i]);
312 noise.writeOutput(pDataArray->newFName+pDataArray->groups[k], pDataArray->newNName+pDataArray->groups[k], pDataArray->newMName+pDataArray->groups[k]+".map", finalTau, centroids, otuData, sequences, uniqueNames, redundantNames, seqFreq, distances);
314 ///////////////////////////////////////////////////////////////////////////////////////////////////
316 if (pDataArray->m->control_pressed) { return 0; }
318 pDataArray->m->appendFiles(pDataArray->newFName+pDataArray->groups[k], pDataArray->newFName); pDataArray->m->mothurRemove(pDataArray->newFName+pDataArray->groups[k]);
319 pDataArray->m->appendFiles(pDataArray->newNName+pDataArray->groups[k], pDataArray->newNName); pDataArray->m->mothurRemove(pDataArray->newNName+pDataArray->groups[k]);
320 pDataArray->mapfileNames.push_back(pDataArray->newMName+pDataArray->groups[k]+".map");
322 pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to process group " + pDataArray->groups[k] + "."); pDataArray->m->mothurOutEndLine();
329 catch(exception& e) {
330 pDataArray->m->errorOut(e, "ShhhSeqsCommand", "MyShhhSeqsThreadFunction");
335 /**************************************************************************************************/