8 * Created by Pat Schloss on 12/27/10.
9 * Copyright 2010 Schloss Lab. All rights reserved.
14 #include "command.hpp"
15 #include "readcolumn.h"
16 #include "readmatrix.hpp"
17 #include "rabundvector.hpp"
18 #include "sabundvector.hpp"
19 #include "listvector.hpp"
20 #include "cluster.hpp"
23 //**********************************************************************************************************************
28 #define MIN_WEIGHT 0.1
29 #define MIN_TAU 0.0001
31 //**********************************************************************************************************************
33 class ShhherCommand : public Command {
36 ShhherCommand(string);
40 vector<string> setParameters();
41 string getCommandName() { return "shhh.flows"; }
42 string getCommandCategory() { return "Sequence Processing"; }
44 string getHelpString();
45 string getOutputPattern(string);
46 string getCitation() { return "Schloss PD, Gevers D, Westcott SL (2011). Reducing the effects of PCR amplification and sequencing artifacts on 16S rRNA-based studies. PLoS ONE. 6:e27310.\nQuince C, Lanzen A, Davenport RJ, Turnbaugh PJ (2011). Removing noise from pyrosequenced amplicons. BMC Bioinformatics 12:38.\nQuince C, Lanzén A, Curtis TP, Davenport RJ, Hall N, Head IM, Read LF, Sloan WT (2009). Accurate determination of microbial diversity from 454 pyrosequencing data. Nat. Methods 6:639.\nhttp://www.mothur.org/wiki/Shhh.flows"; }
47 string getDescription() { return "shhh.flows"; }
51 void help() { m->mothurOut(getHelpString()); }
57 linePair(int i, int j) : start(i), end(j) {}
61 string outputDir, flowFileName, flowFilesFileName, lookupFileName, compositeFASTAFileName, compositeNamesFileName;
63 int processors, maxIters, largeSize;
64 float cutoff, sigma, minDelta;
67 vector<string> outputNames;
68 vector<double> singleLookUp;
69 vector<double> jointLookUp;
70 vector<string> flowFileVector;
72 vector<string> parseFlowFiles(string);
73 int driver(vector<string>, string, string);
74 int createProcesses(vector<string>);
75 int getFlowData(string, vector<string>&, vector<int>&, vector<short>&, map<string, int>&, int&);
76 int getUniques(int, int, vector<short>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<double>&, vector<short>&);
77 int flowDistParentFork(int, string, int, vector<int>&, vector<int>&, vector<int>&, vector<double>&, vector<short>&);
78 float calcPairwiseDist(int, int, int, vector<int>&, vector<int>&, vector<double>&, vector<short>&);
79 int createNamesFile(int, int, string, vector<string>&, vector<int>&, vector<int>&);
80 int cluster(string, string, string);
81 int getOTUData(int numSeqs, string, vector<int>&, vector<int>&, vector<int>&, vector<vector<int> >&, vector<vector<int> >&, vector<int>&, vector<int>&,map<string, int>&);
82 int calcCentroidsDriver(int numOTUs, vector<int>&, vector<int>&, vector<int>&, vector<short>&, vector<int>&, vector<double>&, vector<int>&, vector<short>&, vector<short>&, vector<int>&, int, vector<int>&);
83 double getDistToCentroid(int, int, int, vector<short>&, vector<short>&, int);
84 double getNewWeights(int, vector<int>&, vector<int>&, vector<double>&, vector<int>&, vector<double>&);
86 double getLikelihood(int, int, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<double>&, vector<double>&);
87 int checkCentroids(int, vector<int>&, vector<double>&);
88 void calcNewDistances(int, int, vector<int>& , vector<double>&,vector<double>& , vector<short>& change, vector<int>&,vector<vector<int> >&, vector<double>&, vector<vector<int> >&, vector<int>&, vector<int>&, vector<short>&, vector<short>&, int, vector<int>&);
89 int fill(int, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<vector<int> >&, vector<vector<int> >&);
90 void setOTUs(int, int, vector<int>&, vector<int>&, vector<int>&, vector<int>&,
91 vector<int>&, vector<double>&, vector<double>&, vector<vector<int> >&, vector<vector<int> >&);
92 void writeQualities(int, int, string, vector<int>, vector<int>&, vector<int>&, vector<double>&, vector<short>&, vector<short>&, vector<int>&, vector<int>&, vector<string>&, vector<int>&, vector<vector<int> >&);
93 void writeSequences(string, int, int, string, vector<int>, vector<short>&, vector<string>&, vector<vector<int> >&, vector<int>&);
94 void writeNames(string, int, string, vector<int>, vector<string>&, vector<vector<int> >&, vector<int>&);
95 void writeGroups(string, string, int, vector<string>&);
96 void writeClusters(string, int, int, vector<int>, vector<int>&, vector<short>&, vector<string>&, vector<vector<int> >&, vector<int>&, vector<int>&, vector<short>&);
98 void getSingleLookUp();
99 void getJointLookUp();
100 double getProbIntensity(int);
104 string flowDistMPI(int, int);
105 void calcNewDistancesChildMPI(int, int, vector<int>&);
112 float calcPairwiseDist(int, int);
113 void flowDistParentFork(string, int, int);
115 string createDistFile(int);
116 string createNamesFile();
117 string cluster(string, string);
119 void getOTUData(string);
120 void initPyroCluster();
122 void calcCentroids();
123 void calcCentroidsDriver(int, int);
124 double getDistToCentroid(int, int, int);
125 double getNewWeights();
126 double getLikelihood();
127 void checkCentroids();
128 void calcNewDistances();
129 void calcNewDistancesParent(int, int);
130 void calcNewDistancesChild(int, int, vector<int>&, vector<int>&, vector<double>&);
134 void writeQualities(vector<int>);
135 void writeSequences(vector<int>);
136 void writeNames(vector<int>);
138 void writeClusters(vector<int>);
140 vector<string> seqNameVector;
142 vector<short> flowDataIntI;
143 vector<double> flowDataPrI;
144 map<string, int> nameMap;
146 vector<int> cumNumSeqs;
147 vector<int> nSeqsPerOTU;
148 vector<vector<int> > aaP; //tMaster->aanP: each row is a different otu / each col contains the sequence indices
149 vector<vector<int> > aaI; //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI
150 vector<int> seqNumber; //tMaster->anP: the sequence id number sorted by OTU
151 vector<int> seqIndex; //tMaster->anI; the index that corresponds to seqNumber
152 vector<double> dist; //adDist - distance of sequences to centroids
153 vector<short> change; //did the centroid sequence change? 0 = no; 1 = yes
154 vector<int> centroids; //the representative flowgram for each cluster m
155 vector<double> weight;
156 vector<double> singleTau; //tMaster->adTau: 1-D Tau vector (1xnumSeqs)
157 vector<short> uniqueFlowgrams;
158 vector<int> uniqueCount;
159 vector<int> mapSeqToUnique;
160 vector<int> mapUniqueToSeq;
161 vector<int> uniqueLengths;
162 int numSeqs, numUniques, numOTUs, numFlowCells;
163 vector<int> nSeqsBreaks;
164 vector<int> nOTUsBreaks;
170 /**************************************************************************************************
171 //custom data structure for threads to use.
172 // This is passed by void pointer so it can be any data type
173 // that can be passed using a single void pointer (LPVOID).
174 struct shhhFlowsData {
175 int threadID, maxIters;
176 float cutoff, sigma, minDelta;
178 vector<double> singleLookUp;
179 vector<double> jointLookUp;
180 vector<string> filenames;
181 vector<string> outputNames;
182 string thisCompositeFASTAFileName, thisCompositeNameFileName, outputDir;
187 shhhFlowsData(vector<string> f, string cf, string cn, string ou, string flor, vector<double> jl, vector<double> sl, MothurOut* mout, int st, int sp, float cut, float si, float mD, int mx, int tid) {
189 thisCompositeFASTAFileName = cf;
190 thisCompositeNameFileName = cn;
206 /**************************************************************************************************
207 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
209 static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
210 shhhFlowsData* pDataArray;
211 pDataArray = (shhhFlowsData*)lpParam;
215 for(int l=pDataArray->start;l<pDataArray->stop;l++){
217 if (pDataArray->m->control_pressed) { break; }
219 string flowFileName = pDataArray->filenames[l];
221 pDataArray->m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(l+1) + " of " + toString(pDataArray->filenames.size()) + ")\t<<<<<\n");
222 pDataArray->m->mothurOut("Reading flowgrams...\n");
224 vector<string> seqNameVector;
226 vector<short> flowDataIntI;
227 vector<double> flowDataPrI;
228 map<string, int> nameMap;
229 vector<short> uniqueFlowgrams;
230 vector<int> uniqueCount;
231 vector<int> mapSeqToUnique;
232 vector<int> mapUniqueToSeq;
233 vector<int> uniqueLengths;
236 //int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
237 /*****************************************************************************************************
240 // cout << "herethread " << flowFileName << '\t' << &flowFile << endl;
241 pDataArray->m->openInputFile(flowFileName, flowFile);
243 // cout << "herethread " << flowFileName << endl;
245 int currentNumFlowCells;
248 flowFile >> numFlowCells;
249 int index = 0;//pcluster
250 while(!flowFile.eof()){
252 if (pDataArray->m->control_pressed) { flowFile.close(); return 0; }
254 flowFile >> seqName >> currentNumFlowCells;
255 lengths.push_back(currentNumFlowCells);
256 // cout << "herethread " << seqName << endl;
257 seqNameVector.push_back(seqName);
258 nameMap[seqName] = index++;//pcluster
260 for(int i=0;i<numFlowCells;i++){
261 flowFile >> intensity;
262 if(intensity > 9.99) { intensity = 9.99; }
263 int intI = int(100 * intensity + 0.0001);
264 flowDataIntI.push_back(intI);
266 pDataArray->m->gobble(flowFile);
270 int numSeqs = seqNameVector.size();
271 // cout << numSeqs << endl;
272 for(int i=0;i<numSeqs;i++){
274 if (pDataArray->m->control_pressed) { return 0; }
276 int iNumFlowCells = i * numFlowCells;
277 for(int j=lengths[i];j<numFlowCells;j++){
278 flowDataIntI[iNumFlowCells + j] = 0;
281 // cout << "here" << endl;
282 /*****************************************************************************************************
284 if (pDataArray->m->control_pressed) { return 0; }
286 pDataArray->m->mothurOut("Identifying unique flowgrams...\n");
287 //int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
288 /*****************************************************************************************************
290 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
291 uniqueCount.assign(numSeqs, 0); // anWeights
292 uniqueLengths.assign(numSeqs, 0);
293 mapSeqToUnique.assign(numSeqs, -1);
294 mapUniqueToSeq.assign(numSeqs, -1);
296 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
298 for(int i=0;i<numSeqs;i++){
300 if (pDataArray->m->control_pressed) { return 0; }
304 vector<short> current(numFlowCells);
305 for(int j=0;j<numFlowCells;j++){
306 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
309 for(int j=0;j<numUniques;j++){
310 int offset = j * numFlowCells;
314 if(lengths[i] < uniqueLengths[j]) { shorterLength = lengths[i]; }
315 else { shorterLength = uniqueLengths[j]; }
317 for(int k=0;k<shorterLength;k++){
318 if(current[k] != uniqueFlowgrams[offset + k]){
325 mapSeqToUnique[i] = j;
328 if(lengths[i] > uniqueLengths[j]) { uniqueLengths[j] = lengths[i]; }
334 if(index == numUniques){
335 uniqueLengths[numUniques] = lengths[i];
336 uniqueCount[numUniques] = 1;
337 mapSeqToUnique[i] = numUniques;//anMap
338 mapUniqueToSeq[numUniques] = i;//anF
340 for(int k=0;k<numFlowCells;k++){
341 uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
342 uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
348 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
349 uniqueLengths.resize(numUniques);
351 flowDataPrI.resize(numSeqs * numFlowCells, 0);
352 for(int i=0;i<flowDataPrI.size();i++) {
353 if (pDataArray->m->control_pressed) { return 0; }
354 //flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);
356 flowDataPrI[i] = 100000000;
358 for(int j=0;j<HOMOPS;j++){//loop signal strength
360 if (pDataArray->m->control_pressed) { return 0; }
362 float negLogProb = pDataArray->singleLookUp[j * NUMBINS + flowDataIntI[i]];
363 if(negLogProb < flowDataPrI[i]) { flowDataPrI[i] = negLogProb; }
367 /*****************************************************************************************************
369 if (pDataArray->m->control_pressed) { return 0; }
371 pDataArray->m->mothurOut("Calculating distances between flowgrams...\n");
372 string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
373 unsigned long long begTime = time(NULL);
374 double begClock = clock();
376 //flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
377 /*****************************************************************************************************
378 ostringstream outStream;
379 outStream.setf(ios::fixed, ios::floatfield);
380 outStream.setf(ios::dec, ios::basefield);
381 outStream.setf(ios::showpoint);
382 outStream.precision(6);
384 int thisbegTime = time(NULL);
385 double thisbegClock = clock();
387 for(int i=0;i<numUniques;i++){
389 if (pDataArray->m->control_pressed) { break; }
391 for(int j=0;j<i;j++){
392 //float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
393 /*****************************************************************************************************
394 int seqA = mapUniqueToSeq[i]; int seqB = mapUniqueToSeq[j];
395 int minLength = lengths[mapSeqToUnique[seqA]];
396 if(lengths[seqB] < minLength){ minLength = lengths[mapSeqToUnique[seqB]]; }
398 int ANumFlowCells = seqA * numFlowCells;
399 int BNumFlowCells = seqB * numFlowCells;
401 float flowDistance = 0;
403 for(int k=0;k<minLength;k++){
405 if (pDataArray->m->control_pressed) { break; }
407 int flowAIntI = flowDataIntI[ANumFlowCells + k];
408 float flowAPrI = flowDataPrI[ANumFlowCells + k];
410 int flowBIntI = flowDataIntI[BNumFlowCells + k];
411 float flowBPrI = flowDataPrI[BNumFlowCells + k];
412 flowDistance += pDataArray->jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
415 flowDistance /= (float) minLength;
416 /*****************************************************************************************************
418 if(flowDistance < 1e-6){
419 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
421 else if(flowDistance <= pDataArray->cutoff){
422 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
426 pDataArray->m->mothurOut(toString(i) + "\t" + toString(time(NULL) - thisbegTime));
427 pDataArray->m->mothurOut("\t" + toString((clock()-thisbegClock)/CLOCKS_PER_SEC));
428 pDataArray->m->mothurOutEndLine();
432 ofstream distFile(distFileName.c_str());
433 distFile << outStream.str();
436 if (pDataArray->m->control_pressed) {}
438 pDataArray->m->mothurOut(toString(numUniques-1) + "\t" + toString(time(NULL) - thisbegTime));
439 pDataArray->m->mothurOut("\t" + toString((clock()-thisbegClock)/CLOCKS_PER_SEC));
440 pDataArray->m->mothurOutEndLine();
442 /*****************************************************************************************************
444 pDataArray->m->mothurOutEndLine();
445 pDataArray->m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
447 string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
448 //createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
449 /*****************************************************************************************************
450 vector<string> duplicateNames(numUniques, "");
451 for(int i=0;i<numSeqs;i++){
452 duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
456 pDataArray->m->openOutputFile(namesFileName, nameFile);
458 for(int i=0;i<numUniques;i++){
459 if (pDataArray->m->control_pressed) { nameFile.close(); return 0; }
460 nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
463 /*****************************************************************************************************
465 if (pDataArray->m->control_pressed) { return 0; }
467 pDataArray->m->mothurOut("\nClustering flowgrams...\n");
468 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
469 //cluster(listFileName, distFileName, namesFileName);
470 /*****************************************************************************************************
471 ReadMatrix* read = new ReadColumnMatrix(distFileName);
472 read->setCutoff(pDataArray->cutoff);
474 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
475 clusterNameMap->readMap();
476 read->read(clusterNameMap);
478 ListVector* list = read->getListVector();
479 SparseMatrix* matrix = read->getMatrix();
482 delete clusterNameMap;
484 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
486 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, pDataArray->cutoff, "furthest");
487 string tag = cluster->getTag();
489 double clusterCutoff = pDataArray->cutoff;
490 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
492 if (pDataArray->m->control_pressed) { break; }
494 cluster->update(clusterCutoff);
497 list->setLabel(toString(pDataArray->cutoff));
499 ofstream listFileOut;
500 pDataArray->m->openOutputFile(listFileName, listFileOut);
501 list->print(listFileOut);
504 delete matrix; delete cluster; delete rabund; delete list;
505 /*****************************************************************************************************
507 if (pDataArray->m->control_pressed) { return 0; }
510 vector<int> cumNumSeqs;
511 vector<int> nSeqsPerOTU;
512 vector<vector<int> > aaP; //tMaster->aanP: each row is a different otu / each col contains the sequence indices
513 vector<vector<int> > aaI; //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI
514 vector<int> seqNumber; //tMaster->anP: the sequence id number sorted by OTU
515 vector<int> seqIndex; //tMaster->anI; the index that corresponds to seqNumber
518 //int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
519 /*****************************************************************************************************
521 pDataArray->m->openInputFile(listFileName, listFile);
525 listFile >> label >> numOTUs;
527 otuData.assign(numSeqs, 0);
528 cumNumSeqs.assign(numOTUs, 0);
529 nSeqsPerOTU.assign(numOTUs, 0);
530 aaP.clear();aaP.resize(numOTUs);
536 string singleOTU = "";
538 for(int i=0;i<numOTUs;i++){
540 if (pDataArray->m->control_pressed) { break; }
542 listFile >> singleOTU;
544 istringstream otuString(singleOTU);
550 for(int j=0;j<singleOTU.length();j++){
551 char letter = otuString.get();
557 map<string,int>::iterator nmIt = nameMap.find(seqName);
558 int index = nmIt->second;
564 aaP[i].push_back(index);
569 map<string,int>::iterator nmIt = nameMap.find(seqName);
571 int index = nmIt->second;
576 aaP[i].push_back(index);
581 sort(aaP[i].begin(), aaP[i].end());
582 for(int j=0;j<nSeqsPerOTU[i];j++){
583 seqNumber.push_back(aaP[i][j]);
585 for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
592 for(int i=1;i<numOTUs;i++){
593 cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
596 seqIndex = seqNumber;
599 /*****************************************************************************************************
601 if (pDataArray->m->control_pressed) { return 0; }
603 pDataArray->m->mothurRemove(distFileName);
604 pDataArray->m->mothurRemove(namesFileName);
605 pDataArray->m->mothurRemove(listFileName);
607 vector<double> dist; //adDist - distance of sequences to centroids
608 vector<short> change; //did the centroid sequence change? 0 = no; 1 = yes
609 vector<int> centroids; //the representative flowgram for each cluster m
610 vector<double> weight;
611 vector<double> singleTau; //tMaster->adTau: 1-D Tau vector (1xnumSeqs)
612 vector<int> nSeqsBreaks;
613 vector<int> nOTUsBreaks;
615 dist.assign(numSeqs * numOTUs, 0);
616 change.assign(numOTUs, 1);
617 centroids.assign(numOTUs, -1);
618 weight.assign(numOTUs, 0);
619 singleTau.assign(numSeqs, 1.0);
621 nSeqsBreaks.assign(2, 0);
622 nOTUsBreaks.assign(2, 0);
625 nSeqsBreaks[1] = numSeqs;
626 nOTUsBreaks[1] = numOTUs;
628 if (pDataArray->m->control_pressed) { break; }
634 begTime = time(NULL);
636 pDataArray->m->mothurOut("\nDenoising flowgrams...\n");
637 pDataArray->m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
639 while((pDataArray->maxIters == 0 && maxDelta > pDataArray->minDelta) || iter < MIN_ITER || (maxDelta > pDataArray->minDelta && iter < pDataArray->maxIters)){
641 if (pDataArray->m->control_pressed) { break; }
643 double cycClock = clock();
644 unsigned long long cycTime = time(NULL);
645 //fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
646 /*****************************************************************************************************
648 for(int i=0;i<numOTUs;i++){
650 if (pDataArray->m->control_pressed) { return 0; }
652 cumNumSeqs[i] = indexFill;
653 for(int j=0;j<nSeqsPerOTU[i];j++){
654 seqNumber[indexFill] = aaP[i][j];
655 seqIndex[indexFill] = aaI[i][j];
660 /*****************************************************************************************************
663 if (pDataArray->m->control_pressed) { break; }
665 //calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
666 /*****************************************************************************************************
667 for(int i=0;i<numOTUs;i++){
669 if (pDataArray->m->control_pressed) { break; }
673 int minFlowGram = 100000000;
674 double minFlowValue = 1e8;
675 change[i] = 0; //FALSE
677 for(int j=0;j<nSeqsPerOTU[i];j++){
678 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
681 if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
682 vector<double> adF(nSeqsPerOTU[i]);
683 vector<int> anL(nSeqsPerOTU[i]);
685 for(int j=0;j<nSeqsPerOTU[i];j++){
686 int index = cumNumSeqs[i] + j;
687 int nI = seqIndex[index];
688 int nIU = mapSeqToUnique[nI];
691 for(k=0;k<position;k++){
698 adF[position] = 0.0000;
703 for(int j=0;j<nSeqsPerOTU[i];j++){
704 int index = cumNumSeqs[i] + j;
705 int nI = seqIndex[index];
707 double tauValue = singleTau[seqNumber[index]];
709 for(int k=0;k<position;k++){
710 // double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
711 /*****************************************************************************************************
712 int flowAValue = anL[k] * numFlowCells;
713 int flowBValue = nI * numFlowCells;
717 for(int l=0;l<lengths[nI];l++){
718 dist += pDataArray->singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
723 dist = dist / (double)lengths[nI];
724 /*****************************************************************************************************
725 adF[k] += dist * tauValue;
729 for(int j=0;j<position;j++){
730 if(adF[j] < minFlowValue){
732 minFlowValue = adF[j];
736 if(centroids[i] != anL[minFlowGram]){
738 centroids[i] = anL[minFlowGram];
741 else if(centroids[i] != -1){
746 /*****************************************************************************************************
748 if (pDataArray->m->control_pressed) { break; }
750 //maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);
751 /*****************************************************************************************************
752 double maxChange = 0;
754 for(int i=0;i<numOTUs;i++){
756 if (pDataArray->m->control_pressed) { break; }
758 double difference = weight[i];
761 for(int j=0;j<nSeqsPerOTU[i];j++){
762 int index = cumNumSeqs[i] + j;
763 double tauValue = singleTau[seqNumber[index]];
764 weight[i] += tauValue;
767 difference = fabs(weight[i] - difference);
768 if(difference > maxChange){ maxChange = difference; }
770 maxDelta = maxChange;
771 /*****************************************************************************************************
773 if (pDataArray->m->control_pressed) { break; }
775 //double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight);
776 /*****************************************************************************************************
777 vector<long double> P(numSeqs, 0);
780 for(int i=0;i<numOTUs;i++){
781 if(weight[i] > MIN_WEIGHT){
787 for(int i=0;i<numOTUs;i++){
789 if (pDataArray->m->control_pressed) { break; }
791 for(int j=0;j<nSeqsPerOTU[i];j++){
792 int index = cumNumSeqs[i] + j;
793 int nI = seqIndex[index];
794 double singleDist = dist[seqNumber[index]];
796 P[nI] += weight[i] * exp(-singleDist * pDataArray->sigma);
800 for(int i=0;i<numSeqs;i++){
801 if(P[i] == 0){ P[i] = DBL_EPSILON; }
806 nLL = nLL -(double)numSeqs * log(pDataArray->sigma);
807 /*****************************************************************************************************
809 if (pDataArray->m->control_pressed) { break; }
811 //checkCentroids(numOTUs, centroids, weight);
812 /*****************************************************************************************************
813 vector<int> unique(numOTUs, 1);
815 for(int i=0;i<numOTUs;i++){
816 if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
821 for(int i=0;i<numOTUs;i++){
823 if (pDataArray->m->control_pressed) { break; }
826 for(int j=i+1;j<numOTUs;j++){
829 if(centroids[j] == centroids[i]){
833 weight[i] += weight[j];
840 /*****************************************************************************************************
842 if (pDataArray->m->control_pressed) { break; }
844 //calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU, dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
845 /*****************************************************************************************************
847 vector<double> newTau(numOTUs,0);
848 vector<double> norms(numSeqs, 0);
849 nSeqsPerOTU.assign(numOTUs, 0);
851 for(int i=0;i<numSeqs;i++){
853 if (pDataArray->m->control_pressed) { break; }
855 int indexOffset = i * numOTUs;
859 for(int j=0;j<numOTUs;j++){
861 if(weight[j] > MIN_WEIGHT && change[j] == 1){
862 //dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
863 /*****************************************************************************************************
864 int flowAValue = centroids[j] * numFlowCells;
865 int flowBValue = i * numFlowCells;
869 for(int l=0;l<lengths[i];l++){
870 distTemp += pDataArray->singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
875 dist[indexOffset + j] = distTemp / (double)lengths[i];
876 /*****************************************************************************************************
880 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
881 offset = dist[indexOffset + j];
885 for(int j=0;j<numOTUs;j++){
886 if(weight[j] > MIN_WEIGHT){
887 newTau[j] = exp(pDataArray->sigma * (-dist[indexOffset + j] + offset)) * weight[j];
888 norms[i] += newTau[j];
895 for(int j=0;j<numOTUs;j++){
896 newTau[j] /= norms[i];
899 for(int j=0;j<numOTUs;j++){
900 if(newTau[j] > MIN_TAU){
902 int oldTotal = total;
906 singleTau.resize(total, 0);
907 seqNumber.resize(total, 0);
908 seqIndex.resize(total, 0);
910 singleTau[oldTotal] = newTau[j];
912 aaP[j][nSeqsPerOTU[j]] = oldTotal;
913 aaI[j][nSeqsPerOTU[j]] = i;
920 /*****************************************************************************************************
922 if (pDataArray->m->control_pressed) { break; }
926 pDataArray->m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
930 if (pDataArray->m->control_pressed) { break; }
932 pDataArray->m->mothurOut("\nFinalizing...\n");
933 //fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
934 /*****************************************************************************************************
936 for(int i=0;i<numOTUs;i++){
938 if (pDataArray->m->control_pressed) { return 0; }
940 cumNumSeqs[i] = indexFill;
941 for(int j=0;j<nSeqsPerOTU[i];j++){
942 seqNumber[indexFill] = aaP[i][j];
943 seqIndex[indexFill] = aaI[i][j];
948 /*****************************************************************************************************
950 if (pDataArray->m->control_pressed) { break; }
952 //setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
953 /*****************************************************************************************************
954 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
956 for(int i=0;i<numOTUs;i++){
958 if (pDataArray->m->control_pressed) { break; }
960 for(int j=0;j<nSeqsPerOTU[i];j++){
961 int index = cumNumSeqs[i] + j;
962 double tauValue = singleTau[seqNumber[index]];
963 int sIndex = seqIndex[index];
964 bigTauMatrix[sIndex * numOTUs + i] = tauValue;
968 for(int i=0;i<numSeqs;i++){
969 double maxTau = -1.0000;
971 for(int j=0;j<numOTUs;j++){
972 if(bigTauMatrix[i * numOTUs + j] > maxTau){
973 maxTau = bigTauMatrix[i * numOTUs + j];
981 nSeqsPerOTU.assign(numOTUs, 0);
983 for(int i=0;i<numSeqs;i++){
984 int index = otuData[i];
986 singleTau[i] = 1.0000;
989 aaP[index][nSeqsPerOTU[index]] = i;
990 aaI[index][nSeqsPerOTU[index]] = i;
992 nSeqsPerOTU[index]++;
995 //fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
996 /*****************************************************************************************************
998 for(int i=0;i<numOTUs;i++){
1000 if (pDataArray->m->control_pressed) { return 0; }
1002 cumNumSeqs[i] = indexFill;
1003 for(int j=0;j<nSeqsPerOTU[i];j++){
1004 seqNumber[indexFill] = aaP[i][j];
1005 seqIndex[indexFill] = aaI[i][j];
1010 /*****************************************************************************************************/
1012 /*****************************************************************************************************
1014 if (pDataArray->m->control_pressed) { break; }
1016 vector<int> otuCounts(numOTUs, 0);
1017 for(int i=0;i<numSeqs;i++) { otuCounts[otuData[i]]++; }
1019 //calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
1020 /*****************************************************************************************************
1021 for(int i=0;i<numOTUs;i++){
1023 if (pDataArray->m->control_pressed) { break; }
1027 int minFlowGram = 100000000;
1028 double minFlowValue = 1e8;
1029 change[i] = 0; //FALSE
1031 for(int j=0;j<nSeqsPerOTU[i];j++){
1032 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1035 if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1036 vector<double> adF(nSeqsPerOTU[i]);
1037 vector<int> anL(nSeqsPerOTU[i]);
1039 for(int j=0;j<nSeqsPerOTU[i];j++){
1040 int index = cumNumSeqs[i] + j;
1041 int nI = seqIndex[index];
1042 int nIU = mapSeqToUnique[nI];
1045 for(k=0;k<position;k++){
1051 anL[position] = nIU;
1052 adF[position] = 0.0000;
1057 for(int j=0;j<nSeqsPerOTU[i];j++){
1058 int index = cumNumSeqs[i] + j;
1059 int nI = seqIndex[index];
1061 double tauValue = singleTau[seqNumber[index]];
1063 for(int k=0;k<position;k++){
1064 // double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
1065 /*****************************************************************************************************
1066 int flowAValue = anL[k] * numFlowCells;
1067 int flowBValue = nI * numFlowCells;
1071 for(int l=0;l<lengths[nI];l++){
1072 dist += pDataArray->singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1077 dist = dist / (double)lengths[nI];
1078 /*****************************************************************************************************
1079 adF[k] += dist * tauValue;
1083 for(int j=0;j<position;j++){
1084 if(adF[j] < minFlowValue){
1086 minFlowValue = adF[j];
1090 if(centroids[i] != anL[minFlowGram]){
1092 centroids[i] = anL[minFlowGram];
1095 else if(centroids[i] != -1){
1101 /*****************************************************************************************************
1103 if (pDataArray->m->control_pressed) { break; }
1105 //writeQualities(numOTUs, numFlowCells, flowFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI);
1106 if (pDataArray->m->control_pressed) { break; }
1107 /*****************************************************************************************************
1108 string thisOutputDir = pDataArray->outputDir;
1109 if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); }
1110 string qualityFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.qual";
1112 ofstream qualityFile;
1113 pDataArray->m->openOutputFile(qualityFileName, qualityFile);
1115 qualityFile.setf(ios::fixed, ios::floatfield);
1116 qualityFile.setf(ios::showpoint);
1117 qualityFile << setprecision(6);
1119 vector<vector<int> > qualities(numOTUs);
1120 vector<double> pr(HOMOPS, 0);
1123 for(int i=0;i<numOTUs;i++){
1125 if (pDataArray->m->control_pressed) { break; }
1130 if(nSeqsPerOTU[i] > 0){
1131 qualities[i].assign(1024, -1);
1133 while(index < numFlowCells){
1134 double maxPrValue = 1e8;
1135 short maxPrIndex = -1;
1136 double count = 0.0000;
1138 pr.assign(HOMOPS, 0);
1140 for(int j=0;j<nSeqsPerOTU[i];j++){
1141 int lIndex = cumNumSeqs[i] + j;
1142 double tauValue = singleTau[seqNumber[lIndex]];
1143 int sequenceIndex = aaI[i][j];
1144 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1148 for(int s=0;s<HOMOPS;s++){
1149 pr[s] += tauValue * pDataArray->singleLookUp[s * NUMBINS + intensity];
1153 maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1154 maxPrValue = pr[maxPrIndex];
1156 if(count > MIN_COUNT){
1158 double norm = 0.0000;
1160 for(int s=0;s<HOMOPS;s++){
1161 norm += exp(-(pr[s] - maxPrValue));
1164 for(int s=1;s<=maxPrIndex;s++){
1166 double temp = 0.0000;
1168 U += exp(-(pr[s-1]-maxPrValue))/norm;
1176 temp = floor(-10 * temp);
1177 value = (int)floor(temp);
1178 if(value > 100){ value = 100; }
1180 qualities[i][base] = (int)value;
1190 if(otuCounts[i] > 0){
1191 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1193 int j=4; //need to get past the first four bases
1194 while(qualities[i][j] != -1){
1195 qualityFile << qualities[i][j] << ' ';
1198 qualityFile << endl;
1201 qualityFile.close();
1202 pDataArray->outputNames.push_back(qualityFileName);
1203 /*****************************************************************************************************
1205 // writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, flowFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);
1206 if (pDataArray->m->control_pressed) { break; }
1207 /*****************************************************************************************************
1208 thisOutputDir = pDataArray->outputDir;
1209 if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); }
1210 string fastaFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.fasta";
1212 pDataArray->m->openOutputFile(fastaFileName, fastaFile);
1214 vector<string> names(numOTUs, "");
1216 for(int i=0;i<numOTUs;i++){
1218 if (pDataArray->m->control_pressed) { break; }
1220 int index = centroids[i];
1222 if(otuCounts[i] > 0){
1223 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1227 for(int j=0;j<numFlowCells;j++){
1229 char base = pDataArray->flowOrder[j % 4];
1230 for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1235 fastaFile << newSeq.substr(4) << endl;
1240 pDataArray->outputNames.push_back(fastaFileName);
1242 if(pDataArray->thisCompositeFASTAFileName != ""){
1243 pDataArray->m->appendFiles(fastaFileName, pDataArray->thisCompositeFASTAFileName);
1246 /*****************************************************************************************************
1248 //writeNames(thisCompositeNamesFileName, numOTUs, flowFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);
1249 if (pDataArray->m->control_pressed) { break; }
1250 /*****************************************************************************************************
1251 thisOutputDir = pDataArray->outputDir;
1252 if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); }
1253 string nameFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.names";
1254 ofstream nameFileOut;
1255 pDataArray->m->openOutputFile(nameFileName, nameFileOut);
1257 for(int i=0;i<numOTUs;i++){
1259 if (pDataArray->m->control_pressed) { break; }
1261 if(otuCounts[i] > 0){
1262 nameFileOut << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1264 for(int j=1;j<nSeqsPerOTU[i];j++){
1265 nameFileOut << ',' << seqNameVector[aaI[i][j]];
1268 nameFileOut << endl;
1271 nameFileOut.close();
1272 pDataArray->outputNames.push_back(nameFileName);
1275 if(pDataArray->thisCompositeNameFileName != ""){
1276 pDataArray->m->appendFiles(nameFileName, pDataArray->thisCompositeNameFileName);
1278 /*****************************************************************************************************
1280 //writeClusters(flowFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);
1281 if (pDataArray->m->control_pressed) { break; }
1282 /*****************************************************************************************************
1283 thisOutputDir = pDataArray->outputDir;
1284 if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); }
1285 string otuCountsFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.counts";
1286 ofstream otuCountsFile;
1287 pDataArray->m->openOutputFile(otuCountsFileName, otuCountsFile);
1289 string bases = pDataArray->flowOrder;
1291 for(int i=0;i<numOTUs;i++){
1293 if (pDataArray->m->control_pressed) {
1296 //output the translated version of the centroid sequence for the otu
1297 if(otuCounts[i] > 0){
1298 int index = centroids[i];
1300 otuCountsFile << "ideal\t";
1301 for(int j=8;j<numFlowCells;j++){
1302 char base = bases[j % 4];
1303 for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1304 otuCountsFile << base;
1307 otuCountsFile << endl;
1309 for(int j=0;j<nSeqsPerOTU[i];j++){
1310 int sequence = aaI[i][j];
1311 otuCountsFile << seqNameVector[sequence] << '\t';
1315 for(int k=0;k<lengths[sequence];k++){
1316 char base = bases[k % 4];
1317 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1319 for(int s=0;s<freq;s++){
1321 //otuCountsFile << base;
1324 otuCountsFile << newSeq.substr(4) << endl;
1326 otuCountsFile << endl;
1329 otuCountsFile.close();
1330 pDataArray->outputNames.push_back(otuCountsFileName)
1331 /*****************************************************************************************************
1333 //writeGroups(flowFileName, numSeqs, seqNameVector);
1334 if (pDataArray->m->control_pressed) { break; }
1335 /*****************************************************************************************************
1336 thisOutputDir = pDataArray->outputDir;
1337 if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); }
1338 string fileRoot = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName));
1339 string groupFileName = fileRoot + "shhh.groups";
1341 pDataArray->m->openOutputFile(groupFileName, groupFile);
1343 for(int i=0;i<numSeqs;i++){
1344 if (pDataArray->m->control_pressed) { break; }
1345 groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
1348 pDataArray->outputNames.push_back(groupFileName);
1349 /*****************************************************************************************************
1351 pDataArray->m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
1354 if (pDataArray->m->control_pressed) { for (int i = 0; i < pDataArray->outputNames.size(); i++) { pDataArray->m->mothurRemove(pDataArray->outputNames[i]); } return 0; }
1359 catch(exception& e) {
1360 pDataArray->m->errorOut(e, "ShhherCommand", "ShhhFlowsThreadFunction");