8 * Created by Pat Schloss on 12/27/10.
9 * Copyright 2010 Schloss Lab. All rights reserved.
14 #include "command.hpp"
15 #include "readcolumn.h"
16 #include "readmatrix.hpp"
17 #include "rabundvector.hpp"
18 #include "sabundvector.hpp"
19 #include "listvector.hpp"
20 #include "cluster.hpp"
23 //**********************************************************************************************************************
28 #define MIN_WEIGHT 0.1
29 #define MIN_TAU 0.0001
31 //**********************************************************************************************************************
33 class ShhherCommand : public Command {
36 ShhherCommand(string);
40 vector<string> setParameters();
41 string getCommandName() { return "shhh.flows"; }
42 string getCommandCategory() { return "Sequence Processing"; }
43 string getOutputFileNameTag(string, string);
44 string getHelpString();
45 string getCitation() { return "Schloss PD, Gevers D, Westcott SL (2011). Reducing the effects of PCR amplification and sequencing artifacts on 16S rRNA-based studies. PLoS ONE. 6:e27310.\nQuince C, Lanzen A, Davenport RJ, Turnbaugh PJ (2011). Removing noise from pyrosequenced amplicons. BMC Bioinformatics 12:38.\nQuince C, Lanzén A, Curtis TP, Davenport RJ, Hall N, Head IM, Read LF, Sloan WT (2009). Accurate determination of microbial diversity from 454 pyrosequencing data. Nat. Methods 6:639.\nhttp://www.mothur.org/wiki/Shhh.flows"; }
46 string getDescription() { return "shhh.flows"; }
50 void help() { m->mothurOut(getHelpString()); }
56 linePair(int i, int j) : start(i), end(j) {}
60 string outputDir, flowFileName, flowFilesFileName, lookupFileName, compositeFASTAFileName, compositeNamesFileName;
62 int processors, maxIters, largeSize;
63 float cutoff, sigma, minDelta;
66 vector<string> outputNames;
67 vector<double> singleLookUp;
68 vector<double> jointLookUp;
69 vector<string> flowFileVector;
71 vector<string> parseFlowFiles(string);
72 int driver(vector<string>, string, string, int, int);
73 int createProcesses(vector<string>);
74 int getFlowData(string, vector<string>&, vector<int>&, vector<short>&, map<string, int>&, int&);
75 int getUniques(int, int, vector<short>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<double>&, vector<short>&);
76 int flowDistParentFork(int, string, int, vector<int>&, vector<int>&, vector<int>&, vector<double>&, vector<short>&);
77 float calcPairwiseDist(int, int, int, vector<int>&, vector<int>&, vector<double>&, vector<short>&);
78 int createNamesFile(int, int, string, vector<string>&, vector<int>&, vector<int>&);
79 int cluster(string, string, string);
80 int getOTUData(int numSeqs, string, vector<int>&, vector<int>&, vector<int>&, vector<vector<int> >&, vector<vector<int> >&, vector<int>&, vector<int>&,map<string, int>&);
81 int calcCentroidsDriver(int numOTUs, vector<int>&, vector<int>&, vector<int>&, vector<short>&, vector<int>&, vector<double>&, vector<int>&, vector<short>&, vector<short>&, vector<int>&, int, vector<int>&);
82 double getDistToCentroid(int, int, int, vector<short>&, vector<short>&, int);
83 double getNewWeights(int, vector<int>&, vector<int>&, vector<double>&, vector<int>&, vector<double>&);
85 double getLikelihood(int, int, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<double>&, vector<double>&);
86 int checkCentroids(int, vector<int>&, vector<double>&);
87 void calcNewDistances(int, int, vector<int>& , vector<double>&,vector<double>& , vector<short>& change, vector<int>&,vector<vector<int> >&, vector<double>&, vector<vector<int> >&, vector<int>&, vector<int>&, vector<short>&, vector<short>&, int, vector<int>&);
88 int fill(int, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<vector<int> >&, vector<vector<int> >&);
89 void setOTUs(int, int, vector<int>&, vector<int>&, vector<int>&, vector<int>&,
90 vector<int>&, vector<double>&, vector<double>&, vector<vector<int> >&, vector<vector<int> >&);
91 void writeQualities(int, int, string, vector<int>, vector<int>&, vector<int>&, vector<double>&, vector<short>&, vector<short>&, vector<int>&, vector<int>&, vector<string>&, vector<int>&, vector<vector<int> >&);
92 void writeSequences(string, int, int, string, vector<int>, vector<short>&, vector<string>&, vector<vector<int> >&, vector<int>&);
93 void writeNames(string, int, string, vector<int>, vector<string>&, vector<vector<int> >&, vector<int>&);
94 void writeGroups(string, string, int, vector<string>&);
95 void writeClusters(string, int, int, vector<int>, vector<int>&, vector<short>&, vector<string>&, vector<vector<int> >&, vector<int>&, vector<int>&, vector<short>&);
97 void getSingleLookUp();
98 void getJointLookUp();
99 double getProbIntensity(int);
103 string flowDistMPI(int, int);
104 void calcNewDistancesChildMPI(int, int, vector<int>&);
111 float calcPairwiseDist(int, int);
112 void flowDistParentFork(string, int, int);
114 string createDistFile(int);
115 string createNamesFile();
116 string cluster(string, string);
118 void getOTUData(string);
119 void initPyroCluster();
121 void calcCentroids();
122 void calcCentroidsDriver(int, int);
123 double getDistToCentroid(int, int, int);
124 double getNewWeights();
125 double getLikelihood();
126 void checkCentroids();
127 void calcNewDistances();
128 void calcNewDistancesParent(int, int);
129 void calcNewDistancesChild(int, int, vector<int>&, vector<int>&, vector<double>&);
133 void writeQualities(vector<int>);
134 void writeSequences(vector<int>);
135 void writeNames(vector<int>);
137 void writeClusters(vector<int>);
139 vector<string> seqNameVector;
141 vector<short> flowDataIntI;
142 vector<double> flowDataPrI;
143 map<string, int> nameMap;
145 vector<int> cumNumSeqs;
146 vector<int> nSeqsPerOTU;
147 vector<vector<int> > aaP; //tMaster->aanP: each row is a different otu / each col contains the sequence indices
148 vector<vector<int> > aaI; //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI
149 vector<int> seqNumber; //tMaster->anP: the sequence id number sorted by OTU
150 vector<int> seqIndex; //tMaster->anI; the index that corresponds to seqNumber
151 vector<double> dist; //adDist - distance of sequences to centroids
152 vector<short> change; //did the centroid sequence change? 0 = no; 1 = yes
153 vector<int> centroids; //the representative flowgram for each cluster m
154 vector<double> weight;
155 vector<double> singleTau; //tMaster->adTau: 1-D Tau vector (1xnumSeqs)
156 vector<short> uniqueFlowgrams;
157 vector<int> uniqueCount;
158 vector<int> mapSeqToUnique;
159 vector<int> mapUniqueToSeq;
160 vector<int> uniqueLengths;
161 int numSeqs, numUniques, numOTUs, numFlowCells;
162 vector<int> nSeqsBreaks;
163 vector<int> nOTUsBreaks;
169 /**************************************************************************************************
170 //custom data structure for threads to use.
171 // This is passed by void pointer so it can be any data type
172 // that can be passed using a single void pointer (LPVOID).
173 struct shhhFlowsData {
174 int threadID, maxIters;
175 float cutoff, sigma, minDelta;
177 vector<double> singleLookUp;
178 vector<double> jointLookUp;
179 vector<string> filenames;
180 vector<string> outputNames;
181 string thisCompositeFASTAFileName, thisCompositeNameFileName, outputDir;
186 shhhFlowsData(vector<string> f, string cf, string cn, string ou, string flor, vector<double> jl, vector<double> sl, MothurOut* mout, int st, int sp, float cut, float si, float mD, int mx, int tid) {
188 thisCompositeFASTAFileName = cf;
189 thisCompositeNameFileName = cn;
205 /**************************************************************************************************
206 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
208 static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
209 shhhFlowsData* pDataArray;
210 pDataArray = (shhhFlowsData*)lpParam;
214 for(int l=pDataArray->start;l<pDataArray->stop;l++){
216 if (pDataArray->m->control_pressed) { break; }
218 string flowFileName = pDataArray->filenames[l];
220 pDataArray->m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(l+1) + " of " + toString(pDataArray->filenames.size()) + ")\t<<<<<\n");
221 pDataArray->m->mothurOut("Reading flowgrams...\n");
223 vector<string> seqNameVector;
225 vector<short> flowDataIntI;
226 vector<double> flowDataPrI;
227 map<string, int> nameMap;
228 vector<short> uniqueFlowgrams;
229 vector<int> uniqueCount;
230 vector<int> mapSeqToUnique;
231 vector<int> mapUniqueToSeq;
232 vector<int> uniqueLengths;
235 //int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
236 /*****************************************************************************************************
239 // cout << "herethread " << flowFileName << '\t' << &flowFile << endl;
240 pDataArray->m->openInputFile(flowFileName, flowFile);
242 // cout << "herethread " << flowFileName << endl;
244 int currentNumFlowCells;
247 flowFile >> numFlowCells;
248 int index = 0;//pcluster
249 while(!flowFile.eof()){
251 if (pDataArray->m->control_pressed) { flowFile.close(); return 0; }
253 flowFile >> seqName >> currentNumFlowCells;
254 lengths.push_back(currentNumFlowCells);
255 // cout << "herethread " << seqName << endl;
256 seqNameVector.push_back(seqName);
257 nameMap[seqName] = index++;//pcluster
259 for(int i=0;i<numFlowCells;i++){
260 flowFile >> intensity;
261 if(intensity > 9.99) { intensity = 9.99; }
262 int intI = int(100 * intensity + 0.0001);
263 flowDataIntI.push_back(intI);
265 pDataArray->m->gobble(flowFile);
269 int numSeqs = seqNameVector.size();
270 // cout << numSeqs << endl;
271 for(int i=0;i<numSeqs;i++){
273 if (pDataArray->m->control_pressed) { return 0; }
275 int iNumFlowCells = i * numFlowCells;
276 for(int j=lengths[i];j<numFlowCells;j++){
277 flowDataIntI[iNumFlowCells + j] = 0;
280 // cout << "here" << endl;
281 /*****************************************************************************************************
283 if (pDataArray->m->control_pressed) { return 0; }
285 pDataArray->m->mothurOut("Identifying unique flowgrams...\n");
286 //int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
287 /*****************************************************************************************************
289 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
290 uniqueCount.assign(numSeqs, 0); // anWeights
291 uniqueLengths.assign(numSeqs, 0);
292 mapSeqToUnique.assign(numSeqs, -1);
293 mapUniqueToSeq.assign(numSeqs, -1);
295 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
297 for(int i=0;i<numSeqs;i++){
299 if (pDataArray->m->control_pressed) { return 0; }
303 vector<short> current(numFlowCells);
304 for(int j=0;j<numFlowCells;j++){
305 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
308 for(int j=0;j<numUniques;j++){
309 int offset = j * numFlowCells;
313 if(lengths[i] < uniqueLengths[j]) { shorterLength = lengths[i]; }
314 else { shorterLength = uniqueLengths[j]; }
316 for(int k=0;k<shorterLength;k++){
317 if(current[k] != uniqueFlowgrams[offset + k]){
324 mapSeqToUnique[i] = j;
327 if(lengths[i] > uniqueLengths[j]) { uniqueLengths[j] = lengths[i]; }
333 if(index == numUniques){
334 uniqueLengths[numUniques] = lengths[i];
335 uniqueCount[numUniques] = 1;
336 mapSeqToUnique[i] = numUniques;//anMap
337 mapUniqueToSeq[numUniques] = i;//anF
339 for(int k=0;k<numFlowCells;k++){
340 uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
341 uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
347 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
348 uniqueLengths.resize(numUniques);
350 flowDataPrI.resize(numSeqs * numFlowCells, 0);
351 for(int i=0;i<flowDataPrI.size();i++) {
352 if (pDataArray->m->control_pressed) { return 0; }
353 //flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);
355 flowDataPrI[i] = 100000000;
357 for(int j=0;j<HOMOPS;j++){//loop signal strength
359 if (pDataArray->m->control_pressed) { return 0; }
361 float negLogProb = pDataArray->singleLookUp[j * NUMBINS + flowDataIntI[i]];
362 if(negLogProb < flowDataPrI[i]) { flowDataPrI[i] = negLogProb; }
366 /*****************************************************************************************************
368 if (pDataArray->m->control_pressed) { return 0; }
370 pDataArray->m->mothurOut("Calculating distances between flowgrams...\n");
371 string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
372 unsigned long long begTime = time(NULL);
373 double begClock = clock();
375 //flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
376 /*****************************************************************************************************
377 ostringstream outStream;
378 outStream.setf(ios::fixed, ios::floatfield);
379 outStream.setf(ios::dec, ios::basefield);
380 outStream.setf(ios::showpoint);
381 outStream.precision(6);
383 int thisbegTime = time(NULL);
384 double thisbegClock = clock();
386 for(int i=0;i<numUniques;i++){
388 if (pDataArray->m->control_pressed) { break; }
390 for(int j=0;j<i;j++){
391 //float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
392 /*****************************************************************************************************
393 int seqA = mapUniqueToSeq[i]; int seqB = mapUniqueToSeq[j];
394 int minLength = lengths[mapSeqToUnique[seqA]];
395 if(lengths[seqB] < minLength){ minLength = lengths[mapSeqToUnique[seqB]]; }
397 int ANumFlowCells = seqA * numFlowCells;
398 int BNumFlowCells = seqB * numFlowCells;
400 float flowDistance = 0;
402 for(int k=0;k<minLength;k++){
404 if (pDataArray->m->control_pressed) { break; }
406 int flowAIntI = flowDataIntI[ANumFlowCells + k];
407 float flowAPrI = flowDataPrI[ANumFlowCells + k];
409 int flowBIntI = flowDataIntI[BNumFlowCells + k];
410 float flowBPrI = flowDataPrI[BNumFlowCells + k];
411 flowDistance += pDataArray->jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
414 flowDistance /= (float) minLength;
415 /*****************************************************************************************************
417 if(flowDistance < 1e-6){
418 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
420 else if(flowDistance <= pDataArray->cutoff){
421 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
425 pDataArray->m->mothurOut(toString(i) + "\t" + toString(time(NULL) - thisbegTime));
426 pDataArray->m->mothurOut("\t" + toString((clock()-thisbegClock)/CLOCKS_PER_SEC));
427 pDataArray->m->mothurOutEndLine();
431 ofstream distFile(distFileName.c_str());
432 distFile << outStream.str();
435 if (pDataArray->m->control_pressed) {}
437 pDataArray->m->mothurOut(toString(numUniques-1) + "\t" + toString(time(NULL) - thisbegTime));
438 pDataArray->m->mothurOut("\t" + toString((clock()-thisbegClock)/CLOCKS_PER_SEC));
439 pDataArray->m->mothurOutEndLine();
441 /*****************************************************************************************************
443 pDataArray->m->mothurOutEndLine();
444 pDataArray->m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
446 string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
447 //createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
448 /*****************************************************************************************************
449 vector<string> duplicateNames(numUniques, "");
450 for(int i=0;i<numSeqs;i++){
451 duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
455 pDataArray->m->openOutputFile(namesFileName, nameFile);
457 for(int i=0;i<numUniques;i++){
458 if (pDataArray->m->control_pressed) { nameFile.close(); return 0; }
459 nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
462 /*****************************************************************************************************
464 if (pDataArray->m->control_pressed) { return 0; }
466 pDataArray->m->mothurOut("\nClustering flowgrams...\n");
467 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
468 //cluster(listFileName, distFileName, namesFileName);
469 /*****************************************************************************************************
470 ReadMatrix* read = new ReadColumnMatrix(distFileName);
471 read->setCutoff(pDataArray->cutoff);
473 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
474 clusterNameMap->readMap();
475 read->read(clusterNameMap);
477 ListVector* list = read->getListVector();
478 SparseMatrix* matrix = read->getMatrix();
481 delete clusterNameMap;
483 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
485 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, pDataArray->cutoff, "furthest");
486 string tag = cluster->getTag();
488 double clusterCutoff = pDataArray->cutoff;
489 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
491 if (pDataArray->m->control_pressed) { break; }
493 cluster->update(clusterCutoff);
496 list->setLabel(toString(pDataArray->cutoff));
498 ofstream listFileOut;
499 pDataArray->m->openOutputFile(listFileName, listFileOut);
500 list->print(listFileOut);
503 delete matrix; delete cluster; delete rabund; delete list;
504 /*****************************************************************************************************
506 if (pDataArray->m->control_pressed) { return 0; }
509 vector<int> cumNumSeqs;
510 vector<int> nSeqsPerOTU;
511 vector<vector<int> > aaP; //tMaster->aanP: each row is a different otu / each col contains the sequence indices
512 vector<vector<int> > aaI; //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI
513 vector<int> seqNumber; //tMaster->anP: the sequence id number sorted by OTU
514 vector<int> seqIndex; //tMaster->anI; the index that corresponds to seqNumber
517 //int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
518 /*****************************************************************************************************
520 pDataArray->m->openInputFile(listFileName, listFile);
524 listFile >> label >> numOTUs;
526 otuData.assign(numSeqs, 0);
527 cumNumSeqs.assign(numOTUs, 0);
528 nSeqsPerOTU.assign(numOTUs, 0);
529 aaP.clear();aaP.resize(numOTUs);
535 string singleOTU = "";
537 for(int i=0;i<numOTUs;i++){
539 if (pDataArray->m->control_pressed) { break; }
541 listFile >> singleOTU;
543 istringstream otuString(singleOTU);
549 for(int j=0;j<singleOTU.length();j++){
550 char letter = otuString.get();
556 map<string,int>::iterator nmIt = nameMap.find(seqName);
557 int index = nmIt->second;
563 aaP[i].push_back(index);
568 map<string,int>::iterator nmIt = nameMap.find(seqName);
570 int index = nmIt->second;
575 aaP[i].push_back(index);
580 sort(aaP[i].begin(), aaP[i].end());
581 for(int j=0;j<nSeqsPerOTU[i];j++){
582 seqNumber.push_back(aaP[i][j]);
584 for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
591 for(int i=1;i<numOTUs;i++){
592 cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
595 seqIndex = seqNumber;
598 /*****************************************************************************************************
600 if (pDataArray->m->control_pressed) { return 0; }
602 pDataArray->m->mothurRemove(distFileName);
603 pDataArray->m->mothurRemove(namesFileName);
604 pDataArray->m->mothurRemove(listFileName);
606 vector<double> dist; //adDist - distance of sequences to centroids
607 vector<short> change; //did the centroid sequence change? 0 = no; 1 = yes
608 vector<int> centroids; //the representative flowgram for each cluster m
609 vector<double> weight;
610 vector<double> singleTau; //tMaster->adTau: 1-D Tau vector (1xnumSeqs)
611 vector<int> nSeqsBreaks;
612 vector<int> nOTUsBreaks;
614 dist.assign(numSeqs * numOTUs, 0);
615 change.assign(numOTUs, 1);
616 centroids.assign(numOTUs, -1);
617 weight.assign(numOTUs, 0);
618 singleTau.assign(numSeqs, 1.0);
620 nSeqsBreaks.assign(2, 0);
621 nOTUsBreaks.assign(2, 0);
624 nSeqsBreaks[1] = numSeqs;
625 nOTUsBreaks[1] = numOTUs;
627 if (pDataArray->m->control_pressed) { break; }
633 begTime = time(NULL);
635 pDataArray->m->mothurOut("\nDenoising flowgrams...\n");
636 pDataArray->m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
638 while((pDataArray->maxIters == 0 && maxDelta > pDataArray->minDelta) || iter < MIN_ITER || (maxDelta > pDataArray->minDelta && iter < pDataArray->maxIters)){
640 if (pDataArray->m->control_pressed) { break; }
642 double cycClock = clock();
643 unsigned long long cycTime = time(NULL);
644 //fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
645 /*****************************************************************************************************
647 for(int i=0;i<numOTUs;i++){
649 if (pDataArray->m->control_pressed) { return 0; }
651 cumNumSeqs[i] = indexFill;
652 for(int j=0;j<nSeqsPerOTU[i];j++){
653 seqNumber[indexFill] = aaP[i][j];
654 seqIndex[indexFill] = aaI[i][j];
659 /*****************************************************************************************************
662 if (pDataArray->m->control_pressed) { break; }
664 //calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
665 /*****************************************************************************************************
666 for(int i=0;i<numOTUs;i++){
668 if (pDataArray->m->control_pressed) { break; }
672 int minFlowGram = 100000000;
673 double minFlowValue = 1e8;
674 change[i] = 0; //FALSE
676 for(int j=0;j<nSeqsPerOTU[i];j++){
677 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
680 if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
681 vector<double> adF(nSeqsPerOTU[i]);
682 vector<int> anL(nSeqsPerOTU[i]);
684 for(int j=0;j<nSeqsPerOTU[i];j++){
685 int index = cumNumSeqs[i] + j;
686 int nI = seqIndex[index];
687 int nIU = mapSeqToUnique[nI];
690 for(k=0;k<position;k++){
697 adF[position] = 0.0000;
702 for(int j=0;j<nSeqsPerOTU[i];j++){
703 int index = cumNumSeqs[i] + j;
704 int nI = seqIndex[index];
706 double tauValue = singleTau[seqNumber[index]];
708 for(int k=0;k<position;k++){
709 // double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
710 /*****************************************************************************************************
711 int flowAValue = anL[k] * numFlowCells;
712 int flowBValue = nI * numFlowCells;
716 for(int l=0;l<lengths[nI];l++){
717 dist += pDataArray->singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
722 dist = dist / (double)lengths[nI];
723 /*****************************************************************************************************
724 adF[k] += dist * tauValue;
728 for(int j=0;j<position;j++){
729 if(adF[j] < minFlowValue){
731 minFlowValue = adF[j];
735 if(centroids[i] != anL[minFlowGram]){
737 centroids[i] = anL[minFlowGram];
740 else if(centroids[i] != -1){
745 /*****************************************************************************************************
747 if (pDataArray->m->control_pressed) { break; }
749 //maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);
750 /*****************************************************************************************************
751 double maxChange = 0;
753 for(int i=0;i<numOTUs;i++){
755 if (pDataArray->m->control_pressed) { break; }
757 double difference = weight[i];
760 for(int j=0;j<nSeqsPerOTU[i];j++){
761 int index = cumNumSeqs[i] + j;
762 double tauValue = singleTau[seqNumber[index]];
763 weight[i] += tauValue;
766 difference = fabs(weight[i] - difference);
767 if(difference > maxChange){ maxChange = difference; }
769 maxDelta = maxChange;
770 /*****************************************************************************************************
772 if (pDataArray->m->control_pressed) { break; }
774 //double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight);
775 /*****************************************************************************************************
776 vector<long double> P(numSeqs, 0);
779 for(int i=0;i<numOTUs;i++){
780 if(weight[i] > MIN_WEIGHT){
786 for(int i=0;i<numOTUs;i++){
788 if (pDataArray->m->control_pressed) { break; }
790 for(int j=0;j<nSeqsPerOTU[i];j++){
791 int index = cumNumSeqs[i] + j;
792 int nI = seqIndex[index];
793 double singleDist = dist[seqNumber[index]];
795 P[nI] += weight[i] * exp(-singleDist * pDataArray->sigma);
799 for(int i=0;i<numSeqs;i++){
800 if(P[i] == 0){ P[i] = DBL_EPSILON; }
805 nLL = nLL -(double)numSeqs * log(pDataArray->sigma);
806 /*****************************************************************************************************
808 if (pDataArray->m->control_pressed) { break; }
810 //checkCentroids(numOTUs, centroids, weight);
811 /*****************************************************************************************************
812 vector<int> unique(numOTUs, 1);
814 for(int i=0;i<numOTUs;i++){
815 if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
820 for(int i=0;i<numOTUs;i++){
822 if (pDataArray->m->control_pressed) { break; }
825 for(int j=i+1;j<numOTUs;j++){
828 if(centroids[j] == centroids[i]){
832 weight[i] += weight[j];
839 /*****************************************************************************************************
841 if (pDataArray->m->control_pressed) { break; }
843 //calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU, dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
844 /*****************************************************************************************************
846 vector<double> newTau(numOTUs,0);
847 vector<double> norms(numSeqs, 0);
848 nSeqsPerOTU.assign(numOTUs, 0);
850 for(int i=0;i<numSeqs;i++){
852 if (pDataArray->m->control_pressed) { break; }
854 int indexOffset = i * numOTUs;
858 for(int j=0;j<numOTUs;j++){
860 if(weight[j] > MIN_WEIGHT && change[j] == 1){
861 //dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
862 /*****************************************************************************************************
863 int flowAValue = centroids[j] * numFlowCells;
864 int flowBValue = i * numFlowCells;
868 for(int l=0;l<lengths[i];l++){
869 distTemp += pDataArray->singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
874 dist[indexOffset + j] = distTemp / (double)lengths[i];
875 /*****************************************************************************************************
879 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
880 offset = dist[indexOffset + j];
884 for(int j=0;j<numOTUs;j++){
885 if(weight[j] > MIN_WEIGHT){
886 newTau[j] = exp(pDataArray->sigma * (-dist[indexOffset + j] + offset)) * weight[j];
887 norms[i] += newTau[j];
894 for(int j=0;j<numOTUs;j++){
895 newTau[j] /= norms[i];
898 for(int j=0;j<numOTUs;j++){
899 if(newTau[j] > MIN_TAU){
901 int oldTotal = total;
905 singleTau.resize(total, 0);
906 seqNumber.resize(total, 0);
907 seqIndex.resize(total, 0);
909 singleTau[oldTotal] = newTau[j];
911 aaP[j][nSeqsPerOTU[j]] = oldTotal;
912 aaI[j][nSeqsPerOTU[j]] = i;
919 /*****************************************************************************************************
921 if (pDataArray->m->control_pressed) { break; }
925 pDataArray->m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
929 if (pDataArray->m->control_pressed) { break; }
931 pDataArray->m->mothurOut("\nFinalizing...\n");
932 //fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
933 /*****************************************************************************************************
935 for(int i=0;i<numOTUs;i++){
937 if (pDataArray->m->control_pressed) { return 0; }
939 cumNumSeqs[i] = indexFill;
940 for(int j=0;j<nSeqsPerOTU[i];j++){
941 seqNumber[indexFill] = aaP[i][j];
942 seqIndex[indexFill] = aaI[i][j];
947 /*****************************************************************************************************
949 if (pDataArray->m->control_pressed) { break; }
951 //setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
952 /*****************************************************************************************************
953 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
955 for(int i=0;i<numOTUs;i++){
957 if (pDataArray->m->control_pressed) { break; }
959 for(int j=0;j<nSeqsPerOTU[i];j++){
960 int index = cumNumSeqs[i] + j;
961 double tauValue = singleTau[seqNumber[index]];
962 int sIndex = seqIndex[index];
963 bigTauMatrix[sIndex * numOTUs + i] = tauValue;
967 for(int i=0;i<numSeqs;i++){
968 double maxTau = -1.0000;
970 for(int j=0;j<numOTUs;j++){
971 if(bigTauMatrix[i * numOTUs + j] > maxTau){
972 maxTau = bigTauMatrix[i * numOTUs + j];
980 nSeqsPerOTU.assign(numOTUs, 0);
982 for(int i=0;i<numSeqs;i++){
983 int index = otuData[i];
985 singleTau[i] = 1.0000;
988 aaP[index][nSeqsPerOTU[index]] = i;
989 aaI[index][nSeqsPerOTU[index]] = i;
991 nSeqsPerOTU[index]++;
994 //fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
995 /*****************************************************************************************************
997 for(int i=0;i<numOTUs;i++){
999 if (pDataArray->m->control_pressed) { return 0; }
1001 cumNumSeqs[i] = indexFill;
1002 for(int j=0;j<nSeqsPerOTU[i];j++){
1003 seqNumber[indexFill] = aaP[i][j];
1004 seqIndex[indexFill] = aaI[i][j];
1009 /*****************************************************************************************************/
1011 /*****************************************************************************************************
1013 if (pDataArray->m->control_pressed) { break; }
1015 vector<int> otuCounts(numOTUs, 0);
1016 for(int i=0;i<numSeqs;i++) { otuCounts[otuData[i]]++; }
1018 //calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
1019 /*****************************************************************************************************
1020 for(int i=0;i<numOTUs;i++){
1022 if (pDataArray->m->control_pressed) { break; }
1026 int minFlowGram = 100000000;
1027 double minFlowValue = 1e8;
1028 change[i] = 0; //FALSE
1030 for(int j=0;j<nSeqsPerOTU[i];j++){
1031 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1034 if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1035 vector<double> adF(nSeqsPerOTU[i]);
1036 vector<int> anL(nSeqsPerOTU[i]);
1038 for(int j=0;j<nSeqsPerOTU[i];j++){
1039 int index = cumNumSeqs[i] + j;
1040 int nI = seqIndex[index];
1041 int nIU = mapSeqToUnique[nI];
1044 for(k=0;k<position;k++){
1050 anL[position] = nIU;
1051 adF[position] = 0.0000;
1056 for(int j=0;j<nSeqsPerOTU[i];j++){
1057 int index = cumNumSeqs[i] + j;
1058 int nI = seqIndex[index];
1060 double tauValue = singleTau[seqNumber[index]];
1062 for(int k=0;k<position;k++){
1063 // double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
1064 /*****************************************************************************************************
1065 int flowAValue = anL[k] * numFlowCells;
1066 int flowBValue = nI * numFlowCells;
1070 for(int l=0;l<lengths[nI];l++){
1071 dist += pDataArray->singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1076 dist = dist / (double)lengths[nI];
1077 /*****************************************************************************************************
1078 adF[k] += dist * tauValue;
1082 for(int j=0;j<position;j++){
1083 if(adF[j] < minFlowValue){
1085 minFlowValue = adF[j];
1089 if(centroids[i] != anL[minFlowGram]){
1091 centroids[i] = anL[minFlowGram];
1094 else if(centroids[i] != -1){
1100 /*****************************************************************************************************
1102 if (pDataArray->m->control_pressed) { break; }
1104 //writeQualities(numOTUs, numFlowCells, flowFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI);
1105 if (pDataArray->m->control_pressed) { break; }
1106 /*****************************************************************************************************
1107 string thisOutputDir = pDataArray->outputDir;
1108 if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); }
1109 string qualityFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.qual";
1111 ofstream qualityFile;
1112 pDataArray->m->openOutputFile(qualityFileName, qualityFile);
1114 qualityFile.setf(ios::fixed, ios::floatfield);
1115 qualityFile.setf(ios::showpoint);
1116 qualityFile << setprecision(6);
1118 vector<vector<int> > qualities(numOTUs);
1119 vector<double> pr(HOMOPS, 0);
1122 for(int i=0;i<numOTUs;i++){
1124 if (pDataArray->m->control_pressed) { break; }
1129 if(nSeqsPerOTU[i] > 0){
1130 qualities[i].assign(1024, -1);
1132 while(index < numFlowCells){
1133 double maxPrValue = 1e8;
1134 short maxPrIndex = -1;
1135 double count = 0.0000;
1137 pr.assign(HOMOPS, 0);
1139 for(int j=0;j<nSeqsPerOTU[i];j++){
1140 int lIndex = cumNumSeqs[i] + j;
1141 double tauValue = singleTau[seqNumber[lIndex]];
1142 int sequenceIndex = aaI[i][j];
1143 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1147 for(int s=0;s<HOMOPS;s++){
1148 pr[s] += tauValue * pDataArray->singleLookUp[s * NUMBINS + intensity];
1152 maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1153 maxPrValue = pr[maxPrIndex];
1155 if(count > MIN_COUNT){
1157 double norm = 0.0000;
1159 for(int s=0;s<HOMOPS;s++){
1160 norm += exp(-(pr[s] - maxPrValue));
1163 for(int s=1;s<=maxPrIndex;s++){
1165 double temp = 0.0000;
1167 U += exp(-(pr[s-1]-maxPrValue))/norm;
1175 temp = floor(-10 * temp);
1176 value = (int)floor(temp);
1177 if(value > 100){ value = 100; }
1179 qualities[i][base] = (int)value;
1189 if(otuCounts[i] > 0){
1190 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1192 int j=4; //need to get past the first four bases
1193 while(qualities[i][j] != -1){
1194 qualityFile << qualities[i][j] << ' ';
1197 qualityFile << endl;
1200 qualityFile.close();
1201 pDataArray->outputNames.push_back(qualityFileName);
1202 /*****************************************************************************************************
1204 // writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, flowFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);
1205 if (pDataArray->m->control_pressed) { break; }
1206 /*****************************************************************************************************
1207 thisOutputDir = pDataArray->outputDir;
1208 if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); }
1209 string fastaFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.fasta";
1211 pDataArray->m->openOutputFile(fastaFileName, fastaFile);
1213 vector<string> names(numOTUs, "");
1215 for(int i=0;i<numOTUs;i++){
1217 if (pDataArray->m->control_pressed) { break; }
1219 int index = centroids[i];
1221 if(otuCounts[i] > 0){
1222 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1226 for(int j=0;j<numFlowCells;j++){
1228 char base = pDataArray->flowOrder[j % 4];
1229 for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1234 fastaFile << newSeq.substr(4) << endl;
1239 pDataArray->outputNames.push_back(fastaFileName);
1241 if(pDataArray->thisCompositeFASTAFileName != ""){
1242 pDataArray->m->appendFiles(fastaFileName, pDataArray->thisCompositeFASTAFileName);
1245 /*****************************************************************************************************
1247 //writeNames(thisCompositeNamesFileName, numOTUs, flowFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);
1248 if (pDataArray->m->control_pressed) { break; }
1249 /*****************************************************************************************************
1250 thisOutputDir = pDataArray->outputDir;
1251 if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); }
1252 string nameFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.names";
1253 ofstream nameFileOut;
1254 pDataArray->m->openOutputFile(nameFileName, nameFileOut);
1256 for(int i=0;i<numOTUs;i++){
1258 if (pDataArray->m->control_pressed) { break; }
1260 if(otuCounts[i] > 0){
1261 nameFileOut << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1263 for(int j=1;j<nSeqsPerOTU[i];j++){
1264 nameFileOut << ',' << seqNameVector[aaI[i][j]];
1267 nameFileOut << endl;
1270 nameFileOut.close();
1271 pDataArray->outputNames.push_back(nameFileName);
1274 if(pDataArray->thisCompositeNameFileName != ""){
1275 pDataArray->m->appendFiles(nameFileName, pDataArray->thisCompositeNameFileName);
1277 /*****************************************************************************************************
1279 //writeClusters(flowFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);
1280 if (pDataArray->m->control_pressed) { break; }
1281 /*****************************************************************************************************
1282 thisOutputDir = pDataArray->outputDir;
1283 if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); }
1284 string otuCountsFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.counts";
1285 ofstream otuCountsFile;
1286 pDataArray->m->openOutputFile(otuCountsFileName, otuCountsFile);
1288 string bases = pDataArray->flowOrder;
1290 for(int i=0;i<numOTUs;i++){
1292 if (pDataArray->m->control_pressed) {
1295 //output the translated version of the centroid sequence for the otu
1296 if(otuCounts[i] > 0){
1297 int index = centroids[i];
1299 otuCountsFile << "ideal\t";
1300 for(int j=8;j<numFlowCells;j++){
1301 char base = bases[j % 4];
1302 for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1303 otuCountsFile << base;
1306 otuCountsFile << endl;
1308 for(int j=0;j<nSeqsPerOTU[i];j++){
1309 int sequence = aaI[i][j];
1310 otuCountsFile << seqNameVector[sequence] << '\t';
1314 for(int k=0;k<lengths[sequence];k++){
1315 char base = bases[k % 4];
1316 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1318 for(int s=0;s<freq;s++){
1320 //otuCountsFile << base;
1323 otuCountsFile << newSeq.substr(4) << endl;
1325 otuCountsFile << endl;
1328 otuCountsFile.close();
1329 pDataArray->outputNames.push_back(otuCountsFileName)
1330 /*****************************************************************************************************
1332 //writeGroups(flowFileName, numSeqs, seqNameVector);
1333 if (pDataArray->m->control_pressed) { break; }
1334 /*****************************************************************************************************
1335 thisOutputDir = pDataArray->outputDir;
1336 if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); }
1337 string fileRoot = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName));
1338 string groupFileName = fileRoot + "shhh.groups";
1340 pDataArray->m->openOutputFile(groupFileName, groupFile);
1342 for(int i=0;i<numSeqs;i++){
1343 if (pDataArray->m->control_pressed) { break; }
1344 groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
1347 pDataArray->outputNames.push_back(groupFileName);
1348 /*****************************************************************************************************
1350 pDataArray->m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
1353 if (pDataArray->m->control_pressed) { for (int i = 0; i < pDataArray->outputNames.size(); i++) { pDataArray->m->mothurRemove(pDataArray->outputNames[i]); } return 0; }
1358 catch(exception& e) {
1359 pDataArray->m->errorOut(e, "ShhherCommand", "ShhhFlowsThreadFunction");