8 * Created by Pat Schloss on 12/27/10.
9 * Copyright 2010 Schloss Lab. All rights reserved.
14 #include "command.hpp"
15 #include "readcolumn.h"
16 #include "readmatrix.hpp"
17 #include "rabundvector.hpp"
18 #include "sabundvector.hpp"
19 #include "listvector.hpp"
20 #include "cluster.hpp"
21 #include "sparsematrix.hpp"
24 //**********************************************************************************************************************
29 #define MIN_WEIGHT 0.1
30 #define MIN_TAU 0.0001
32 //**********************************************************************************************************************
34 class ShhherCommand : public Command {
37 ShhherCommand(string);
41 vector<string> setParameters();
42 string getCommandName() { return "shhh.flows"; }
43 string getCommandCategory() { return "Sequence Processing"; }
44 string getHelpString();
45 string getCitation() { return "http://www.mothur.org/wiki/Shhh.flows"; }
46 string getDescription() { return "shhh.flows"; }
50 void help() { m->mothurOut(getHelpString()); }
56 linePair(int i, int j) : start(i), end(j) {}
60 string outputDir, flowFileName, flowFilesFileName, lookupFileName, compositeFASTAFileName, compositeNamesFileName;
62 int processors, maxIters;
63 float cutoff, sigma, minDelta;
66 vector<string> outputNames;
67 vector<double> singleLookUp;
68 vector<double> jointLookUp;
69 vector<string> flowFileVector;
71 int driver(vector<string>, string, string, int, int);
72 int createProcesses(vector<string>);
73 int getFlowData(string, vector<string>&, vector<int>&, vector<short>&, map<string, int>&, int&);
74 int getUniques(int, int, vector<short>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<double>&, vector<short>&);
75 int flowDistParentFork(int, string, int, vector<int>&, vector<int>&, vector<int>&, vector<double>&, vector<short>&);
76 float calcPairwiseDist(int, int, int, vector<int>&, vector<int>&, vector<double>&, vector<short>&);
77 int createNamesFile(int, int, string, vector<string>&, vector<int>&, vector<int>&);
78 int cluster(string, string, string);
79 int getOTUData(int numSeqs, string, vector<int>&, vector<int>&, vector<int>&, vector<vector<int> >&, vector<vector<int> >&, vector<int>&, vector<int>&,map<string, int>&);
80 int calcCentroidsDriver(int numOTUs, vector<int>&, vector<int>&, vector<int>&, vector<short>&, vector<int>&, vector<double>&, vector<int>&, vector<short>&, vector<short>&, vector<int>&, int, vector<int>&);
81 double getDistToCentroid(int, int, int, vector<short>&, vector<short>&, int);
82 double getNewWeights(int, vector<int>&, vector<int>&, vector<double>&, vector<int>&, vector<double>&);
84 double getLikelihood(int, int, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<double>&, vector<double>&);
85 int checkCentroids(int, vector<int>&, vector<double>&);
86 void calcNewDistances(int, int, vector<int>& , vector<double>&,vector<double>& , vector<short>& change, vector<int>&,vector<vector<int> >&, vector<double>&, vector<vector<int> >&, vector<int>&, vector<int>&, vector<short>&, vector<short>&, int, vector<int>&);
87 int fill(int, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<vector<int> >&, vector<vector<int> >&);
88 void setOTUs(int, int, vector<int>&, vector<int>&, vector<int>&, vector<int>&,
89 vector<int>&, vector<double>&, vector<double>&, vector<vector<int> >&, vector<vector<int> >&);
90 void writeQualities(int, int, string, vector<int>, vector<int>&, vector<int>&, vector<double>&, vector<short>&, vector<short>&, vector<int>&, vector<int>&, vector<string>&, vector<int>&, vector<vector<int> >&);
91 void writeSequences(string, int, int, string, vector<int>, vector<short>&, vector<string>&, vector<vector<int> >&, vector<int>&);
92 void writeNames(string, int, string, vector<int>, vector<string>&, vector<vector<int> >&, vector<int>&);
93 void writeGroups(string, int, vector<string>&);
94 void writeClusters(string, int, int, vector<int>, vector<int>&, vector<short>&, vector<string>&, vector<vector<int> >&, vector<int>&, vector<int>&, vector<short>&);
96 void getSingleLookUp();
97 void getJointLookUp();
98 double getProbIntensity(int);
102 string flowDistMPI(int, int);
103 void calcNewDistancesChildMPI(int, int, vector<int>&);
110 float calcPairwiseDist(int, int);
111 void flowDistParentFork(string, int, int);
113 string createDistFile(int);
114 string createNamesFile();
115 string cluster(string, string);
117 void getOTUData(string);
118 void initPyroCluster();
120 void calcCentroids();
121 void calcCentroidsDriver(int, int);
122 double getDistToCentroid(int, int, int);
123 double getNewWeights();
124 double getLikelihood();
125 void checkCentroids();
126 void calcNewDistances();
127 void calcNewDistancesParent(int, int);
128 void calcNewDistancesChild(int, int, vector<int>&, vector<int>&, vector<double>&);
132 void writeQualities(vector<int>);
133 void writeSequences(vector<int>);
134 void writeNames(vector<int>);
136 void writeClusters(vector<int>);
138 vector<string> seqNameVector;
140 vector<short> flowDataIntI;
141 vector<double> flowDataPrI;
142 map<string, int> nameMap;
144 vector<int> cumNumSeqs;
145 vector<int> nSeqsPerOTU;
146 vector<vector<int> > aaP; //tMaster->aanP: each row is a different otu / each col contains the sequence indices
147 vector<vector<int> > aaI; //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI
148 vector<int> seqNumber; //tMaster->anP: the sequence id number sorted by OTU
149 vector<int> seqIndex; //tMaster->anI; the index that corresponds to seqNumber
150 vector<double> dist; //adDist - distance of sequences to centroids
151 vector<short> change; //did the centroid sequence change? 0 = no; 1 = yes
152 vector<int> centroids; //the representative flowgram for each cluster m
153 vector<double> weight;
154 vector<double> singleTau; //tMaster->adTau: 1-D Tau vector (1xnumSeqs)
155 vector<short> uniqueFlowgrams;
156 vector<int> uniqueCount;
157 vector<int> mapSeqToUnique;
158 vector<int> mapUniqueToSeq;
159 vector<int> uniqueLengths;
160 int numSeqs, numUniques, numOTUs, numFlowCells;
161 vector<int> nSeqsBreaks;
162 vector<int> nOTUsBreaks;
168 /**************************************************************************************************/
169 //custom data structure for threads to use.
170 // This is passed by void pointer so it can be any data type
171 // that can be passed using a single void pointer (LPVOID).
172 struct shhhFlowsData {
173 int threadID, maxIters;
174 float cutoff, sigma, minDelta;
176 vector<double> singleLookUp;
177 vector<double> jointLookUp;
178 vector<string> filenames;
179 vector<string> outputNames;
180 string thisCompositeFASTAFileName, thisCompositeNameFileName, outputDir;
185 shhhFlowsData(vector<string> f, string cf, string cn, string ou, string flor, vector<double> jl, vector<double> sl, MothurOut* mout, int st, int sp, float cut, float si, float mD, int mx, int tid) {
187 thisCompositeFASTAFileName = cf;
188 thisCompositeNameFileName = cn;
204 /**************************************************************************************************/
205 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
207 static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
208 shhhFlowsData* pDataArray;
209 pDataArray = (shhhFlowsData*)lpParam;
213 for(int l=pDataArray->start;l<pDataArray->stop;l++){
215 if (pDataArray->m->control_pressed) { break; }
217 string flowFileName = pDataArray->filenames[l];
219 pDataArray->m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(l+1) + " of " + toString(pDataArray->filenames.size()) + ")\t<<<<<\n");
220 pDataArray->m->mothurOut("Reading flowgrams...\n");
222 vector<string> seqNameVector;
224 vector<short> flowDataIntI;
225 vector<double> flowDataPrI;
226 map<string, int> nameMap;
227 vector<short> uniqueFlowgrams;
228 vector<int> uniqueCount;
229 vector<int> mapSeqToUnique;
230 vector<int> mapUniqueToSeq;
231 vector<int> uniqueLengths;
234 //int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
235 /*****************************************************************************************************/
238 // cout << "herethread " << flowFileName << '\t' << &flowFile << endl;
239 pDataArray->m->openInputFile(flowFileName, flowFile);
241 // cout << "herethread " << flowFileName << endl;
243 int currentNumFlowCells;
246 flowFile >> numFlowCells;
247 int index = 0;//pcluster
248 while(!flowFile.eof()){
250 if (pDataArray->m->control_pressed) { flowFile.close(); return 0; }
252 flowFile >> seqName >> currentNumFlowCells;
253 lengths.push_back(currentNumFlowCells);
254 // cout << "herethread " << seqName << endl;
255 seqNameVector.push_back(seqName);
256 nameMap[seqName] = index++;//pcluster
258 for(int i=0;i<numFlowCells;i++){
259 flowFile >> intensity;
260 if(intensity > 9.99) { intensity = 9.99; }
261 int intI = int(100 * intensity + 0.0001);
262 flowDataIntI.push_back(intI);
264 pDataArray->m->gobble(flowFile);
268 int numSeqs = seqNameVector.size();
269 // cout << numSeqs << endl;
270 for(int i=0;i<numSeqs;i++){
272 if (pDataArray->m->control_pressed) { return 0; }
274 int iNumFlowCells = i * numFlowCells;
275 for(int j=lengths[i];j<numFlowCells;j++){
276 flowDataIntI[iNumFlowCells + j] = 0;
279 // cout << "here" << endl;
280 /*****************************************************************************************************/
282 if (pDataArray->m->control_pressed) { return 0; }
284 pDataArray->m->mothurOut("Identifying unique flowgrams...\n");
285 //int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
286 /*****************************************************************************************************/
288 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
289 uniqueCount.assign(numSeqs, 0); // anWeights
290 uniqueLengths.assign(numSeqs, 0);
291 mapSeqToUnique.assign(numSeqs, -1);
292 mapUniqueToSeq.assign(numSeqs, -1);
294 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
296 for(int i=0;i<numSeqs;i++){
298 if (pDataArray->m->control_pressed) { return 0; }
302 vector<short> current(numFlowCells);
303 for(int j=0;j<numFlowCells;j++){
304 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
307 for(int j=0;j<numUniques;j++){
308 int offset = j * numFlowCells;
312 if(lengths[i] < uniqueLengths[j]) { shorterLength = lengths[i]; }
313 else { shorterLength = uniqueLengths[j]; }
315 for(int k=0;k<shorterLength;k++){
316 if(current[k] != uniqueFlowgrams[offset + k]){
323 mapSeqToUnique[i] = j;
326 if(lengths[i] > uniqueLengths[j]) { uniqueLengths[j] = lengths[i]; }
332 if(index == numUniques){
333 uniqueLengths[numUniques] = lengths[i];
334 uniqueCount[numUniques] = 1;
335 mapSeqToUnique[i] = numUniques;//anMap
336 mapUniqueToSeq[numUniques] = i;//anF
338 for(int k=0;k<numFlowCells;k++){
339 uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
340 uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
346 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
347 uniqueLengths.resize(numUniques);
349 flowDataPrI.resize(numSeqs * numFlowCells, 0);
350 for(int i=0;i<flowDataPrI.size();i++) {
351 if (pDataArray->m->control_pressed) { return 0; }
352 //flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);
354 flowDataPrI[i] = 100000000;
356 for(int j=0;j<HOMOPS;j++){//loop signal strength
358 if (pDataArray->m->control_pressed) { return 0; }
360 float negLogProb = pDataArray->singleLookUp[j * NUMBINS + flowDataIntI[i]];
361 if(negLogProb < flowDataPrI[i]) { flowDataPrI[i] = negLogProb; }
365 /*****************************************************************************************************/
367 if (pDataArray->m->control_pressed) { return 0; }
369 pDataArray->m->mothurOut("Calculating distances between flowgrams...\n");
370 string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
371 unsigned long long begTime = time(NULL);
372 double begClock = clock();
374 //flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
375 /*****************************************************************************************************/
376 ostringstream outStream;
377 outStream.setf(ios::fixed, ios::floatfield);
378 outStream.setf(ios::dec, ios::basefield);
379 outStream.setf(ios::showpoint);
380 outStream.precision(6);
382 int thisbegTime = time(NULL);
383 double thisbegClock = clock();
385 for(int i=0;i<numUniques;i++){
387 if (pDataArray->m->control_pressed) { break; }
389 for(int j=0;j<i;j++){
390 //float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
391 /*****************************************************************************************************/
392 int seqA = mapUniqueToSeq[i]; int seqB = mapUniqueToSeq[j];
393 int minLength = lengths[mapSeqToUnique[seqA]];
394 if(lengths[seqB] < minLength){ minLength = lengths[mapSeqToUnique[seqB]]; }
396 int ANumFlowCells = seqA * numFlowCells;
397 int BNumFlowCells = seqB * numFlowCells;
399 float flowDistance = 0;
401 for(int k=0;k<minLength;k++){
403 if (pDataArray->m->control_pressed) { break; }
405 int flowAIntI = flowDataIntI[ANumFlowCells + k];
406 float flowAPrI = flowDataPrI[ANumFlowCells + k];
408 int flowBIntI = flowDataIntI[BNumFlowCells + k];
409 float flowBPrI = flowDataPrI[BNumFlowCells + k];
410 flowDistance += pDataArray->jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
413 flowDistance /= (float) minLength;
414 /*****************************************************************************************************/
416 if(flowDistance < 1e-6){
417 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
419 else if(flowDistance <= pDataArray->cutoff){
420 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
424 pDataArray->m->mothurOut(toString(i) + "\t" + toString(time(NULL) - thisbegTime));
425 pDataArray->m->mothurOut("\t" + toString((clock()-thisbegClock)/CLOCKS_PER_SEC));
426 pDataArray->m->mothurOutEndLine();
430 ofstream distFile(distFileName.c_str());
431 distFile << outStream.str();
434 if (pDataArray->m->control_pressed) {}
436 pDataArray->m->mothurOut(toString(numUniques-1) + "\t" + toString(time(NULL) - thisbegTime));
437 pDataArray->m->mothurOut("\t" + toString((clock()-thisbegClock)/CLOCKS_PER_SEC));
438 pDataArray->m->mothurOutEndLine();
440 /*****************************************************************************************************/
442 pDataArray->m->mothurOutEndLine();
443 pDataArray->m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
445 string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
446 //createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
447 /*****************************************************************************************************/
448 vector<string> duplicateNames(numUniques, "");
449 for(int i=0;i<numSeqs;i++){
450 duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
454 pDataArray->m->openOutputFile(namesFileName, nameFile);
456 for(int i=0;i<numUniques;i++){
457 if (pDataArray->m->control_pressed) { nameFile.close(); return 0; }
458 nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
461 /*****************************************************************************************************/
463 if (pDataArray->m->control_pressed) { return 0; }
465 pDataArray->m->mothurOut("\nClustering flowgrams...\n");
466 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
467 //cluster(listFileName, distFileName, namesFileName);
468 /*****************************************************************************************************/
469 ReadMatrix* read = new ReadColumnMatrix(distFileName);
470 read->setCutoff(pDataArray->cutoff);
472 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
473 clusterNameMap->readMap();
474 read->read(clusterNameMap);
476 ListVector* list = read->getListVector();
477 SparseMatrix* matrix = read->getMatrix();
480 delete clusterNameMap;
482 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
484 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, pDataArray->cutoff, "furthest");
485 string tag = cluster->getTag();
487 double clusterCutoff = pDataArray->cutoff;
488 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
490 if (pDataArray->m->control_pressed) { break; }
492 cluster->update(clusterCutoff);
495 list->setLabel(toString(pDataArray->cutoff));
497 ofstream listFileOut;
498 pDataArray->m->openOutputFile(listFileName, listFileOut);
499 list->print(listFileOut);
502 delete matrix; delete cluster; delete rabund; delete list;
503 /*****************************************************************************************************/
505 if (pDataArray->m->control_pressed) { return 0; }
508 vector<int> cumNumSeqs;
509 vector<int> nSeqsPerOTU;
510 vector<vector<int> > aaP; //tMaster->aanP: each row is a different otu / each col contains the sequence indices
511 vector<vector<int> > aaI; //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI
512 vector<int> seqNumber; //tMaster->anP: the sequence id number sorted by OTU
513 vector<int> seqIndex; //tMaster->anI; the index that corresponds to seqNumber
516 //int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
517 /*****************************************************************************************************/
519 pDataArray->m->openInputFile(listFileName, listFile);
523 listFile >> label >> numOTUs;
525 otuData.assign(numSeqs, 0);
526 cumNumSeqs.assign(numOTUs, 0);
527 nSeqsPerOTU.assign(numOTUs, 0);
528 aaP.clear();aaP.resize(numOTUs);
534 string singleOTU = "";
536 for(int i=0;i<numOTUs;i++){
538 if (pDataArray->m->control_pressed) { break; }
540 listFile >> singleOTU;
542 istringstream otuString(singleOTU);
548 for(int j=0;j<singleOTU.length();j++){
549 char letter = otuString.get();
555 map<string,int>::iterator nmIt = nameMap.find(seqName);
556 int index = nmIt->second;
562 aaP[i].push_back(index);
567 map<string,int>::iterator nmIt = nameMap.find(seqName);
569 int index = nmIt->second;
574 aaP[i].push_back(index);
579 sort(aaP[i].begin(), aaP[i].end());
580 for(int j=0;j<nSeqsPerOTU[i];j++){
581 seqNumber.push_back(aaP[i][j]);
583 for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
590 for(int i=1;i<numOTUs;i++){
591 cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
594 seqIndex = seqNumber;
597 /*****************************************************************************************************/
599 if (pDataArray->m->control_pressed) { return 0; }
601 pDataArray->m->mothurRemove(distFileName);
602 pDataArray->m->mothurRemove(namesFileName);
603 pDataArray->m->mothurRemove(listFileName);
605 vector<double> dist; //adDist - distance of sequences to centroids
606 vector<short> change; //did the centroid sequence change? 0 = no; 1 = yes
607 vector<int> centroids; //the representative flowgram for each cluster m
608 vector<double> weight;
609 vector<double> singleTau; //tMaster->adTau: 1-D Tau vector (1xnumSeqs)
610 vector<int> nSeqsBreaks;
611 vector<int> nOTUsBreaks;
613 dist.assign(numSeqs * numOTUs, 0);
614 change.assign(numOTUs, 1);
615 centroids.assign(numOTUs, -1);
616 weight.assign(numOTUs, 0);
617 singleTau.assign(numSeqs, 1.0);
619 nSeqsBreaks.assign(2, 0);
620 nOTUsBreaks.assign(2, 0);
623 nSeqsBreaks[1] = numSeqs;
624 nOTUsBreaks[1] = numOTUs;
626 if (pDataArray->m->control_pressed) { break; }
632 begTime = time(NULL);
634 pDataArray->m->mothurOut("\nDenoising flowgrams...\n");
635 pDataArray->m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
637 while((pDataArray->maxIters == 0 && maxDelta > pDataArray->minDelta) || iter < MIN_ITER || (maxDelta > pDataArray->minDelta && iter < pDataArray->maxIters)){
639 if (pDataArray->m->control_pressed) { break; }
641 double cycClock = clock();
642 unsigned long long cycTime = time(NULL);
643 //fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
644 /*****************************************************************************************************/
646 for(int i=0;i<numOTUs;i++){
648 if (pDataArray->m->control_pressed) { return 0; }
650 cumNumSeqs[i] = indexFill;
651 for(int j=0;j<nSeqsPerOTU[i];j++){
652 seqNumber[indexFill] = aaP[i][j];
653 seqIndex[indexFill] = aaI[i][j];
658 /*****************************************************************************************************/
661 if (pDataArray->m->control_pressed) { break; }
663 //calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
664 /*****************************************************************************************************/
665 for(int i=0;i<numOTUs;i++){
667 if (pDataArray->m->control_pressed) { break; }
671 int minFlowGram = 100000000;
672 double minFlowValue = 1e8;
673 change[i] = 0; //FALSE
675 for(int j=0;j<nSeqsPerOTU[i];j++){
676 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
679 if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
680 vector<double> adF(nSeqsPerOTU[i]);
681 vector<int> anL(nSeqsPerOTU[i]);
683 for(int j=0;j<nSeqsPerOTU[i];j++){
684 int index = cumNumSeqs[i] + j;
685 int nI = seqIndex[index];
686 int nIU = mapSeqToUnique[nI];
689 for(k=0;k<position;k++){
696 adF[position] = 0.0000;
701 for(int j=0;j<nSeqsPerOTU[i];j++){
702 int index = cumNumSeqs[i] + j;
703 int nI = seqIndex[index];
705 double tauValue = singleTau[seqNumber[index]];
707 for(int k=0;k<position;k++){
708 // double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
709 /*****************************************************************************************************/
710 int flowAValue = anL[k] * numFlowCells;
711 int flowBValue = nI * numFlowCells;
715 for(int l=0;l<lengths[nI];l++){
716 dist += pDataArray->singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
721 dist = dist / (double)lengths[nI];
722 /*****************************************************************************************************/
723 adF[k] += dist * tauValue;
727 for(int j=0;j<position;j++){
728 if(adF[j] < minFlowValue){
730 minFlowValue = adF[j];
734 if(centroids[i] != anL[minFlowGram]){
736 centroids[i] = anL[minFlowGram];
739 else if(centroids[i] != -1){
744 /*****************************************************************************************************/
746 if (pDataArray->m->control_pressed) { break; }
748 //maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);
749 /*****************************************************************************************************/
750 double maxChange = 0;
752 for(int i=0;i<numOTUs;i++){
754 if (pDataArray->m->control_pressed) { break; }
756 double difference = weight[i];
759 for(int j=0;j<nSeqsPerOTU[i];j++){
760 int index = cumNumSeqs[i] + j;
761 double tauValue = singleTau[seqNumber[index]];
762 weight[i] += tauValue;
765 difference = fabs(weight[i] - difference);
766 if(difference > maxChange){ maxChange = difference; }
768 maxDelta = maxChange;
769 /*****************************************************************************************************/
771 if (pDataArray->m->control_pressed) { break; }
773 //double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight);
774 /*****************************************************************************************************/
775 vector<long double> P(numSeqs, 0);
778 for(int i=0;i<numOTUs;i++){
779 if(weight[i] > MIN_WEIGHT){
785 for(int i=0;i<numOTUs;i++){
787 if (pDataArray->m->control_pressed) { break; }
789 for(int j=0;j<nSeqsPerOTU[i];j++){
790 int index = cumNumSeqs[i] + j;
791 int nI = seqIndex[index];
792 double singleDist = dist[seqNumber[index]];
794 P[nI] += weight[i] * exp(-singleDist * pDataArray->sigma);
798 for(int i=0;i<numSeqs;i++){
799 if(P[i] == 0){ P[i] = DBL_EPSILON; }
804 nLL = nLL -(double)numSeqs * log(pDataArray->sigma);
805 /*****************************************************************************************************/
807 if (pDataArray->m->control_pressed) { break; }
809 //checkCentroids(numOTUs, centroids, weight);
810 /*****************************************************************************************************/
811 vector<int> unique(numOTUs, 1);
813 for(int i=0;i<numOTUs;i++){
814 if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
819 for(int i=0;i<numOTUs;i++){
821 if (pDataArray->m->control_pressed) { break; }
824 for(int j=i+1;j<numOTUs;j++){
827 if(centroids[j] == centroids[i]){
831 weight[i] += weight[j];
838 /*****************************************************************************************************/
840 if (pDataArray->m->control_pressed) { break; }
842 //calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU, dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
843 /*****************************************************************************************************/
845 vector<double> newTau(numOTUs,0);
846 vector<double> norms(numSeqs, 0);
847 nSeqsPerOTU.assign(numOTUs, 0);
849 for(int i=0;i<numSeqs;i++){
851 if (pDataArray->m->control_pressed) { break; }
853 int indexOffset = i * numOTUs;
857 for(int j=0;j<numOTUs;j++){
859 if(weight[j] > MIN_WEIGHT && change[j] == 1){
860 //dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
861 /*****************************************************************************************************/
862 int flowAValue = centroids[j] * numFlowCells;
863 int flowBValue = i * numFlowCells;
867 for(int l=0;l<lengths[i];l++){
868 distTemp += pDataArray->singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
873 dist[indexOffset + j] = distTemp / (double)lengths[i];
874 /*****************************************************************************************************/
878 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
879 offset = dist[indexOffset + j];
883 for(int j=0;j<numOTUs;j++){
884 if(weight[j] > MIN_WEIGHT){
885 newTau[j] = exp(pDataArray->sigma * (-dist[indexOffset + j] + offset)) * weight[j];
886 norms[i] += newTau[j];
893 for(int j=0;j<numOTUs;j++){
894 newTau[j] /= norms[i];
897 for(int j=0;j<numOTUs;j++){
898 if(newTau[j] > MIN_TAU){
900 int oldTotal = total;
904 singleTau.resize(total, 0);
905 seqNumber.resize(total, 0);
906 seqIndex.resize(total, 0);
908 singleTau[oldTotal] = newTau[j];
910 aaP[j][nSeqsPerOTU[j]] = oldTotal;
911 aaI[j][nSeqsPerOTU[j]] = i;
918 /*****************************************************************************************************/
920 if (pDataArray->m->control_pressed) { break; }
924 pDataArray->m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
928 if (pDataArray->m->control_pressed) { break; }
930 pDataArray->m->mothurOut("\nFinalizing...\n");
931 //fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
932 /*****************************************************************************************************/
934 for(int i=0;i<numOTUs;i++){
936 if (pDataArray->m->control_pressed) { return 0; }
938 cumNumSeqs[i] = indexFill;
939 for(int j=0;j<nSeqsPerOTU[i];j++){
940 seqNumber[indexFill] = aaP[i][j];
941 seqIndex[indexFill] = aaI[i][j];
946 /*****************************************************************************************************/
948 if (pDataArray->m->control_pressed) { break; }
950 //setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
951 /*****************************************************************************************************/
952 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
954 for(int i=0;i<numOTUs;i++){
956 if (pDataArray->m->control_pressed) { break; }
958 for(int j=0;j<nSeqsPerOTU[i];j++){
959 int index = cumNumSeqs[i] + j;
960 double tauValue = singleTau[seqNumber[index]];
961 int sIndex = seqIndex[index];
962 bigTauMatrix[sIndex * numOTUs + i] = tauValue;
966 for(int i=0;i<numSeqs;i++){
967 double maxTau = -1.0000;
969 for(int j=0;j<numOTUs;j++){
970 if(bigTauMatrix[i * numOTUs + j] > maxTau){
971 maxTau = bigTauMatrix[i * numOTUs + j];
979 nSeqsPerOTU.assign(numOTUs, 0);
981 for(int i=0;i<numSeqs;i++){
982 int index = otuData[i];
984 singleTau[i] = 1.0000;
987 aaP[index][nSeqsPerOTU[index]] = i;
988 aaI[index][nSeqsPerOTU[index]] = i;
990 nSeqsPerOTU[index]++;
993 //fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
994 /*****************************************************************************************************/
996 for(int i=0;i<numOTUs;i++){
998 if (pDataArray->m->control_pressed) { return 0; }
1000 cumNumSeqs[i] = indexFill;
1001 for(int j=0;j<nSeqsPerOTU[i];j++){
1002 seqNumber[indexFill] = aaP[i][j];
1003 seqIndex[indexFill] = aaI[i][j];
1008 /*****************************************************************************************************/
1010 /*****************************************************************************************************/
1012 if (pDataArray->m->control_pressed) { break; }
1014 vector<int> otuCounts(numOTUs, 0);
1015 for(int i=0;i<numSeqs;i++) { otuCounts[otuData[i]]++; }
1017 //calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
1018 /*****************************************************************************************************/
1019 for(int i=0;i<numOTUs;i++){
1021 if (pDataArray->m->control_pressed) { break; }
1025 int minFlowGram = 100000000;
1026 double minFlowValue = 1e8;
1027 change[i] = 0; //FALSE
1029 for(int j=0;j<nSeqsPerOTU[i];j++){
1030 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1033 if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1034 vector<double> adF(nSeqsPerOTU[i]);
1035 vector<int> anL(nSeqsPerOTU[i]);
1037 for(int j=0;j<nSeqsPerOTU[i];j++){
1038 int index = cumNumSeqs[i] + j;
1039 int nI = seqIndex[index];
1040 int nIU = mapSeqToUnique[nI];
1043 for(k=0;k<position;k++){
1049 anL[position] = nIU;
1050 adF[position] = 0.0000;
1055 for(int j=0;j<nSeqsPerOTU[i];j++){
1056 int index = cumNumSeqs[i] + j;
1057 int nI = seqIndex[index];
1059 double tauValue = singleTau[seqNumber[index]];
1061 for(int k=0;k<position;k++){
1062 // double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
1063 /*****************************************************************************************************/
1064 int flowAValue = anL[k] * numFlowCells;
1065 int flowBValue = nI * numFlowCells;
1069 for(int l=0;l<lengths[nI];l++){
1070 dist += pDataArray->singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1075 dist = dist / (double)lengths[nI];
1076 /*****************************************************************************************************/
1077 adF[k] += dist * tauValue;
1081 for(int j=0;j<position;j++){
1082 if(adF[j] < minFlowValue){
1084 minFlowValue = adF[j];
1088 if(centroids[i] != anL[minFlowGram]){
1090 centroids[i] = anL[minFlowGram];
1093 else if(centroids[i] != -1){
1099 /*****************************************************************************************************/
1101 if (pDataArray->m->control_pressed) { break; }
1103 //writeQualities(numOTUs, numFlowCells, flowFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI);
1104 if (pDataArray->m->control_pressed) { break; }
1105 /*****************************************************************************************************/
1106 string thisOutputDir = pDataArray->outputDir;
1107 if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); }
1108 string qualityFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.qual";
1110 ofstream qualityFile;
1111 pDataArray->m->openOutputFile(qualityFileName, qualityFile);
1113 qualityFile.setf(ios::fixed, ios::floatfield);
1114 qualityFile.setf(ios::showpoint);
1115 qualityFile << setprecision(6);
1117 vector<vector<int> > qualities(numOTUs);
1118 vector<double> pr(HOMOPS, 0);
1121 for(int i=0;i<numOTUs;i++){
1123 if (pDataArray->m->control_pressed) { break; }
1128 if(nSeqsPerOTU[i] > 0){
1129 qualities[i].assign(1024, -1);
1131 while(index < numFlowCells){
1132 double maxPrValue = 1e8;
1133 short maxPrIndex = -1;
1134 double count = 0.0000;
1136 pr.assign(HOMOPS, 0);
1138 for(int j=0;j<nSeqsPerOTU[i];j++){
1139 int lIndex = cumNumSeqs[i] + j;
1140 double tauValue = singleTau[seqNumber[lIndex]];
1141 int sequenceIndex = aaI[i][j];
1142 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1146 for(int s=0;s<HOMOPS;s++){
1147 pr[s] += tauValue * pDataArray->singleLookUp[s * NUMBINS + intensity];
1151 maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1152 maxPrValue = pr[maxPrIndex];
1154 if(count > MIN_COUNT){
1156 double norm = 0.0000;
1158 for(int s=0;s<HOMOPS;s++){
1159 norm += exp(-(pr[s] - maxPrValue));
1162 for(int s=1;s<=maxPrIndex;s++){
1164 double temp = 0.0000;
1166 U += exp(-(pr[s-1]-maxPrValue))/norm;
1174 temp = floor(-10 * temp);
1175 value = (int)floor(temp);
1176 if(value > 100){ value = 100; }
1178 qualities[i][base] = (int)value;
1188 if(otuCounts[i] > 0){
1189 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1191 int j=4; //need to get past the first four bases
1192 while(qualities[i][j] != -1){
1193 qualityFile << qualities[i][j] << ' ';
1196 qualityFile << endl;
1199 qualityFile.close();
1200 pDataArray->outputNames.push_back(qualityFileName);
1201 /*****************************************************************************************************/
1203 // writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, flowFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);
1204 if (pDataArray->m->control_pressed) { break; }
1205 /*****************************************************************************************************/
1206 thisOutputDir = pDataArray->outputDir;
1207 if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); }
1208 string fastaFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.fasta";
1210 pDataArray->m->openOutputFile(fastaFileName, fastaFile);
1212 vector<string> names(numOTUs, "");
1214 for(int i=0;i<numOTUs;i++){
1216 if (pDataArray->m->control_pressed) { break; }
1218 int index = centroids[i];
1220 if(otuCounts[i] > 0){
1221 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1225 for(int j=0;j<numFlowCells;j++){
1227 char base = pDataArray->flowOrder[j % 4];
1228 for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1233 fastaFile << newSeq.substr(4) << endl;
1238 pDataArray->outputNames.push_back(fastaFileName);
1240 if(pDataArray->thisCompositeFASTAFileName != ""){
1241 pDataArray->m->appendFiles(fastaFileName, pDataArray->thisCompositeFASTAFileName);
1244 /*****************************************************************************************************/
1246 //writeNames(thisCompositeNamesFileName, numOTUs, flowFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);
1247 if (pDataArray->m->control_pressed) { break; }
1248 /*****************************************************************************************************/
1249 thisOutputDir = pDataArray->outputDir;
1250 if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); }
1251 string nameFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.names";
1252 ofstream nameFileOut;
1253 pDataArray->m->openOutputFile(nameFileName, nameFileOut);
1255 for(int i=0;i<numOTUs;i++){
1257 if (pDataArray->m->control_pressed) { break; }
1259 if(otuCounts[i] > 0){
1260 nameFileOut << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1262 for(int j=1;j<nSeqsPerOTU[i];j++){
1263 nameFileOut << ',' << seqNameVector[aaI[i][j]];
1266 nameFileOut << endl;
1269 nameFileOut.close();
1270 pDataArray->outputNames.push_back(nameFileName);
1273 if(pDataArray->thisCompositeNameFileName != ""){
1274 pDataArray->m->appendFiles(nameFileName, pDataArray->thisCompositeNameFileName);
1276 /*****************************************************************************************************/
1278 //writeClusters(flowFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);
1279 if (pDataArray->m->control_pressed) { break; }
1280 /*****************************************************************************************************/
1281 thisOutputDir = pDataArray->outputDir;
1282 if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); }
1283 string otuCountsFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.counts";
1284 ofstream otuCountsFile;
1285 pDataArray->m->openOutputFile(otuCountsFileName, otuCountsFile);
1287 string bases = pDataArray->flowOrder;
1289 for(int i=0;i<numOTUs;i++){
1291 if (pDataArray->m->control_pressed) {
1294 //output the translated version of the centroid sequence for the otu
1295 if(otuCounts[i] > 0){
1296 int index = centroids[i];
1298 otuCountsFile << "ideal\t";
1299 for(int j=8;j<numFlowCells;j++){
1300 char base = bases[j % 4];
1301 for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1302 otuCountsFile << base;
1305 otuCountsFile << endl;
1307 for(int j=0;j<nSeqsPerOTU[i];j++){
1308 int sequence = aaI[i][j];
1309 otuCountsFile << seqNameVector[sequence] << '\t';
1313 for(int k=0;k<lengths[sequence];k++){
1314 char base = bases[k % 4];
1315 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1317 for(int s=0;s<freq;s++){
1319 //otuCountsFile << base;
1322 otuCountsFile << newSeq.substr(4) << endl;
1324 otuCountsFile << endl;
1327 otuCountsFile.close();
1328 pDataArray->outputNames.push_back(otuCountsFileName);
1329 /*****************************************************************************************************/
1331 //writeGroups(flowFileName, numSeqs, seqNameVector);
1332 if (pDataArray->m->control_pressed) { break; }
1333 /*****************************************************************************************************/
1334 thisOutputDir = pDataArray->outputDir;
1335 if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); }
1336 string fileRoot = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName));
1337 string groupFileName = fileRoot + "shhh.groups";
1339 pDataArray->m->openOutputFile(groupFileName, groupFile);
1341 for(int i=0;i<numSeqs;i++){
1342 if (pDataArray->m->control_pressed) { break; }
1343 groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
1346 pDataArray->outputNames.push_back(groupFileName);
1347 /*****************************************************************************************************/
1349 pDataArray->m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
1352 if (pDataArray->m->control_pressed) { for (int i = 0; i < pDataArray->outputNames.size(); i++) { pDataArray->m->mothurRemove(pDataArray->outputNames[i]); } return 0; }
1357 catch(exception& e) {
1358 pDataArray->m->errorOut(e, "ShhherCommand", "ShhhFlowsThreadFunction");