]> git.donarmstrong.com Git - mothur.git/blob - shhhercommand.cpp
modifications to shhh.seqs
[mothur.git] / shhhercommand.cpp
1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "shhhercommand.h"
11
12 #include "readcolumn.h"
13 #include "readmatrix.hpp"
14 #include "rabundvector.hpp"
15 #include "sabundvector.hpp"
16 #include "listvector.hpp"
17 #include "cluster.hpp"
18 #include "sparsematrix.hpp"
19 #include <cfloat>
20
21 //**********************************************************************************************************************
22
23 #define NUMBINS 1000
24 #define HOMOPS 10
25 #define MIN_COUNT 0.1
26 #define MIN_WEIGHT 0.1
27 #define MIN_TAU 0.0001
28 #define MIN_ITER 10
29
30 //**********************************************************************************************************************
31
32 vector<string> ShhherCommand::getValidParameters(){     
33         try {
34                 string Array[] =  {     
35                         "file", "flow", "lookup", "cutoff", "sigma", "outputdir","inputdir", "processors"       
36                 };
37                 
38                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
39                 return myArray;
40         }
41         catch(exception& e) {
42                 m->errorOut(e, "ShhherCommand", "getValidParameters");
43                 exit(1);
44         }
45 }
46
47 //**********************************************************************************************************************
48
49 ShhherCommand::ShhherCommand(){ 
50         try {
51                 abort = true;
52                 
53                 //initialize outputTypes
54                 vector<string> tempOutNames;
55                 outputTypes["pn.dist"] = tempOutNames;
56
57         }
58         catch(exception& e) {
59                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
60                 exit(1);
61         }
62 }
63
64 //**********************************************************************************************************************
65
66 vector<string> ShhherCommand::getRequiredParameters(){  
67         try {
68                 string Array[] =  {"flow"};
69                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
70                 return myArray;
71         }
72         catch(exception& e) {
73                 m->errorOut(e, "ShhherCommand", "getRequiredParameters");
74                 exit(1);
75         }
76 }
77
78 //**********************************************************************************************************************
79
80 vector<string> ShhherCommand::getRequiredFiles(){       
81         try {
82                 vector<string> myArray;
83                 return myArray;
84         }
85         catch(exception& e) {
86                 m->errorOut(e, "ShhherCommand", "getRequiredFiles");
87                 exit(1);
88         }
89 }
90
91 //**********************************************************************************************************************
92
93 ShhherCommand::ShhherCommand(string option) {
94         try {
95
96 #ifdef USE_MPI
97                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
98                 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
99
100                 if(pid == 0){
101 #endif
102                 
103                 
104                 abort = false;
105                 
106                 
107                 //allow user to run help
108                 if(option == "help") { help(); abort = true; }
109                 
110                 else {
111                         
112                         //valid paramters for this command
113                         string AlignArray[] =  {
114                                 "file", "flow", "lookup", "cutoff", "sigma", "outputdir","inputdir", "processors"       
115                         };
116                         
117                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
118                         
119                         OptionParser parser(option);
120                         map<string,string> parameters = parser.getParameters();
121                         
122                         ValidParameters validParameter;
123                         map<string,string>::iterator it;
124                         
125                         //check to make sure all parameters are valid for command
126                         for (it = parameters.begin(); it != parameters.end(); it++) { 
127                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
128                         }
129                         
130                         //initialize outputTypes
131                         vector<string> tempOutNames;
132                         outputTypes["pn.dist"] = tempOutNames;
133                         //                      outputTypes["fasta"] = tempOutNames;
134                         
135                         //if the user changes the input directory command factory will send this info to us in the output parameter 
136                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
137                         if (inputDir == "not found"){   inputDir = "";          }
138                         else {
139                                 string path;
140                                 it = parameters.find("flow");
141                                 //user has given a template file
142                                 if(it != parameters.end()){ 
143                                         path = m->hasPath(it->second);
144                                         //if the user has not given a path then, add inputdir. else leave path alone.
145                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
146                                 }
147                                 
148                                 it = parameters.find("lookup");
149                                 //user has given a template file
150                                 if(it != parameters.end()){ 
151                                         path = m->hasPath(it->second);
152                                         //if the user has not given a path then, add inputdir. else leave path alone.
153                                         if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
154                                 }
155
156                                 it = parameters.find("file");
157                                 //user has given a template file
158                                 if(it != parameters.end()){ 
159                                         path = m->hasPath(it->second);
160                                         //if the user has not given a path then, add inputdir. else leave path alone.
161                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
162                                 }
163                         }
164                         
165                         
166                         //check for required parameters
167                         flowFileName = validParameter.validFile(parameters, "flow", true);
168                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
169                         if (flowFileName == "not found" && flowFilesFileName == "not found") {
170                                 m->mothurOut("values for either flow or file must be provided for the shhh.seqs command.");
171                                 m->mothurOutEndLine();
172                                 abort = true; 
173                         }
174                         else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }       
175                         
176                         //if the user changes the output directory command factory will send this info to us in the output parameter 
177                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
178                                 outputDir = ""; 
179                                 outputDir += m->hasPath(flowFileName); //if user entered a file with a path then preserve it    
180                         }
181                         
182                         
183                         //check for optional parameter and set defaults
184                         // ...at some point should added some additional type checking...
185                         string temp;
186                         temp = validParameter.validFile(parameters, "lookup", true);
187                         if (temp == "not found")        {       lookupFileName = "LookUp_Titanium.pat"; }
188                         else if(temp == "not open")     {       abort = true;                   } 
189                         else                                            {       lookupFileName = temp;  }
190                         
191                         temp = validParameter.validFile(parameters, "processors", false);if (temp == "not found"){      temp = "1";                     }
192                         convert(temp, processors); 
193
194                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
195                         convert(temp, cutoff); 
196                         
197                         temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
198                         convert(temp, minDelta); 
199
200                         temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
201                         convert(temp, maxIters); 
202
203                         temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
204                         convert(temp, sigma); 
205                         
206                         globaldata = GlobalData::getInstance();
207                 }
208                         
209 #ifdef USE_MPI
210                 }                               
211 #endif
212                                 
213         }
214         catch(exception& e) {
215                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
216                 exit(1);
217         }
218 }
219
220 //**********************************************************************************************************************
221
222 ShhherCommand::~ShhherCommand(){}
223
224 //**********************************************************************************************************************
225
226 void ShhherCommand::help(){
227         try {
228                 m->mothurOut("The shhher command reads a file containing flowgrams and creates a file of corrected sequences.\n");
229         }
230         catch(exception& e) {
231                 m->errorOut(e, "ShhherCommand", "help");
232                 exit(1);
233         }
234 }
235
236 //**********************************************************************************************************************
237 #ifdef USE_MPI
238 int ShhherCommand::execute(){
239         try {
240                 int tag = 1976;
241                 MPI_Status status; 
242
243                 double begClock = clock();
244                 unsigned long int begTime = time(NULL);
245
246                 cout.setf(ios::fixed, ios::floatfield);
247                 cout.setf(ios::showpoint);
248                 cout << setprecision(2);
249                 
250                 
251                 if(pid == 0){
252
253                         for(int i=1;i<ncpus;i++){
254                                 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
255                         }
256                         if(abort == 1){ return 0;       }
257
258                         processors = ncpus;
259                         
260                         cout << "\nGetting preliminary data..." << endl;
261                         getSingleLookUp();
262                         getJointLookUp();
263                         
264                         vector<string> flowFileVector;
265                         if(flowFilesFileName != "not found"){
266                                 string fName;
267
268                                 ifstream flowFilesFile;
269                                 m->openInputFile(flowFilesFileName, flowFilesFile);
270                                 while(flowFilesFile){
271                                         flowFilesFile >> fName;
272                                         flowFileVector.push_back(fName);
273                                         m->gobble(flowFilesFile);
274                                 }
275                         }
276                         else{
277                                 flowFileVector.push_back(flowFileName);
278                         }
279                         int numFiles = flowFileVector.size();
280
281                         for(int i=1;i<ncpus;i++){
282                                 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
283                         }
284                         
285                         for(int i=0;i<numFiles;i++){
286                                 flowFileName = flowFileVector[i];
287                         
288                                 cout << "\n>>>>>\tProcessing " << flowFileName << " (file " << i+1 << " of " << numFiles << ")\t<<<<<" << endl;
289                                 cout << "Reading flowgrams..." << endl;
290                                 getFlowData();
291                                 cout << "Identifying unique flowgrams..." << endl;
292                                 getUniques();
293
294                                 cout << "Calculating distances between flowgrams..." << endl;
295                                 char fileName[1024];
296                                 strcpy(fileName, flowFileName.c_str());
297
298                                 for(int i=1;i<ncpus;i++){
299                                         MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
300
301                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
302                                         MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
303                                         MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
304                                         MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
305                                         MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
306                                         MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
307                                         MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
308                                         MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
309                                         MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
310                                         MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
311                                 }                               
312                                                         
313                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
314                                 
315                                 int done;
316                                 for(int i=1;i<ncpus;i++){
317                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
318                                         
319                                         m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
320                                         remove((distFileName + ".temp." + toString(i)).c_str());
321                                 }
322
323                                 string namesFileName = createNamesFile();
324                                 
325                                 cout << "\nClustering flowgrams..." << endl;
326                                 string listFileName = cluster(distFileName, namesFileName);
327         //                      string listFileName = "PriestPot_C7.pn.list";
328         //                      string listFileName = "test.mock_rep3.v69.pn.list";
329                         
330                                 getOTUData(listFileName);
331                                 initPyroCluster();
332
333                                 for(int i=1;i<ncpus;i++){
334                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
335                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
336                                         MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
337                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
338                                 }
339                                 
340                                 
341                                 double maxDelta = 0;
342                                 int iter = 0;
343                                 
344                                 int numOTUsOnCPU = numOTUs / ncpus;
345                                 int numSeqsOnCPU = numSeqs / ncpus;
346                                 
347                                 cout << "\nDenoising flowgrams..." << endl;
348                                 cout << "iter\tmaxDelta\tnLL\t\tcycletime" << endl;
349                                 
350                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
351
352                                         double cycClock = clock();
353                                         unsigned long int cycTime = time(NULL);
354                                         fill();
355
356                                         int total = singleTau.size();
357                                         for(int i=1;i<ncpus;i++){
358                                                 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
359                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
360                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
361                                                 
362                                                 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
363                                                 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
364                                                 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
365                                                 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
366                                                 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
367                                         }
368                                 
369                                         calcCentroidsDriver(0, numOTUsOnCPU);
370                                         
371                                         for(int i=1;i<ncpus;i++){
372                                                 int otuStart = i * numOTUs / ncpus;
373                                                 int otuStop = (i + 1) * numOTUs / ncpus;
374                                                 
375                                                 vector<int> tempCentroids(numOTUs, 0);
376                                                 vector<short> tempChange(numOTUs, 0);
377                                                 
378                                                 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
379                                                 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
380                                                 
381                                                 for(int j=otuStart;j<otuStop;j++){
382                                                         centroids[j] = tempCentroids[j];
383                                                         change[j] = tempChange[j];
384                                                 }
385                                         }
386                                                                         
387                                         maxDelta = getNewWeights();
388                                         double nLL = getLikelihood();
389                                         checkCentroids();
390                                         
391                                         for(int i=1;i<ncpus;i++){
392                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
393                                                 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
394                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
395                                         }
396                                         
397                                         calcNewDistancesParent(0, numSeqsOnCPU);
398
399                                         total = singleTau.size();
400
401                                         for(int i=1;i<ncpus;i++){
402                                                 int childTotal;
403                                                 int seqStart = i * numSeqs / ncpus;
404                                                 int seqStop = (i + 1) * numSeqs / ncpus;
405                                                 
406                                                 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
407
408                                                 vector<int> childSeqIndex(childTotal, 0);
409                                                 vector<double> childSingleTau(childTotal, 0);
410                                                 vector<double> childDist(numSeqs * numOTUs, 0);
411                                                 vector<int> otuIndex(childTotal, 0);
412                                                 
413                                                 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
414                                                 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
415                                                 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
416                                                 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
417
418                                                 int oldTotal = total;
419                                                 total += childTotal;
420                                                 singleTau.resize(total, 0);
421                                                 seqIndex.resize(total, 0);
422                                                 seqNumber.resize(total, 0);
423                                                 
424                                                 int childIndex = 0;
425                                                 
426                                                 for(int j=oldTotal;j<total;j++){
427                                                         int otuI = otuIndex[childIndex];
428                                                         int seqI = childSeqIndex[childIndex];
429                                                         
430                                                         singleTau[j] = childSingleTau[childIndex];
431                                                         
432                                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
433                                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
434                                                         nSeqsPerOTU[otuI]++;
435                                                         childIndex++;
436                                                 }
437                                                 
438                                                 int index = seqStart * numOTUs;
439                                                 for(int j=seqStart;j<seqStop;j++){
440                                                         for(int k=0;k<numOTUs;k++){
441                                                                 dist[index] = childDist[index];
442                                                                 index++;
443                                                         }
444                                                 }                                       
445                                         }
446                                         
447                                         iter++;
448                                         
449                                         cout << iter << '\t' << maxDelta << '\t' << setprecision(2) << nLL << '\t' << time(NULL) - cycTime << '\t' << setprecision(6) << (clock() - cycClock)/(double)CLOCKS_PER_SEC << endl;                   
450
451                                         if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
452                                                 int live = 1;
453                                                 for(int i=1;i<ncpus;i++){
454                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
455                                                 }
456                                         }
457                                         else{
458                                                 int live = 0;
459                                                 for(int i=1;i<ncpus;i++){
460                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
461                                                 }
462                                         }
463                                         
464                                 }       
465                                 
466                                 cout << "\nFinalizing..." << endl;
467                                 fill();
468                                 setOTUs();
469                                 vector<int> otuCounts(numOTUs, 0);
470                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
471                                 calcCentroidsDriver(0, numOTUs);
472                                 writeQualities(otuCounts);
473                                 writeSequences(otuCounts);
474                                 writeNames(otuCounts);
475                                 writeClusters(otuCounts);
476                                 writeGroups();
477                                 
478                                 remove(distFileName.c_str());
479                                 remove(namesFileName.c_str());
480                                 remove(listFileName.c_str());
481                                 
482                                 cout << "Total time to process " << flowFileName << ":\t" << time(NULL) - begTime << '\t' << setprecision(6) << (clock() - begClock)/(double)CLOCKS_PER_SEC << endl;                    
483                         }
484
485                 }
486                 else{
487                         int abort = 1;
488                         bool live = 1;
489
490                         MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
491                         if(abort){      return 0;       }
492
493                         int numFiles;
494                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
495
496                         for(int i=0;i<numFiles;i++){
497                                 //Now into the pyrodist part
498                                 char fileName[1024];
499                                 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
500                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
501                                 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
502                                 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
503                                 
504                                 flowDataIntI.resize(numSeqs * numFlowCells);
505                                 flowDataPrI.resize(numSeqs * numFlowCells);
506                                 mapUniqueToSeq.resize(numSeqs);
507                                 mapSeqToUnique.resize(numSeqs);
508                                 lengths.resize(numSeqs);
509                                 jointLookUp.resize(NUMBINS * NUMBINS);
510
511                                 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
512                                 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
513                                 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
514                                 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
515                                 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
516                                 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
517                                 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
518
519                                 flowFileName = string(fileName);
520                                 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
521                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
522                                 
523                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
524
525                                 int done = 1;
526                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
527
528                                 //Now into the pyronoise part
529                                 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
530                                 
531                                 singleLookUp.resize(HOMOPS * NUMBINS);
532                                 uniqueFlowgrams.resize(numUniques * numFlowCells);
533                                 weight.resize(numOTUs);
534                                 centroids.resize(numOTUs);
535                                 change.resize(numOTUs);
536                                 dist.assign(numOTUs * numSeqs, 0);
537                                 nSeqsPerOTU.resize(numOTUs);
538                                 cumNumSeqs.resize(numOTUs);
539
540                                 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
541                                 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
542                                 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
543                                 
544                                 int startOTU = pid * numOTUs / ncpus;
545                                 int endOTU = (pid + 1) * numOTUs / ncpus;
546
547                                 int startSeq = pid * numSeqs / ncpus;
548                                 int endSeq = (pid + 1) * numSeqs /ncpus;
549                                 
550                                 int total;
551
552                                 while(live){
553
554                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
555                                         singleTau.assign(total, 0.0000);
556                                         seqNumber.assign(total, 0);
557                                         seqIndex.assign(total, 0);
558                                         
559                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
560                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
561                                         MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
562                                         MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
563                                         MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
564                                         MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
565                                         MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
566                                         
567                                         calcCentroidsDriver(startOTU, endOTU);
568
569                                         MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
570                                         MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
571
572                                         
573                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
574                                         MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
575                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
576
577                                         vector<int> otuIndex(total, 0);
578                                         calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
579                                         total = otuIndex.size();
580                                         
581                                         MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
582                                         MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
583                                         MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
584                                         MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
585                                         MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
586                                 
587                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
588                                 }
589                         }
590                 }               
591
592                 MPI_Barrier(MPI_COMM_WORLD);
593                 return 0;
594
595         }
596         catch(exception& e) {
597                 m->errorOut(e, "ShhherCommand", "execute");
598                 exit(1);
599         }
600 }
601
602 /**************************************************************************************************/
603
604 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
605         try{            
606                 ostringstream outStream;
607                 outStream.setf(ios::fixed, ios::floatfield);
608                 outStream.setf(ios::dec, ios::basefield);
609                 outStream.setf(ios::showpoint);
610                 outStream.precision(6);
611                 
612                 int begTime = time(NULL);
613                 double begClock = clock();
614                 
615                 for(int i=startSeq;i<stopSeq;i++){
616                         for(int j=0;j<i;j++){
617                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
618                                 
619                                 if(flowDistance < 1e-6){
620                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
621                                 }
622                                 else if(flowDistance <= cutoff){
623                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
624                                 }
625                         }
626                         if(i % 100 == 0){
627                                 cout << i << "\t" << (time(NULL) - begTime) << "\t" << (clock()-begClock)/CLOCKS_PER_SEC << endl;
628                         }
629                 }
630                 cout << stopSeq << "\t" << (time(NULL) - begTime) << "\t" << (clock()-begClock)/CLOCKS_PER_SEC << endl;
631                 
632                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.dist";
633                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
634
635                 ofstream distFile(fDistFileName.c_str());
636                 distFile << outStream.str();            
637                 distFile.close();
638                 
639                 return fDistFileName;
640         }
641         catch(exception& e) {
642                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
643                 exit(1);
644         }
645 }
646
647 #else
648 //**********************************************************************************************************************
649
650 int ShhherCommand::execute(){
651         try {
652                 if (abort == true) { return 0; }
653                 
654                 cout.setf(ios::fixed, ios::floatfield);
655                 cout.setf(ios::showpoint);
656                 
657                 getSingleLookUp();
658                 getJointLookUp();
659                                 
660                 vector<string> flowFileVector;
661                 if(flowFilesFileName != "not found"){
662                         string fName;
663                         
664                         ifstream flowFilesFile;
665                         m->openInputFile(flowFilesFileName, flowFilesFile);
666                         while(flowFilesFile){
667                                 flowFilesFile >> fName;
668                                 flowFileVector.push_back(fName);
669                                 m->gobble(flowFilesFile);
670                         }
671                 }
672                 else{
673                         flowFileVector.push_back(flowFileName);
674                 }
675                 int numFiles = flowFileVector.size();
676                 
677                 
678                 for(int i=0;i<numFiles;i++){
679                         flowFileName = flowFileVector[i];
680
681                         cout << "\n>>>>>\tProcessing " << flowFileName << " (file " << i+1 << " of " << numFiles << ")\t<<<<<" << endl;
682                         cout << "Reading flowgrams..." << endl;
683                         getFlowData();
684                         cout << "Identifying unique flowgrams..." << endl;
685                         getUniques();
686                         
687                         
688                         cout << "Calculating distances between flowgrams..." << endl;                   
689                         string distFileName = createDistFile(processors);
690                         string namesFileName = createNamesFile();
691                         
692                         cout << "\nClustering flowgrams..." << endl;
693                         string listFileName = cluster(distFileName, namesFileName);
694                         getOTUData(listFileName);
695                         
696                         initPyroCluster();
697                         
698                         double maxDelta = 0;
699                         int iter = 0;
700                         
701                         double begClock = clock();
702                         unsigned long int begTime = time(NULL);
703
704                         cout << "\nDenoising flowgrams..." << endl;
705                         cout << "iter\tmaxDelta\tnLL\t\tcycletime" << endl;
706                         
707                         while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
708                                 
709                                 double cycClock = clock();
710                                 unsigned long int cycTime = time(NULL);
711                                 fill();
712                                 
713                                 calcCentroids();
714                                 
715                                 maxDelta = getNewWeights();
716                                 double nLL = getLikelihood();
717                                 checkCentroids();
718                                 
719                                 calcNewDistances();
720
721                                 iter++;
722                                 
723                                 cout << iter << '\t' << maxDelta << '\t' << setprecision(2) << nLL << '\t' << time(NULL) - cycTime << '\t' << setprecision(6) << (clock() - cycClock)/(double)CLOCKS_PER_SEC << endl;                   
724                         }       
725                         
726                         cout << "\nFinalizing..." << endl;
727                         fill();
728                         setOTUs();
729                         
730                         vector<int> otuCounts(numOTUs, 0);
731                         for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
732                         
733                         calcCentroidsDriver(0, numOTUs);
734                         writeQualities(otuCounts);
735                         writeSequences(otuCounts);
736                         writeNames(otuCounts);
737                         writeClusters(otuCounts);
738                         writeGroups();
739                         
740                         remove(distFileName.c_str());
741                         remove(namesFileName.c_str());
742                         remove(listFileName.c_str());
743                         
744                         cout << "Total time to process " << flowFileName << ":\t" << time(NULL) - begTime << '\t' << setprecision(6) << (clock() - begClock)/(double)CLOCKS_PER_SEC << endl;                    
745                 }
746                 return 0;
747         }
748         catch(exception& e) {
749                 m->errorOut(e, "ShhherCommand", "execute");
750                 exit(1);
751         }
752 }
753 #endif
754 /**************************************************************************************************/
755
756 void ShhherCommand::getFlowData(){
757         try{
758                 ifstream flowFile;
759                 m->openInputFile(flowFileName, flowFile);
760                 
761                 string seqName;
762                 
763                 int currentNumFlowCells;
764                 
765                 float intensity;
766                 
767                 flowFile >> numFlowCells;
768                 int index = 0;//pcluster
769                 while(!flowFile.eof()){
770                         flowFile >> seqName >> currentNumFlowCells;
771                         lengths.push_back(currentNumFlowCells);
772
773                         seqNameVector.push_back(seqName);
774                         nameMap[seqName] = index++;//pcluster
775
776                         for(int i=0;i<numFlowCells;i++){
777                                 flowFile >> intensity;
778                                 if(intensity > 9.99)    {       intensity = 9.99;       }
779                                 int intI = int(100 * intensity + 0.0001);
780                                 flowDataIntI.push_back(intI);
781                         }
782                         m->gobble(flowFile);
783                 }
784                 flowFile.close();
785                 
786                 numSeqs = seqNameVector.size();         
787                 
788                 for(int i=0;i<numSeqs;i++){
789                         int iNumFlowCells = i * numFlowCells;
790                         for(int j=lengths[i];j<numFlowCells;j++){
791                                 flowDataIntI[iNumFlowCells + j] = 0;
792                         }
793                 }
794                 
795         }
796         catch(exception& e) {
797                 m->errorOut(e, "ShhherCommand", "getFlowData");
798                 exit(1);
799         }
800 }
801
802 /**************************************************************************************************/
803
804 void ShhherCommand::getSingleLookUp(){
805         try{
806                 //      these are the -log probabilities that a signal corresponds to a particular homopolymer length
807                 singleLookUp.assign(HOMOPS * NUMBINS, 0);
808                 
809                 int index = 0;
810                 ifstream lookUpFile;
811                 m->openInputFile(lookupFileName, lookUpFile);
812                 
813                 for(int i=0;i<HOMOPS;i++){
814                         float logFracFreq;
815                         lookUpFile >> logFracFreq;
816                         
817                         for(int j=0;j<NUMBINS;j++)      {
818                                 lookUpFile >> singleLookUp[index];
819                                 index++;
820                         }
821                 }       
822                 lookUpFile.close();
823         }
824         catch(exception& e) {
825                 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
826                 exit(1);
827         }
828 }
829
830 /**************************************************************************************************/
831
832 void ShhherCommand::getJointLookUp(){
833         try{
834                 
835                 //      the most likely joint probability (-log) that two intenities have the same polymer length
836                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
837                 
838                 for(int i=0;i<NUMBINS;i++){
839                         for(int j=0;j<NUMBINS;j++){             
840                                 
841                                 float minSum = 10000000000;
842                                 
843                                 for(int k=0;k<HOMOPS;k++){
844                                         float sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
845                                         
846                                         if(sum < minSum)        {       minSum = sum;           }
847                                 }       
848                                 jointLookUp[i * NUMBINS + j] = minSum;
849                         }
850                 }
851         }
852         catch(exception& e) {
853                 m->errorOut(e, "ShhherCommand", "getJointLookUp");
854                 exit(1);
855         }
856 }
857
858 /**************************************************************************************************/
859
860 float ShhherCommand::getProbIntensity(int intIntensity){                          
861         try{
862                 float minNegLogProb = 10000000000; 
863                 
864                 for(int i=0;i<HOMOPS;i++){//loop signal strength
865                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
866                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
867                 }
868                 
869                 return minNegLogProb;
870         }
871         catch(exception& e) {
872                 m->errorOut(e, "ShhherCommand", "getProbIntensity");
873                 exit(1);
874         }
875 }
876
877 /**************************************************************************************************/
878
879 void ShhherCommand::getUniques(){
880         try{
881                 
882                 
883                 numUniques = 0;
884                 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
885                 uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
886                 uniqueLengths.assign(numSeqs, 0);
887                 mapSeqToUnique.assign(numSeqs, -1);
888                 mapUniqueToSeq.assign(numSeqs, -1);
889                 
890                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
891                 
892                 for(int i=0;i<numSeqs;i++){
893                         int index = 0;
894                         
895                         vector<short> current(numFlowCells);
896                         for(int j=0;j<numFlowCells;j++){        current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));        }
897                                                 
898                         for(int j=0;j<numUniques;j++){
899                                 int offset = j * numFlowCells;
900                                 bool toEnd = 1;
901                                 
902                                 for(int k=0;k<numFlowCells;k++){
903                                         if(current[k] != uniqueFlowgrams[offset + k]){
904                                                 toEnd = 0;
905                                                 break;
906                                         }
907                                 }
908                                 
909                                 if(toEnd){
910                                         mapSeqToUnique[i] = j;
911                                         uniqueCount[j]++;
912                                         index = j;
913                                         break;
914                                 }
915                                 index++;
916                         }
917                         
918                         if(index == numUniques){
919                                 uniqueLengths[numUniques] = lengths[i];
920                                 uniqueCount[numUniques] = 1;
921                                 mapSeqToUnique[i] = numUniques;//anMap
922                                 mapUniqueToSeq[numUniques] = i;//anF
923                                 
924                                 for(int k=0;k<numFlowCells;k++){
925                                         uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
926                                         uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
927                                 }
928                                 
929                                 numUniques++;
930                         }
931                 }
932                 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
933                 uniqueLengths.resize(numUniques);       
934                 
935                 flowDataPrI.assign(numSeqs * numFlowCells, 0);
936                 for(int i=0;i<flowDataPrI.size();i++)   {       flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);             }
937         }
938         catch(exception& e) {
939                 m->errorOut(e, "ShhherCommand", "getUniques");
940                 exit(1);
941         }
942 }
943
944 /**************************************************************************************************/
945
946 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
947         try{
948                 int minLength = lengths[mapSeqToUnique[seqA]];
949                 if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
950                 
951                 int ANumFlowCells = seqA * numFlowCells;
952                 int BNumFlowCells = seqB * numFlowCells;
953                 
954                 float dist = 0;
955                 
956                 for(int i=0;i<minLength;i++){
957                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
958                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
959                         
960                         int flowBIntI = flowDataIntI[BNumFlowCells + i];
961                         float flowBPrI = flowDataPrI[BNumFlowCells + i];
962                         dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
963                 }
964                 
965                 dist /= (float) minLength;
966                 return dist;
967         }
968         catch(exception& e) {
969                 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
970                 exit(1);
971         }
972 }
973
974 /**************************************************************************************************/
975
976 void ShhherCommand::flowDistParentFork(string distFileName, int startSeq, int stopSeq){
977         try{            
978
979                 ostringstream outStream;
980                 outStream.setf(ios::fixed, ios::floatfield);
981                 outStream.setf(ios::dec, ios::basefield);
982                 outStream.setf(ios::showpoint);
983                 outStream.precision(6);
984                 
985                 int begTime = time(NULL);
986                 double begClock = clock();
987
988                 for(int i=startSeq;i<stopSeq;i++){
989                         for(int j=0;j<i;j++){
990                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
991
992                                 if(flowDistance < 1e-6){
993                                         outStream << seqNameVector[mapUniqueToSeq[i]] << '\t' << seqNameVector[mapUniqueToSeq[j]] << '\t' << 0.000000 << endl;
994                                 }
995                                 else if(flowDistance <= cutoff){
996                                         outStream << seqNameVector[mapUniqueToSeq[i]] << '\t' << seqNameVector[mapUniqueToSeq[j]] << '\t' << flowDistance << endl;
997                                 }
998                         }
999                         if(i % 100 == 0){
1000                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
1001                                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
1002                                 m->mothurOutEndLine();
1003                         }
1004                 }
1005                 m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
1006                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
1007                 m->mothurOutEndLine();
1008                 
1009                 ofstream distFile(distFileName.c_str());
1010                 distFile << outStream.str();            
1011                 distFile.close();
1012         }
1013         catch(exception& e) {
1014                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
1015                 exit(1);
1016         }
1017 }
1018
1019 /**************************************************************************************************/
1020
1021 string ShhherCommand::createDistFile(int processors){
1022         try{
1023                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.dist";
1024                                 
1025                 unsigned long int begTime = time(NULL);
1026                 double begClock = clock();
1027
1028                 vector<int> start;
1029                 vector<int> end;
1030                 
1031 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1032                 if(processors == 1)     {       flowDistParentFork(fDistFileName, 0, numUniques);               }
1033                 else{ //you have multiple processors
1034                         
1035                         if (numSeqs < processors){      processors = 1; }
1036                         
1037                         vector<int> start(processors, 0);
1038                         vector<int> end(processors, 0);
1039                         
1040                         for (int i = 0; i < processors; i++) {
1041                                 start[i] = int(sqrt(float(i)/float(processors)) * numUniques);
1042                                 end[i] = int(sqrt(float(i+1)/float(processors)) * numUniques);
1043                         }
1044                         
1045                         int process = 1;
1046                         vector<int> processIDs;
1047                         
1048                         //loop through and create all the processes you want
1049                         while (process != processors) {
1050                                 int pid = fork();
1051                                 
1052                                 if (pid > 0) {
1053                                         processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1054                                         process++;
1055                                 }else if (pid == 0){
1056                                         flowDistParentFork(fDistFileName + toString(getpid()) + ".temp", start[process], end[process]);
1057                                         exit(0);
1058                                 }else { 
1059                                         m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
1060                                         perror(" : ");
1061                                         for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
1062                                         exit(0);
1063                                 }
1064                         }
1065                         
1066                         //parent does its part
1067                         flowDistParentFork(fDistFileName, start[0], end[0]);
1068                         
1069                         //force parent to wait until all the processes are done
1070                         for (int i=0;i<processIDs.size();i++) { 
1071                                 int temp = processIDs[i];
1072                                 wait(&temp);
1073                         }
1074                         
1075                         //append and remove temp files
1076                         for (int i=0;i<processIDs.size();i++) { 
1077                                 m->appendFiles((fDistFileName + toString(processIDs[i]) + ".temp"), fDistFileName);
1078                                 remove((fDistFileName + toString(processIDs[i]) + ".temp").c_str());
1079                         }
1080                         
1081                 }
1082                 
1083 #else
1084                 flowDistParentFork(fDistFileName, 0, numUniques);
1085 #endif
1086
1087                 m->mothurOutEndLine();
1088                 
1089                 cout << "Total time: " << (time(NULL) - begTime) << "\t"  << (clock() - begClock)/CLOCKS_PER_SEC << endl;;
1090
1091                 return fDistFileName;
1092         }
1093         catch(exception& e) {
1094                 m->errorOut(e, "ShhherCommand", "createDistFile");
1095                 exit(1);
1096         }
1097         
1098 }
1099
1100 /**************************************************************************************************/
1101
1102 string ShhherCommand::createNamesFile(){
1103         try{
1104                 
1105                 vector<string> duplicateNames(numUniques, "");
1106                 for(int i=0;i<numSeqs;i++){
1107                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
1108                 }
1109                 
1110                 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.names";
1111                 
1112                 ofstream nameFile;
1113                 m->openOutputFile(nameFileName, nameFile);
1114                 
1115                 for(int i=0;i<numUniques;i++){
1116 //                      nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
1117                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
1118                 }
1119                 
1120                 nameFile.close();
1121                 return  nameFileName;
1122         }
1123         catch(exception& e) {
1124                 m->errorOut(e, "ShhherCommand", "createNamesFile");
1125                 exit(1);
1126         }
1127 }
1128
1129 //**********************************************************************************************************************
1130
1131 string ShhherCommand::cluster(string distFileName, string namesFileName){
1132         try {
1133                 
1134                 SparseMatrix* matrix;
1135                 ListVector* list;
1136                 RAbundVector* rabund;
1137                 
1138                 globaldata->setNameFile(namesFileName);
1139                 globaldata->setColumnFile(distFileName);
1140                 globaldata->setFormat("column");
1141                 
1142                 ReadMatrix* read = new ReadColumnMatrix(distFileName);  
1143                 read->setCutoff(cutoff);
1144                 
1145                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1146                 clusterNameMap->readMap();
1147                 read->read(clusterNameMap);
1148                 
1149                 list = read->getListVector();
1150                 matrix = read->getMatrix();
1151                 
1152                 delete read; 
1153                 delete clusterNameMap; 
1154                                 
1155                 rabund = new RAbundVector(list->getRAbundVector());
1156                 
1157                 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
1158                 string tag = cluster->getTag();
1159                 
1160                 double clusterCutoff = cutoff;
1161                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1162                         cluster->update(clusterCutoff);
1163                         float dist = matrix->getSmallDist();
1164                 }
1165                 
1166                 list->setLabel(toString(cutoff));
1167                 
1168                 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.list";
1169                 ofstream listFile;
1170                 m->openOutputFile(listFileName, listFile);
1171                 list->print(listFile);
1172                 listFile.close();
1173                 
1174                 delete matrix;  delete cluster; delete rabund; delete list;
1175         
1176                 return listFileName;
1177         }
1178         catch(exception& e) {
1179                 m->errorOut(e, "ShhherCommand", "cluster");
1180                 exit(1);        
1181         }               
1182 }
1183
1184 /**************************************************************************************************/
1185
1186 void ShhherCommand::getOTUData(string listFileName){
1187         try {
1188
1189                 ifstream listFile;
1190                 m->openInputFile(listFileName, listFile);
1191                 string label;
1192                 
1193                 listFile >> label >> numOTUs;
1194
1195                 otuData.assign(numSeqs, 0);
1196                 cumNumSeqs.assign(numOTUs, 0);
1197                 nSeqsPerOTU.assign(numOTUs, 0);
1198                 aaP.resize(numOTUs);
1199                 
1200                 string singleOTU = "";
1201                 
1202                 for(int i=0;i<numOTUs;i++){
1203
1204                         listFile >> singleOTU;
1205                         
1206                         istringstream otuString(singleOTU);
1207
1208                         while(otuString){
1209                                 
1210                                 string seqName = "";
1211                                 
1212                                 for(int j=0;j<singleOTU.length();j++){
1213                                         char letter = otuString.get();
1214                                         
1215                                         if(letter != ','){
1216                                                 seqName += letter;
1217                                         }
1218                                         else{
1219                                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
1220                                                 int index = nmIt->second;
1221                                                 
1222                                                 nameMap.erase(nmIt);
1223                                                 
1224                                                 otuData[index] = i;
1225                                                 nSeqsPerOTU[i]++;
1226                                                 aaP[i].push_back(index);
1227                                                 seqName = "";
1228                                         }
1229                                 }
1230                                 
1231                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
1232
1233                                 int index = nmIt->second;
1234                                 nameMap.erase(nmIt);
1235
1236                                 otuData[index] = i;
1237                                 nSeqsPerOTU[i]++;
1238                                 aaP[i].push_back(index);        
1239                                 
1240                                 otuString.get();
1241                         }
1242                         
1243                         sort(aaP[i].begin(), aaP[i].end());
1244                         for(int j=0;j<nSeqsPerOTU[i];j++){
1245                                 seqNumber.push_back(aaP[i][j]);
1246                         }
1247                         for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
1248                                 aaP[i].push_back(0);
1249                         }
1250                 }
1251                 
1252                 for(int i=1;i<numOTUs;i++){
1253                         cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
1254                 }
1255                 aaI = aaP;
1256                 seqIndex = seqNumber;
1257                 
1258                 listFile.close();       
1259         }
1260         catch(exception& e) {
1261                 m->errorOut(e, "ShhherCommand", "getOTUData");
1262                 exit(1);        
1263         }               
1264 }
1265
1266 /**************************************************************************************************/
1267
1268 void ShhherCommand::initPyroCluster(){                          
1269         try{
1270                 dist.assign(numSeqs * numOTUs, 0);
1271                 change.assign(numOTUs, 1);
1272                 centroids.assign(numOTUs, -1);
1273                 weight.assign(numOTUs, 0);
1274                 singleTau.assign(numSeqs, 1.0);
1275                 
1276                 nSeqsBreaks.assign(processors+1, 0);
1277                 nOTUsBreaks.assign(processors+1, 0);
1278
1279                 nSeqsBreaks[0] = 0;
1280                 for(int i=0;i<processors;i++){
1281                         nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
1282                         nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
1283                 }
1284                 nSeqsBreaks[processors] = numSeqs;
1285                 nOTUsBreaks[processors] = numOTUs;
1286         }
1287         catch(exception& e) {
1288                 m->errorOut(e, "ShhherCommand", "initPyroCluster");
1289                 exit(1);        
1290         }
1291 }
1292
1293 /**************************************************************************************************/
1294
1295 void ShhherCommand::fill(){
1296         try {
1297                 int index = 0;
1298                 for(int i=0;i<numOTUs;i++){
1299                         cumNumSeqs[i] = index;
1300                         for(int j=0;j<nSeqsPerOTU[i];j++){
1301                                 seqNumber[index] = aaP[i][j];
1302                                 seqIndex[index] = aaI[i][j];
1303                                 
1304                                 index++;
1305                         }
1306                 }
1307         }
1308         catch(exception& e) {
1309                 m->errorOut(e, "ShhherCommand", "fill");
1310                 exit(1);        
1311         }               
1312 }
1313
1314 /**************************************************************************************************/
1315
1316 void ShhherCommand::calcCentroids(){                          
1317         try{
1318                 
1319 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1320
1321                 if(processors == 1)     {
1322                         calcCentroidsDriver(0, numOTUs);                
1323                 }
1324                 else{ //you have multiple processors
1325                         if (numOTUs < processors){      processors = 1; }
1326                         
1327                         int process = 1;
1328                         vector<int> processIDs;
1329                         
1330                         //loop through and create all the processes you want
1331                         while (process != processors) {
1332                                 int pid = vfork();
1333                                 
1334                                 if (pid > 0) {
1335                                         processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1336                                         process++;
1337                                 }else if (pid == 0){
1338                                         calcCentroidsDriver(nOTUsBreaks[process], nOTUsBreaks[process+1]);
1339                                         exit(0);
1340                                 }else { 
1341                                         m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
1342                                         perror(" : ");
1343                                         for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
1344                                         exit(0);
1345                                 }
1346                         }
1347                         
1348                         //parent does its part
1349                         calcCentroidsDriver(nOTUsBreaks[0], nOTUsBreaks[1]);
1350
1351                         //force parent to wait until all the processes are done
1352                         for (int i=0;i<processIDs.size();i++) { 
1353                                 int temp = processIDs[i];
1354                                 wait(&temp);
1355                         }
1356                 }
1357                 
1358 #else
1359                 calcCentroidsDriver(0, numOTUs);
1360 #endif          
1361         }
1362         catch(exception& e) {
1363                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1364                 exit(1);        
1365         }               
1366 }
1367
1368 /**************************************************************************************************/
1369
1370 void ShhherCommand::calcCentroidsDriver(int start, int finish){                          
1371         
1372         //this function gets the most likely homopolymer length at a flow position for a group of sequences
1373         //within an otu
1374         
1375         try{
1376                 
1377                 for(int i=start;i<finish;i++){
1378                         
1379                         double count = 0;
1380                         int position = 0;
1381                         int minFlowGram = 100000000;
1382                         double minFlowValue = 1e8;
1383                         change[i] = 0; //FALSE
1384                         
1385                         for(int j=0;j<nSeqsPerOTU[i];j++){
1386                                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1387                         }
1388                         
1389                         if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1390                                 vector<double> adF(nSeqsPerOTU[i]);
1391                                 vector<int> anL(nSeqsPerOTU[i]);
1392                                 
1393                                 for(int j=0;j<nSeqsPerOTU[i];j++){
1394                                         int index = cumNumSeqs[i] + j;
1395                                         int nI = seqIndex[index];
1396                                         int nIU = mapSeqToUnique[nI];
1397                                         
1398                                         int k;
1399                                         for(k=0;k<position;k++){
1400                                                 if(nIU == anL[k]){
1401                                                         break;
1402                                                 }
1403                                         }
1404                                         if(k == position){
1405                                                 anL[position] = nIU;
1406                                                 adF[position] = 0.0000;
1407                                                 position++;
1408                                         }                                               
1409                                 }
1410                                 
1411                                 for(int j=0;j<nSeqsPerOTU[i];j++){
1412                                         int index = cumNumSeqs[i] + j;
1413                                         int nI = seqIndex[index];
1414                                         int nIU = mapSeqToUnique[nI];
1415                                         
1416                                         double tauValue = singleTau[seqNumber[index]];
1417                                         
1418                                         for(int k=0;k<position;k++){
1419                                                 double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1420                                                 adF[k] += dist * tauValue;
1421                                         }
1422                                 }
1423                                 
1424                                 for(int j=0;j<position;j++){
1425                                         if(adF[j] < minFlowValue){
1426                                                 minFlowGram = j;
1427                                                 minFlowValue = adF[j];
1428                                         }
1429                                 }
1430                                 
1431                                 if(centroids[i] != anL[minFlowGram]){
1432                                         change[i] = 1;
1433                                         centroids[i] = anL[minFlowGram];
1434                                 }
1435                         }
1436                         else if(centroids[i] != -1){
1437                                 change[i] = 1;
1438                                 centroids[i] = -1;                      
1439                         }
1440                 }
1441         }
1442         catch(exception& e) {
1443                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1444                 exit(1);        
1445         }               
1446 }
1447
1448 /**************************************************************************************************/
1449
1450 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1451         try{
1452                 
1453                 int flowAValue = cent * numFlowCells;
1454                 int flowBValue = flow * numFlowCells;
1455                 
1456                 double dist = 0;
1457                 
1458                 for(int i=0;i<length;i++){
1459                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1460                         flowAValue++;
1461                         flowBValue++;
1462                 }
1463                 
1464                 return dist / (double)length;
1465         }
1466         catch(exception& e) {
1467                 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1468                 exit(1);        
1469         }               
1470 }
1471
1472 /**************************************************************************************************/
1473
1474 double ShhherCommand::getNewWeights(){
1475         try{
1476                 
1477                 double maxChange = 0;
1478                 
1479                 for(int i=0;i<numOTUs;i++){
1480                         
1481                         double difference = weight[i];
1482                         weight[i] = 0;
1483                         
1484                         for(int j=0;j<nSeqsPerOTU[i];j++){
1485                                 int index = cumNumSeqs[i] + j;
1486                                 int nI = seqIndex[index];
1487                                 double tauValue = singleTau[seqNumber[index]];
1488                                 weight[i] += tauValue;
1489                         }
1490                         
1491                         difference = fabs(weight[i] - difference);
1492                         if(difference > maxChange){     maxChange = difference; }
1493                 }
1494                 return maxChange;
1495         }
1496         catch(exception& e) {
1497                 m->errorOut(e, "ShhherCommand", "getNewWeights");
1498                 exit(1);        
1499         }               
1500 }
1501
1502 /**************************************************************************************************/
1503
1504 double ShhherCommand::getLikelihood(){
1505         
1506         try{
1507                 
1508                 vector<long double> P(numSeqs, 0);
1509                 int effNumOTUs = 0;
1510                 
1511                 for(int i=0;i<numOTUs;i++){
1512                         if(weight[i] > MIN_WEIGHT){
1513                                 effNumOTUs++;
1514                         }
1515                 }
1516                 
1517                 string hold;
1518                 for(int i=0;i<numOTUs;i++){
1519                         for(int j=0;j<nSeqsPerOTU[i];j++){
1520                                 int index = cumNumSeqs[i] + j;
1521                                 int nI = seqIndex[index];
1522                                 double singleDist = dist[seqNumber[index]];
1523                                 
1524                                 P[nI] += weight[i] * exp(-singleDist * sigma);
1525                         }
1526                 }
1527                 double nLL = 0.00;
1528                 for(int i=0;i<numSeqs;i++){
1529                         if(P[i] == 0){  P[i] = DBL_EPSILON;     }
1530
1531                         nLL += -log(P[i]);
1532                 }
1533                 
1534                 nLL = nLL -(double)numSeqs * log(sigma);
1535
1536                 return nLL; 
1537         }
1538         catch(exception& e) {
1539                 m->errorOut(e, "ShhherCommand", "getNewWeights");
1540                 exit(1);        
1541         }               
1542 }
1543
1544 /**************************************************************************************************/
1545
1546 void ShhherCommand::checkCentroids(){
1547         try{
1548                 vector<int> unique(numOTUs, 1);
1549                 
1550                 for(int i=0;i<numOTUs;i++){
1551                         if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1552                                 unique[i] = -1;
1553                         }
1554                 }
1555                 
1556                 for(int i=0;i<numOTUs;i++){
1557                         if(unique[i] == 1){
1558                                 for(int j=i+1;j<numOTUs;j++){
1559                                         if(unique[j] == 1){
1560                                                 
1561                                                 if(centroids[j] == centroids[i]){
1562                                                         unique[j] = 0;
1563                                                         centroids[j] = -1;
1564                                                         
1565                                                         weight[i] += weight[j];
1566                                                         weight[j] = 0.0;
1567                                                 }
1568                                         }
1569                                 }
1570                         }
1571                 }
1572         }
1573         catch(exception& e) {
1574                 m->errorOut(e, "ShhherCommand", "checkCentroids");
1575                 exit(1);        
1576         }               
1577 }
1578
1579 /**************************************************************************************************/
1580
1581 void ShhherCommand::calcNewDistances(){                          
1582         try{
1583                 
1584 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1585                 
1586                 if(processors == 1)     {
1587                         calcNewDistancesParent(0, numSeqs);             
1588                 }
1589                 else{ //you have multiple processors
1590                         if (numSeqs < processors){      processors = 1; }
1591                         
1592                         vector<vector<int> > child_otuIndex(processors);
1593                         vector<vector<int> > child_seqIndex(processors);
1594                         vector<vector<double> > child_singleTau(processors);                    
1595                         vector<int> totals(processors);
1596                         
1597                         int process = 1;
1598                         vector<int> processIDs;
1599
1600                         //loop through and create all the processes you want
1601                         while (process != processors) {
1602                                 int pid = vfork();
1603                                 
1604                                 if (pid > 0) {
1605                                         processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1606                                         process++;
1607                                 }else if (pid == 0){
1608                                         calcNewDistancesChild(nSeqsBreaks[process], nSeqsBreaks[process+1], child_otuIndex[process], child_seqIndex[process], child_singleTau[process]);
1609                                         totals[process] = child_otuIndex[process].size();
1610
1611                                         exit(0);
1612                                 }else { 
1613                                         m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
1614                                         perror(" : ");
1615                                         for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
1616                                         exit(0);
1617                                 }
1618                         }
1619                         
1620                         //parent does its part
1621                         calcNewDistancesParent(nSeqsBreaks[0], nSeqsBreaks[1]);
1622                         int total = seqIndex.size();
1623
1624                         //force parent to wait until all the processes are done
1625                         for (int i=0;i<processIDs.size();i++) { 
1626                                 int temp = processIDs[i];
1627                                 wait(&temp);
1628                         }
1629
1630                         for(int i=1;i<processors;i++){
1631                                 int oldTotal = total;
1632                                 total += totals[i];
1633
1634                                 singleTau.resize(total, 0);
1635                                 seqIndex.resize(total, 0);
1636                                 seqNumber.resize(total, 0);
1637                                 
1638                                 int childIndex = 0;
1639                                 
1640                                 for(int j=oldTotal;j<total;j++){
1641                                         int otuI = child_otuIndex[i][childIndex];
1642                                         int seqI = child_seqIndex[i][childIndex];
1643
1644                                         singleTau[j] = child_singleTau[i][childIndex];
1645                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
1646                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
1647                                         nSeqsPerOTU[otuI]++;
1648
1649                                         childIndex++;
1650                                 }
1651                         }
1652                 }
1653 #else
1654                 calcNewDistancesParent(0, numSeqs);             
1655 #endif          
1656         }
1657         catch(exception& e) {
1658                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
1659                 exit(1);        
1660         }               
1661 }
1662
1663 /**************************************************************************************************/
1664 #ifdef USE_MPI
1665 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1666         
1667         try{
1668                 vector<double> newTau(numOTUs,0);
1669                 vector<double> norms(numSeqs, 0);
1670                 otuIndex.resize(0);
1671                 seqIndex.resize(0);
1672                 singleTau.resize(0);
1673                 
1674                 
1675                 
1676                 for(int i=startSeq;i<stopSeq;i++){
1677                         double offset = 1e8;
1678                         int indexOffset = i * numOTUs;
1679                         
1680                         for(int j=0;j<numOTUs;j++){
1681                                 
1682                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1683                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1684                                 }
1685                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1686                                         offset = dist[indexOffset + j];
1687                                 }
1688                         }
1689                         
1690                         for(int j=0;j<numOTUs;j++){
1691                                 if(weight[j] > MIN_WEIGHT){
1692                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1693                                         norms[i] += newTau[j];
1694                                 }
1695                                 else{
1696                                         newTau[j] = 0.0;
1697                                 }
1698                         }
1699                         
1700                         for(int j=0;j<numOTUs;j++){
1701
1702                                 newTau[j] /= norms[i];
1703                                 
1704                                 if(newTau[j] > MIN_TAU){
1705                                         otuIndex.push_back(j);
1706                                         seqIndex.push_back(i);
1707                                         singleTau.push_back(newTau[j]);
1708                                 }
1709                         }
1710                         
1711                 }
1712         }
1713         catch(exception& e) {
1714                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1715                 exit(1);        
1716         }               
1717 }
1718 #endif
1719 /**************************************************************************************************/
1720
1721 void ShhherCommand::calcNewDistancesChild(int startSeq, int stopSeq, vector<int>& child_otuIndex, vector<int>& child_seqIndex, vector<double>& child_singleTau){
1722         
1723         try{
1724                 vector<double> newTau(numOTUs,0);
1725                 vector<double> norms(numSeqs, 0);
1726                 child_otuIndex.resize(0);
1727                 child_seqIndex.resize(0);
1728                 child_singleTau.resize(0);
1729                 
1730                 for(int i=startSeq;i<stopSeq;i++){
1731                         double offset = 1e8;
1732                         int indexOffset = i * numOTUs;
1733                         
1734                         
1735                         for(int j=0;j<numOTUs;j++){
1736                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1737                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1738                                 }
1739                                 
1740                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1741                                         offset = dist[indexOffset + j];
1742                                 }
1743                         }
1744                         
1745                         for(int j=0;j<numOTUs;j++){
1746                                 if(weight[j] > MIN_WEIGHT){
1747                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1748                                         norms[i] += newTau[j];
1749                                 }
1750                                 else{
1751                                         newTau[j] = 0.0;
1752                                 }
1753                         }
1754                         
1755                         for(int j=0;j<numOTUs;j++){
1756                                 newTau[j] /= norms[i];
1757                                 
1758                                 if(newTau[j] > MIN_TAU){
1759                                         child_otuIndex.push_back(j);
1760                                         child_seqIndex.push_back(i);
1761                                         child_singleTau.push_back(newTau[j]);
1762                                 }
1763                         }
1764                 }
1765         }
1766         catch(exception& e) {
1767                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChild");
1768                 exit(1);        
1769         }               
1770 }
1771
1772 /**************************************************************************************************/
1773
1774 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1775         
1776         try{
1777                 
1778                 int total = 0;
1779                 vector<double> newTau(numOTUs,0);
1780                 vector<double> norms(numSeqs, 0);
1781                 nSeqsPerOTU.assign(numOTUs, 0);
1782                 
1783                 for(int i=startSeq;i<stopSeq;i++){
1784                         int indexOffset = i * numOTUs;
1785                         
1786                         double offset = 1e8;
1787                         
1788                         for(int j=0;j<numOTUs;j++){
1789                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1790                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1791                                 }
1792                                 
1793                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1794                                         offset = dist[indexOffset + j];
1795                                 }
1796                         }
1797                         
1798                         for(int j=0;j<numOTUs;j++){
1799                                 if(weight[j] > MIN_WEIGHT){
1800                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1801                                         norms[i] += newTau[j];
1802                                 }
1803                                 else{
1804                                         newTau[j] = 0.0;
1805                                 }
1806                         }
1807                         
1808                         for(int j=0;j<numOTUs;j++){
1809                                 newTau[j] /= norms[i];
1810                         }
1811                         
1812                         for(int j=0;j<numOTUs;j++){
1813                                 if(newTau[j] > MIN_TAU){
1814                                         
1815                                         int oldTotal = total;
1816                                         
1817                                         total++;
1818                                         
1819                                         singleTau.resize(total, 0);
1820                                         seqNumber.resize(total, 0);
1821                                         seqIndex.resize(total, 0);
1822                                         
1823                                         singleTau[oldTotal] = newTau[j];
1824                                         
1825                                         aaP[j][nSeqsPerOTU[j]] = oldTotal;
1826                                         aaI[j][nSeqsPerOTU[j]] = i;
1827                                         nSeqsPerOTU[j]++;
1828                                 }
1829                         }
1830                 }
1831         }
1832         catch(exception& e) {
1833                 m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1834                 exit(1);        
1835         }               
1836 }
1837
1838 /**************************************************************************************************/
1839
1840 void ShhherCommand::setOTUs(){
1841         
1842         try {
1843                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1844                 
1845                 for(int i=0;i<numOTUs;i++){
1846                         for(int j=0;j<nSeqsPerOTU[i];j++){
1847                                 int index = cumNumSeqs[i] + j;
1848                                 double tauValue = singleTau[seqNumber[index]];
1849                                 int sIndex = seqIndex[index];
1850                                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
1851                         }
1852                 }
1853                 
1854                 for(int i=0;i<numSeqs;i++){
1855                         double maxTau = -1.0000;
1856                         int maxOTU = -1;
1857                         for(int j=0;j<numOTUs;j++){
1858                                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1859                                         maxTau = bigTauMatrix[i * numOTUs + j];
1860                                         maxOTU = j;
1861                                 }
1862                         }
1863                         
1864                         otuData[i] = maxOTU;
1865                 }
1866                 
1867                 nSeqsPerOTU.assign(numOTUs, 0);         
1868                 
1869                 for(int i=0;i<numSeqs;i++){
1870                         int index = otuData[i];
1871                         
1872                         singleTau[i] = 1.0000;
1873                         dist[i] = 0.0000;
1874                         
1875                         aaP[index][nSeqsPerOTU[index]] = i;
1876                         aaI[index][nSeqsPerOTU[index]] = i;
1877                         
1878                         nSeqsPerOTU[index]++;
1879                 }
1880                 fill(); 
1881         }
1882         catch(exception& e) {
1883                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
1884                 exit(1);        
1885         }               
1886 }
1887
1888 /**************************************************************************************************/
1889
1890 void ShhherCommand::writeQualities(vector<int> otuCounts){
1891         
1892         try {
1893                 string qualityFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.qual";
1894
1895                 ofstream qualityFile;
1896                 m->openOutputFile(qualityFileName, qualityFile);
1897
1898                 qualityFile.setf(ios::fixed, ios::floatfield);
1899                 qualityFile.setf(ios::showpoint);
1900                 qualityFile << setprecision(6);
1901                 
1902                 vector<vector<int> > qualities(numOTUs);
1903                 vector<double> pr(HOMOPS, 0);
1904                 
1905                 int index = 0;
1906                 
1907                 for(int i=0;i<numOTUs;i++){
1908                         int index = 0;
1909                         int base = 0;
1910                         
1911                         if(nSeqsPerOTU[i] > 0){
1912                                 qualities[i].assign(1024, -1);
1913                                 
1914                                 while(index < numFlowCells){
1915                                         double maxPrValue = 1e8;
1916                                         short maxPrIndex = -1;
1917                                         double count = 0.0000;
1918                                         
1919                                         pr.assign(HOMOPS, 0);
1920                                         
1921                                         for(int j=0;j<nSeqsPerOTU[i];j++){
1922                                                 int lIndex = cumNumSeqs[i] + j;
1923                                                 double tauValue = singleTau[seqNumber[lIndex]];
1924                                                 int sequenceIndex = aaI[i][j];
1925                                                 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1926                                                 
1927                                                 count += tauValue;
1928                                                 
1929                                                 for(int s=0;s<HOMOPS;s++){
1930                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1931                                                 }
1932                                         }
1933                                         
1934                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1935                                         maxPrValue = pr[maxPrIndex];
1936                                         
1937                                         if(count > MIN_COUNT){
1938                                                 double U = 0.0000;
1939                                                 double norm = 0.0000;
1940                                                 
1941                                                 for(int s=0;s<HOMOPS;s++){
1942                                                         norm += exp(-(pr[s] - maxPrValue));
1943                                                 }
1944                                                 
1945                                                 for(int s=1;s<=maxPrIndex;s++){
1946                                                         int value = 0;
1947                                                         double temp = 0.0000;
1948                                                         
1949                                                         U += exp(-(pr[s-1]-maxPrValue))/norm;
1950                                                         
1951                                                         if(U>0.00){
1952                                                                 temp = log10(U);
1953                                                         }
1954                                                         else{
1955                                                                 temp = -10.1;
1956                                                         }
1957                                                         temp = floor(-10 * temp);
1958                                                         value = (int)floor(temp);
1959                                                         if(value > 100){        value = 100;    }
1960                                                         
1961                                                         qualities[i][base] = (int)value;
1962                                                         base++;
1963                                                 }
1964                                         }
1965                                         
1966                                         index++;
1967                                 }
1968                         }
1969                         
1970                         
1971                         if(otuCounts[i] > 0){
1972                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1973                                 
1974                                 int j=4;        //need to get past the first four bases
1975                                 while(qualities[i][j] != -1){
1976                                         qualityFile << qualities[i][j] << ' ';
1977                                         j++;
1978                                 }
1979                                 qualityFile << endl;
1980                         }
1981                 }
1982                 qualityFile.close();
1983                 
1984         }
1985         catch(exception& e) {
1986                 m->errorOut(e, "ShhherCommand", "writeQualities");
1987                 exit(1);        
1988         }               
1989 }
1990
1991 /**************************************************************************************************/
1992
1993 void ShhherCommand::writeSequences(vector<int> otuCounts){
1994         try {
1995                 
1996                 string bases = "TACG";
1997                 
1998                 string fastaFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.fasta";
1999                 ofstream fastaFile;
2000                 m->openOutputFile(fastaFileName, fastaFile);
2001                 
2002                 vector<string> names(numOTUs, "");
2003                 
2004                 for(int i=0;i<numOTUs;i++){
2005                         int index = centroids[i];
2006                         
2007                         if(otuCounts[i] > 0){
2008                                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
2009                                 
2010                                 for(int j=8;j<numFlowCells;j++){
2011                                         
2012                                         char base = bases[j % 4];
2013                                         for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
2014                                                 fastaFile << base;
2015                                         }
2016                                 }
2017                                 fastaFile << endl;
2018                         }
2019                 }
2020                 fastaFile.close();
2021         }
2022         catch(exception& e) {
2023                 m->errorOut(e, "ShhherCommand", "writeSequences");
2024                 exit(1);        
2025         }               
2026 }
2027
2028 /**************************************************************************************************/
2029
2030 void ShhherCommand::writeNames(vector<int> otuCounts){
2031         try {
2032                 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.final.names";
2033                 ofstream nameFile;
2034                 m->openOutputFile(nameFileName, nameFile);
2035                 
2036                 for(int i=0;i<numOTUs;i++){
2037                         if(otuCounts[i] > 0){
2038                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
2039                                 
2040                                 for(int j=1;j<nSeqsPerOTU[i];j++){
2041                                         nameFile << ',' << seqNameVector[aaI[i][j]];
2042                                 }
2043                                 
2044                                 nameFile << endl;
2045                         }
2046                 }
2047                 nameFile.close();
2048         }
2049         catch(exception& e) {
2050                 m->errorOut(e, "ShhherCommand", "writeNames");
2051                 exit(1);        
2052         }               
2053 }
2054
2055 /**************************************************************************************************/
2056
2057 void ShhherCommand::writeGroups(){
2058         try {
2059                 string fileRoot = flowFileName.substr(0,flowFileName.find_last_of('.'));
2060                 string groupFileName = fileRoot + ".pn.groups";
2061                 ofstream groupFile;
2062                 m->openOutputFile(groupFileName, groupFile);
2063                 
2064                 for(int i=0;i<numSeqs;i++){
2065                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
2066                 }
2067                 groupFile.close();
2068         }
2069         catch(exception& e) {
2070                 m->errorOut(e, "ShhherCommand", "writeGroups");
2071                 exit(1);        
2072         }               
2073 }
2074
2075 /**************************************************************************************************/
2076
2077 void ShhherCommand::writeClusters(vector<int> otuCounts){
2078         try {
2079                 string otuCountsFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.counts";
2080                 ofstream otuCountsFile;
2081                 m->openOutputFile(otuCountsFileName, otuCountsFile);
2082                 
2083                 string bases = "TACG";
2084                 
2085                 for(int i=0;i<numOTUs;i++){
2086                         //output the translated version of the centroid sequence for the otu
2087                         if(otuCounts[i] > 0){
2088                                 int index = centroids[i];
2089                                 
2090                                 otuCountsFile << "ideal\t";
2091                                 for(int j=8;j<numFlowCells;j++){
2092                                         char base = bases[j % 4];
2093                                         for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
2094                                                 otuCountsFile << base;
2095                                         }
2096                                 }
2097                                 otuCountsFile << endl;
2098                                 
2099                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2100                                         int sequence = aaI[i][j];
2101                                         otuCountsFile << seqNameVector[sequence] << '\t';
2102                                         
2103                                         for(int k=8;k<lengths[sequence];k++){
2104                                                 char base = bases[k % 4];
2105                                                 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
2106                                                 
2107                                                 for(int s=0;s<freq;s++){
2108                                                         otuCountsFile << base;
2109                                                 }
2110                                         }
2111                                         otuCountsFile << endl;
2112                                 }
2113                                 otuCountsFile << endl;
2114                         }
2115                 }
2116                 otuCountsFile.close();
2117         }
2118         catch(exception& e) {
2119                 m->errorOut(e, "ShhherCommand", "writeClusters");
2120                 exit(1);        
2121         }               
2122 }
2123
2124 //**********************************************************************************************************************