]> git.donarmstrong.com Git - mothur.git/blob - shhhercommand.cpp
added shhh.seqs command
[mothur.git] / shhhercommand.cpp
1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "shhhercommand.h"
11
12 #include "readcolumn.h"
13 #include "readmatrix.hpp"
14 #include "rabundvector.hpp"
15 #include "sabundvector.hpp"
16 #include "listvector.hpp"
17 #include "cluster.hpp"
18 #include "sparsematrix.hpp"
19
20 //**********************************************************************************************************************
21
22 #define NUMBINS 1000
23 #define HOMOPS 10
24 #define MIN_COUNT 0.1
25 #define MIN_WEIGHT 0.1
26 #define MIN_TAU 0.0001
27 #define MIN_ITER 10
28
29 //**********************************************************************************************************************
30
31 vector<string> ShhherCommand::getValidParameters(){     
32         try {
33                 string Array[] =  {     
34                         "file", "flow", "lookup", "cutoff", "sigma", "outputdir","inputdir", "processors"       
35                 };
36                 
37                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
38                 return myArray;
39         }
40         catch(exception& e) {
41                 m->errorOut(e, "ShhherCommand", "getValidParameters");
42                 exit(1);
43         }
44 }
45
46 //**********************************************************************************************************************
47
48 ShhherCommand::ShhherCommand(){ 
49         try {
50                 abort = true;
51                 
52                 //initialize outputTypes
53                 vector<string> tempOutNames;
54                 outputTypes["pn.dist"] = tempOutNames;
55
56         }
57         catch(exception& e) {
58                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
59                 exit(1);
60         }
61 }
62
63 //**********************************************************************************************************************
64
65 vector<string> ShhherCommand::getRequiredParameters(){  
66         try {
67                 string Array[] =  {"flow"};
68                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
69                 return myArray;
70         }
71         catch(exception& e) {
72                 m->errorOut(e, "ShhherCommand", "getRequiredParameters");
73                 exit(1);
74         }
75 }
76
77 //**********************************************************************************************************************
78
79 vector<string> ShhherCommand::getRequiredFiles(){       
80         try {
81                 vector<string> myArray;
82                 return myArray;
83         }
84         catch(exception& e) {
85                 m->errorOut(e, "ShhherCommand", "getRequiredFiles");
86                 exit(1);
87         }
88 }
89
90 //**********************************************************************************************************************
91
92 ShhherCommand::ShhherCommand(string option) {
93         try {
94
95 #ifdef USE_MPI
96                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
97                 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
98
99                 if(pid == 0){
100 #endif
101                 
102                 
103                 abort = false;
104                 
105                 
106                 //allow user to run help
107                 if(option == "help") { help(); abort = true; }
108                 
109                 else {
110                         
111                         //valid paramters for this command
112                         string AlignArray[] =  {
113                                 "file", "flow", "lookup", "cutoff", "sigma", "outputdir","inputdir", "processors"       
114                         };
115                         
116                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
117                         
118                         OptionParser parser(option);
119                         map<string,string> parameters = parser.getParameters();
120                         
121                         ValidParameters validParameter;
122                         map<string,string>::iterator it;
123                         
124                         //check to make sure all parameters are valid for command
125                         for (it = parameters.begin(); it != parameters.end(); it++) { 
126                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
127                         }
128                         
129                         //initialize outputTypes
130                         vector<string> tempOutNames;
131                         outputTypes["pn.dist"] = tempOutNames;
132                         //                      outputTypes["fasta"] = tempOutNames;
133                         
134                         //if the user changes the input directory command factory will send this info to us in the output parameter 
135                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
136                         if (inputDir == "not found"){   inputDir = "";          }
137                         else {
138                                 string path;
139                                 it = parameters.find("flow");
140                                 //user has given a template file
141                                 if(it != parameters.end()){ 
142                                         path = m->hasPath(it->second);
143                                         //if the user has not given a path then, add inputdir. else leave path alone.
144                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
145                                 }
146                                 
147                                 it = parameters.find("lookup");
148                                 //user has given a template file
149                                 if(it != parameters.end()){ 
150                                         path = m->hasPath(it->second);
151                                         //if the user has not given a path then, add inputdir. else leave path alone.
152                                         if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
153                                 }
154
155                                 it = parameters.find("file");
156                                 //user has given a template file
157                                 if(it != parameters.end()){ 
158                                         path = m->hasPath(it->second);
159                                         //if the user has not given a path then, add inputdir. else leave path alone.
160                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
161                                 }
162                         }
163                         
164                         
165                         //check for required parameters
166                         flowFileName = validParameter.validFile(parameters, "flow", true);
167                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
168                         if (flowFileName == "not found" && flowFilesFileName == "not found") {
169                                 m->mothurOut("values for either flow or file must be provided for the shhh.seqs command.");
170                                 m->mothurOutEndLine();
171                                 abort = true; 
172                         }
173                         else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }       
174                         
175                         //if the user changes the output directory command factory will send this info to us in the output parameter 
176                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
177                                 outputDir = ""; 
178                                 outputDir += m->hasPath(flowFileName); //if user entered a file with a path then preserve it    
179                         }
180                         
181                         
182                         //check for optional parameter and set defaults
183                         // ...at some point should added some additional type checking...
184                         string temp;
185                         temp = validParameter.validFile(parameters, "lookup", true);
186                         if (temp == "not found")        {       lookupFileName = "LookUp_E123.pat";     }
187                         else if(temp == "not open")     {       abort = true;                   } 
188                         else                                            {       lookupFileName = temp;  }
189                         
190                         temp = validParameter.validFile(parameters, "processors", false);if (temp == "not found"){      temp = "1";                     }
191                         convert(temp, processors); 
192
193                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
194                         convert(temp, cutoff); 
195                         
196                         temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
197                         convert(temp, minDelta); 
198
199                         temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
200                         convert(temp, maxIters); 
201
202                         temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
203                         convert(temp, sigma); 
204                         
205                         globaldata = GlobalData::getInstance();
206                 }
207                         
208 #ifdef USE_MPI
209                 }                               
210 #endif
211                                 
212         }
213         catch(exception& e) {
214                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
215                 exit(1);
216         }
217 }
218
219 //**********************************************************************************************************************
220
221 ShhherCommand::~ShhherCommand(){}
222
223 //**********************************************************************************************************************
224
225 void ShhherCommand::help(){
226         try {
227                 m->mothurOut("The shhher command reads a file containing flowgrams and creates a file of corrected sequences.\n");
228         }
229         catch(exception& e) {
230                 m->errorOut(e, "ShhherCommand", "help");
231                 exit(1);
232         }
233 }
234
235 //**********************************************************************************************************************
236 #ifdef USE_MPI
237 int ShhherCommand::execute(){
238         try {
239                 int tag = 1976;
240                 MPI_Status status; 
241
242                 double begClock = clock();
243                 unsigned long int begTime = time(NULL);
244
245                 cout.setf(ios::fixed, ios::floatfield);
246                 cout.setf(ios::showpoint);
247                 cout << setprecision(2);
248                 
249                 
250                 if(pid == 0){
251
252                         for(int i=1;i<ncpus;i++){
253                                 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
254                         }
255                         if(abort == 1){ return 0;       }
256
257                         processors = ncpus;
258                         
259                         cout << "\nGetting preliminary data..." << endl;
260                         getSingleLookUp();
261                         getJointLookUp();
262                                                 
263                         vector<string> flowFileVector;
264                         if(flowFilesFileName != "not found"){
265                                 string fName;
266
267                                 ifstream flowFilesFile;
268                                 m->openInputFile(flowFilesFileName, flowFilesFile);
269                                 while(flowFilesFile){
270                                         flowFilesFile >> fName;
271                                         flowFileVector.push_back(fName);
272                                         m->gobble(flowFilesFile);
273                                 }
274                         }
275                         else{
276                                 flowFileVector.push_back(flowFileName);
277                         }
278                         int numFiles = flowFileVector.size();
279
280                         for(int i=1;i<ncpus;i++){
281                                 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
282                         }
283                         
284                         for(int i=0;i<numFiles;i++){
285                                 flowFileName = flowFileVector[i];
286                         
287                                 cout << "\n>>>>>\tProcessing " << flowFileName << " (file " << i+1 << " of " << numFiles << ")\t<<<<<" << endl;
288                                 cout << "Reading flowgrams..." << endl;
289                                 getFlowData();
290                                 cout << "Identifying unique flowgrams..." << endl;
291                                 getUniques();
292
293                                 cout << "Calculating distances between flowgrams..." << endl;
294                                 char fileName[1024];
295                                 strcpy(fileName, flowFileName.c_str());
296
297                                 for(int i=1;i<ncpus;i++){
298                                         MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
299
300                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
301                                         MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
302                                         MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
303                                         MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
304                                         MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
305                                         MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
306                                         MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
307                                         MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
308                                         MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
309                                         MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
310                                 }                               
311                                                         
312                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
313                                 
314                                 int done;
315                                 for(int i=1;i<ncpus;i++){
316                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
317                                         
318                                         m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
319                                         remove((distFileName + ".temp." + toString(i)).c_str());
320                                 }
321
322                                 string namesFileName = createNamesFile();
323                                 
324                                 cout << "\nClustering flowgrams..." << endl;
325                                 string listFileName = cluster(distFileName, namesFileName);
326         //                      string listFileName = "PriestPot_C7.pn.list";
327         //                      string listFileName = "test.mock_rep3.v69.pn.list";
328                         
329                                 getOTUData(listFileName);
330                                 initPyroCluster();
331
332                                 for(int i=1;i<ncpus;i++){
333                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
334                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
335                                         MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
336                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
337                                 }
338                                 
339                                 
340                                 double maxDelta = 0;
341                                 int iter = 0;
342                                 
343                                 int numOTUsOnCPU = numOTUs / ncpus;
344                                 int numSeqsOnCPU = numSeqs / ncpus;
345                                 
346                                 cout << "\nDenoising flowgrams..." << endl;
347                                 cout << "iter\tmaxDelta\tnLL\t\tcycletime" << endl;
348                                 
349                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
350
351                                         double cycClock = clock();
352                                         unsigned long int cycTime = time(NULL);
353                                         fill();
354
355                                         int total = singleTau.size();
356                                         for(int i=1;i<ncpus;i++){
357                                                 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
358                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
359                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
360                                                 
361                                                 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
362                                                 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
363                                                 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
364                                                 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
365                                                 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
366                                         }
367                                 
368                                         calcCentroidsDriver(0, numOTUsOnCPU);
369                                         
370                                         for(int i=1;i<ncpus;i++){
371                                                 int otuStart = i * numOTUs / ncpus;
372                                                 int otuStop = (i + 1) * numOTUs / ncpus;
373                                                 
374                                                 vector<int> tempCentroids(numOTUs, 0);
375                                                 vector<short> tempChange(numOTUs, 0);
376                                                 
377                                                 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
378                                                 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
379                                                 
380                                                 for(int j=otuStart;j<otuStop;j++){
381                                                         centroids[j] = tempCentroids[j];
382                                                         change[j] = tempChange[j];
383                                                 }
384                                         }
385                                                                         
386                                         maxDelta = getNewWeights();
387                                         double nLL = getLikelihood();
388                                         checkCentroids();
389                                         
390                                         for(int i=1;i<ncpus;i++){
391                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
392                                                 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
393                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
394                                         }
395                                         
396                                         calcNewDistancesParent(0, numSeqsOnCPU);
397
398                                         total = singleTau.size();
399
400                                         for(int i=1;i<ncpus;i++){
401                                                 int childTotal;
402                                                 int seqStart = i * numSeqs / ncpus;
403                                                 int seqStop = (i + 1) * numSeqs / ncpus;
404                                                 
405                                                 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
406
407                                                 vector<int> childSeqIndex(childTotal, 0);
408                                                 vector<double> childSingleTau(childTotal, 0);
409                                                 vector<double> childDist(numSeqs * numOTUs, 0);
410                                                 vector<int> otuIndex(childTotal, 0);
411                                                 
412                                                 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
413                                                 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
414                                                 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
415                                                 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
416
417                                                 int oldTotal = total;
418                                                 total += childTotal;
419                                                 singleTau.resize(total, 0);
420                                                 seqIndex.resize(total, 0);
421                                                 seqNumber.resize(total, 0);
422                                                 
423                                                 int childIndex = 0;
424                                                 
425                                                 for(int j=oldTotal;j<total;j++){
426                                                         int otuI = otuIndex[childIndex];
427                                                         int seqI = childSeqIndex[childIndex];
428                                                         
429                                                         singleTau[j] = childSingleTau[childIndex];
430                                                         
431                                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
432                                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
433                                                         nSeqsPerOTU[otuI]++;
434                                                         childIndex++;
435                                                 }
436                                                 
437                                                 int index = seqStart * numOTUs;
438                                                 for(int j=seqStart;j<seqStop;j++){
439                                                         for(int k=0;k<numOTUs;k++){
440                                                                 dist[index] = childDist[index];
441                                                                 index++;
442                                                         }
443                                                 }                                       
444                                         }
445                                         
446                                         iter++;
447                                         
448                                         cout << iter << '\t' << maxDelta << '\t' << setprecision(2) << nLL << '\t' << time(NULL) - cycTime << '\t' << setprecision(6) << (clock() - cycClock)/(double)CLOCKS_PER_SEC << endl;                   
449
450                                         if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
451                                                 int live = 1;
452                                                 for(int i=1;i<ncpus;i++){
453                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
454                                                 }
455                                         }
456                                         else{
457                                                 int live = 0;
458                                                 for(int i=1;i<ncpus;i++){
459                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
460                                                 }
461                                         }
462                                         
463                                 }       
464                                 
465                                 cout << "\nFinalizing..." << endl;
466                                 fill();
467                                 setOTUs();
468                                 vector<int> otuCounts(numOTUs, 0);
469                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
470                                 calcCentroidsDriver(0, numOTUs);
471                                 writeQualities(otuCounts);
472                                 writeSequences(otuCounts);
473                                 writeNames(otuCounts);
474                                 writeClusters(otuCounts);
475                                 writeGroups();
476                                 
477                                 remove(distFileName.c_str());
478                                 remove(namesFileName.c_str());
479                                 remove(listFileName.c_str());
480                                 
481                                 cout << "Total time to process " << flowFileName << ":\t" << time(NULL) - begTime << '\t' << setprecision(6) << (clock() - begClock)/(double)CLOCKS_PER_SEC << endl;                    
482                         }
483
484                 }
485                 else{
486                         int abort = 1;
487                         bool live = 1;
488
489                         MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
490                         if(abort){      return 0;       }
491
492                         int numFiles;
493                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
494
495                         for(int i=0;i<numFiles;i++){
496                                 //Now into the pyrodist part
497                                 char fileName[1024];
498                                 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
499                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
500                                 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
501                                 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
502                                 
503                                 flowDataIntI.resize(numSeqs * numFlowCells);
504                                 flowDataPrI.resize(numSeqs * numFlowCells);
505                                 mapUniqueToSeq.resize(numSeqs);
506                                 mapSeqToUnique.resize(numSeqs);
507                                 lengths.resize(numSeqs);
508                                 jointLookUp.resize(NUMBINS * NUMBINS);
509
510                                 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
511                                 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
512                                 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
513                                 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
514                                 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
515                                 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
516                                 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
517
518                                 flowFileName = string(fileName);
519                                 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
520                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
521                                 
522                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
523
524                                 int done = 1;
525                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
526
527                                 //Now into the pyronoise part
528                                 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
529                                 
530                                 singleLookUp.resize(HOMOPS * NUMBINS);
531                                 uniqueFlowgrams.resize(numUniques * numFlowCells);
532                                 weight.resize(numOTUs);
533                                 centroids.resize(numOTUs);
534                                 change.resize(numOTUs);
535                                 dist.assign(numOTUs * numSeqs, 0);
536                                 nSeqsPerOTU.resize(numOTUs);
537                                 cumNumSeqs.resize(numOTUs);
538
539                                 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
540                                 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
541                                 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
542                                 
543                                 int startOTU = pid * numOTUs / ncpus;
544                                 int endOTU = (pid + 1) * numOTUs / ncpus;
545
546                                 int startSeq = pid * numSeqs / ncpus;
547                                 int endSeq = (pid + 1) * numSeqs /ncpus;
548                                 
549                                 int total;
550
551                                 while(live){
552
553                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
554                                         singleTau.assign(total, 0.0000);
555                                         seqNumber.assign(total, 0);
556                                         seqIndex.assign(total, 0);
557                                         
558                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
559                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
560                                         MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
561                                         MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
562                                         MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
563                                         MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
564                                         MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
565                                         
566                                         calcCentroidsDriver(startOTU, endOTU);
567
568                                         MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
569                                         MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
570
571                                         
572                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
573                                         MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
574                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
575
576                                         vector<int> otuIndex(total, 0);
577                                         calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
578                                         total = otuIndex.size();
579                                         
580                                         MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
581                                         MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
582                                         MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
583                                         MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
584                                         MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
585                                 
586                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
587                                 }
588                         }
589                 }               
590
591                 MPI_Barrier(MPI_COMM_WORLD);
592                 return 0;
593
594         }
595         catch(exception& e) {
596                 m->errorOut(e, "ShhherCommand", "execute");
597                 exit(1);
598         }
599 }
600
601 /**************************************************************************************************/
602
603 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
604         try{            
605                 ostringstream outStream;
606                 outStream.setf(ios::fixed, ios::floatfield);
607                 outStream.setf(ios::dec, ios::basefield);
608                 outStream.setf(ios::showpoint);
609                 outStream.precision(6);
610                 
611                 int begTime = time(NULL);
612                 double begClock = clock();
613                 
614                 for(int i=startSeq;i<stopSeq;i++){
615                         for(int j=0;j<i;j++){
616                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
617                                 
618                                 if(flowDistance < 1e-6){
619                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
620                                 }
621                                 else if(flowDistance <= cutoff){
622                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
623                                 }
624                         }
625                         if(i % 100 == 0){
626                                 cout << i << "\t" << (time(NULL) - begTime) << "\t" << (clock()-begClock)/CLOCKS_PER_SEC << endl;
627                         }
628                 }
629                 cout << stopSeq << "\t" << (time(NULL) - begTime) << "\t" << (clock()-begClock)/CLOCKS_PER_SEC << endl;
630                 
631                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.dist";
632                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
633
634                 ofstream distFile(fDistFileName.c_str());
635                 distFile << outStream.str();            
636                 distFile.close();
637                 
638                 return fDistFileName;
639         }
640         catch(exception& e) {
641                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
642                 exit(1);
643         }
644 }
645
646 #else
647 //**********************************************************************************************************************
648
649 int ShhherCommand::execute(){
650         try {
651                 if (abort == true) { return 0; }
652                 
653                 cout.setf(ios::fixed, ios::floatfield);
654                 cout.setf(ios::showpoint);
655                 
656                 getSingleLookUp();
657                 getJointLookUp();
658                 
659                 
660                 vector<string> flowFileVector;
661                 if(flowFilesFileName != "not found"){
662                         string fName;
663                         
664                         ifstream flowFilesFile;
665                         m->openInputFile(flowFilesFileName, flowFilesFile);
666                         while(flowFilesFile){
667                                 flowFilesFile >> fName;
668                                 flowFileVector.push_back(fName);
669                                 m->gobble(flowFilesFile);
670                         }
671                 }
672                 else{
673                         flowFileVector.push_back(flowFileName);
674                 }
675                 int numFiles = flowFileVector.size();
676                 
677                 
678                 for(int i=0;i<numFiles;i++){
679                         flowFileName = flowFileVector[i];
680
681                         cout << "\n>>>>>\tProcessing " << flowFileName << " (file " << i+1 << " of " << numFiles << ")\t<<<<<" << endl;
682                         cout << "Reading flowgrams..." << endl;
683                         getFlowData();
684                         cout << "Identifying unique flowgrams..." << endl;
685                         getUniques();
686                         
687                         
688                         cout << "Calculating distances between flowgrams..." << endl;                   
689                         string distFileName = createDistFile(processors);
690                         string namesFileName = createNamesFile();
691                         
692                         cout << "\nClustering flowgrams..." << endl;
693                         string listFileName = cluster(distFileName, namesFileName);
694                         getOTUData(listFileName);
695                         
696                         initPyroCluster();
697                         
698                         double maxDelta = 0;
699                         int iter = 0;
700                         
701                         double begClock = clock();
702                         unsigned long int begTime = time(NULL);
703
704                         cout << "\nDenoising flowgrams..." << endl;
705                         cout << "iter\tmaxDelta\tnLL\t\tcycletime" << endl;
706                         
707                         while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
708                                 
709                                 double cycClock = clock();
710                                 unsigned long int cycTime = time(NULL);
711                                 fill();
712                                 
713                                 calcCentroids();
714                                 
715                                 maxDelta = getNewWeights();
716                                 double nLL = getLikelihood();
717                                 checkCentroids();
718                                 
719                                 calcNewDistances();
720
721                                 iter++;
722                                 
723                                 cout << iter << '\t' << maxDelta << '\t' << setprecision(2) << nLL << '\t' << time(NULL) - cycTime << '\t' << setprecision(6) << (clock() - cycClock)/(double)CLOCKS_PER_SEC << endl;                   
724                         }       
725                         
726                         cout << "\nFinalizing..." << endl;
727                         fill();
728                         setOTUs();
729                         
730                         vector<int> otuCounts(numOTUs, 0);
731                         for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
732                         
733                         calcCentroidsDriver(0, numOTUs);
734                         writeQualities(otuCounts);
735                         writeSequences(otuCounts);
736                         writeNames(otuCounts);
737                         writeClusters(otuCounts);
738                         writeGroups();
739                         
740                         remove(distFileName.c_str());
741                         remove(namesFileName.c_str());
742                         remove(listFileName.c_str());
743                         
744                         cout << "Total time to process " << flowFileName << ":\t" << time(NULL) - begTime << '\t' << setprecision(6) << (clock() - begClock)/(double)CLOCKS_PER_SEC << endl;                    
745                 }
746                 return 0;
747         }
748         catch(exception& e) {
749                 m->errorOut(e, "ShhherCommand", "execute");
750                 exit(1);
751         }
752 }
753 #endif
754 /**************************************************************************************************/
755
756 void ShhherCommand::getFlowData(){
757         try{
758                 ifstream flowFile;
759                 m->openInputFile(flowFileName, flowFile);
760                 
761                 string seqName;
762                 
763                 int currentNumFlowCells;
764                 
765                 float intensity;
766                 
767                 flowFile >> numFlowCells;
768                 int index = 0;//pcluster
769                 while(!flowFile.eof()){
770                         flowFile >> seqName >> currentNumFlowCells;
771                         lengths.push_back(currentNumFlowCells);
772
773                         seqNameVector.push_back(seqName);
774                         nameMap[seqName] = index++;//pcluster
775
776                         for(int i=0;i<numFlowCells;i++){
777                                 flowFile >> intensity;
778                                 if(intensity > 9.99)    {       intensity = 9.99;       }
779                                 int intI = int(100 * intensity + 0.0001);
780                                 flowDataIntI.push_back(intI);
781                         }
782                         m->gobble(flowFile);
783                 }
784                 flowFile.close();
785                 
786                 numSeqs = seqNameVector.size();         
787                 
788                 for(int i=0;i<numSeqs;i++){
789                         int iNumFlowCells = i * numFlowCells;
790                         for(int j=lengths[i];j<numFlowCells;j++){
791                                 flowDataIntI[iNumFlowCells + j] = 0;
792                         }
793                 }
794                 
795         }
796         catch(exception& e) {
797                 m->errorOut(e, "ShhherCommand", "getFlowData");
798                 exit(1);
799         }
800 }
801
802 /**************************************************************************************************/
803
804 void ShhherCommand::getSingleLookUp(){
805         try{
806                 //      these are the -log probabilities that a signal corresponds to a particular homopolymer length
807                 singleLookUp.assign(HOMOPS * NUMBINS, 0);
808                 
809                 int index = 0;
810                 ifstream lookUpFile;
811                 m->openInputFile(lookupFileName, lookUpFile);
812                 
813                 for(int i=0;i<HOMOPS;i++){
814                         float logFracFreq;
815                         lookUpFile >> logFracFreq;
816                         
817                         for(int j=0;j<NUMBINS;j++)      {
818                                 lookUpFile >> singleLookUp[index];
819                                 index++;
820                         }
821                 }       
822                 lookUpFile.close();
823         }
824         catch(exception& e) {
825                 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
826                 exit(1);
827         }
828 }
829
830 /**************************************************************************************************/
831
832 void ShhherCommand::getJointLookUp(){
833         try{
834                 
835                 //      the most likely joint probability (-log) that two intenities have the same polymer length
836                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
837                 
838                 for(int i=0;i<NUMBINS;i++){
839                         for(int j=0;j<NUMBINS;j++){             
840                                 
841                                 float minSum = 10000000000;
842                                 
843                                 for(int k=0;k<HOMOPS;k++){
844                                         float sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
845                                         
846                                         if(sum < minSum)        {       minSum = sum;           }
847                                 }       
848                                 jointLookUp[i * NUMBINS + j] = minSum;
849                         }
850                 }
851         }
852         catch(exception& e) {
853                 m->errorOut(e, "ShhherCommand", "getJointLookUp");
854                 exit(1);
855         }
856 }
857
858 /**************************************************************************************************/
859
860 float ShhherCommand::getProbIntensity(int intIntensity){                          
861         try{
862                 float minNegLogProb = 10000000000; 
863                 
864                 for(int i=0;i<HOMOPS;i++){//loop signal strength
865                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
866                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
867                 }
868                 
869                 return minNegLogProb;
870         }
871         catch(exception& e) {
872                 m->errorOut(e, "ShhherCommand", "getProbIntensity");
873                 exit(1);
874         }
875 }
876
877 /**************************************************************************************************/
878
879 void ShhherCommand::getUniques(){
880         try{
881                 
882                 
883                 numUniques = 0;
884                 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
885                 uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
886                 uniqueLengths.assign(numSeqs, 0);
887                 mapSeqToUnique.assign(numSeqs, -1);
888                 mapUniqueToSeq.assign(numSeqs, -1);
889                 
890                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
891                 
892                 for(int i=0;i<numSeqs;i++){
893                         int index = 0;
894                         
895                         vector<short> current(numFlowCells);
896                         for(int j=0;j<numFlowCells;j++){        current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));        }
897                                                 
898                         for(int j=0;j<numUniques;j++){
899                                 int offset = j * numFlowCells;
900                                 bool toEnd = 1;
901                                 
902                                 for(int k=0;k<numFlowCells;k++){
903                                         if(current[k] != uniqueFlowgrams[offset + k]){
904                                                 toEnd = 0;
905                                                 break;
906                                         }
907                                 }
908                                 
909                                 if(toEnd){
910                                         mapSeqToUnique[i] = j;
911                                         uniqueCount[j]++;
912                                         index = j;
913                                         break;
914                                 }
915                                 index++;
916                         }
917                         
918                         if(index == numUniques){
919                                 uniqueLengths[numUniques] = lengths[i];
920                                 uniqueCount[numUniques] = 1;
921                                 mapSeqToUnique[i] = numUniques;//anMap
922                                 mapUniqueToSeq[numUniques] = i;//anF
923                                 
924                                 for(int k=0;k<numFlowCells;k++){
925                                         uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
926                                         uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
927                                 }
928                                 
929                                 numUniques++;
930                         }
931                 }
932                 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
933                 uniqueLengths.resize(numUniques);       
934                 
935                 flowDataPrI.assign(numSeqs * numFlowCells, 0);
936                 for(int i=0;i<flowDataPrI.size();i++)   {       flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);             }
937         }
938         catch(exception& e) {
939                 m->errorOut(e, "ShhherCommand", "getUniques");
940                 exit(1);
941         }
942 }
943
944 /**************************************************************************************************/
945
946 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
947         try{
948                 int minLength = lengths[mapSeqToUnique[seqA]];
949                 if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
950                 
951                 int ANumFlowCells = seqA * numFlowCells;
952                 int BNumFlowCells = seqB * numFlowCells;
953                 
954                 float dist = 0;
955                 
956                 for(int i=0;i<minLength;i++){
957                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
958                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
959                         
960                         int flowBIntI = flowDataIntI[BNumFlowCells + i];
961                         float flowBPrI = flowDataPrI[BNumFlowCells + i];
962                         dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
963                 }
964                 
965                 dist /= (float) minLength;
966                 return dist;
967         }
968         catch(exception& e) {
969                 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
970                 exit(1);
971         }
972 }
973
974 /**************************************************************************************************/
975
976 void ShhherCommand::flowDistParentFork(string distFileName, int startSeq, int stopSeq){
977         try{            
978
979                 ostringstream outStream;
980                 outStream.setf(ios::fixed, ios::floatfield);
981                 outStream.setf(ios::dec, ios::basefield);
982                 outStream.setf(ios::showpoint);
983                 outStream.precision(6);
984                 
985                 int begTime = time(NULL);
986                 double begClock = clock();
987
988                 for(int i=startSeq;i<stopSeq;i++){
989                         for(int j=0;j<i;j++){
990                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
991
992                                 if(flowDistance < 1e-6){
993                                         outStream << seqNameVector[mapUniqueToSeq[i]] << '\t' << seqNameVector[mapUniqueToSeq[j]] << '\t' << 0.000000 << endl;
994                                 }
995                                 else if(flowDistance <= cutoff){
996                                         outStream << seqNameVector[mapUniqueToSeq[i]] << '\t' << seqNameVector[mapUniqueToSeq[j]] << '\t' << flowDistance << endl;
997                                 }
998                         }
999                         if(i % 100 == 0){
1000                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
1001                                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
1002                                 m->mothurOutEndLine();
1003                         }
1004                 }
1005                 m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
1006                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
1007                 m->mothurOutEndLine();
1008                 
1009                 ofstream distFile((distFileName + "temp." + toString(pid)).c_str());
1010                 distFile << outStream.str();            
1011                 distFile.close();
1012         }
1013         catch(exception& e) {
1014                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
1015                 exit(1);
1016         }
1017 }
1018
1019 /**************************************************************************************************/
1020
1021 string ShhherCommand::createDistFile(int processors){
1022         try{
1023                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.dist";
1024                                 
1025                 unsigned long int begTime = time(NULL);
1026                 double begClock = clock();
1027
1028                 vector<int> start;
1029                 vector<int> end;
1030                 
1031 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1032                 if(processors == 1)     {       flowDistParentFork(fDistFileName, 0, numUniques);               }
1033                 else{ //you have multiple processors
1034                         
1035                         if (numSeqs < processors){      processors = 1; }
1036                         
1037                         vector<int> start(processors, 0);
1038                         vector<int> end(processors, 0);
1039                         
1040                         for (int i = 0; i < processors; i++) {
1041                                 start[i] = int(sqrt(float(i)/float(processors)) * numUniques);
1042                                 end[i] = int(sqrt(float(i+1)/float(processors)) * numUniques);
1043                         }
1044                         
1045                         int process = 1;
1046                         vector<int> processIDs;
1047                         
1048                         //loop through and create all the processes you want
1049                         while (process != processors) {
1050                                 int pid = fork();
1051                                 
1052                                 if (pid > 0) {
1053                                         processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1054                                         process++;
1055                                 }else if (pid == 0){
1056                                         flowDistParentFork(fDistFileName + toString(getpid()) + ".temp", start[process], end[process]);
1057                                         exit(0);
1058                                 }else { 
1059                                         m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
1060                                         perror(" : ");
1061                                         for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
1062                                         exit(0);
1063                                 }
1064                         }
1065                         
1066                         //parent does its part
1067                         flowDistParentFork(fDistFileName, start[0], end[0]);
1068                         
1069                         //force parent to wait until all the processes are done
1070                         for (int i=0;i<processIDs.size();i++) { 
1071                                 int temp = processIDs[i];
1072                                 wait(&temp);
1073                         }
1074                         
1075                         //append and remove temp files
1076                         for (int i=0;i<processIDs.size();i++) { 
1077                                 m->appendFiles((fDistFileName + toString(processIDs[i]) + ".temp"), fDistFileName);
1078                                 remove((fDistFileName + toString(processIDs[i]) + ".temp").c_str());
1079                         }
1080                         
1081                 }
1082                 
1083 #else
1084                 flowDistParentFork(fDistFileName, 0, numUniques);
1085 #endif
1086
1087                 m->mothurOutEndLine();
1088                 
1089                 cout << "Total time: " << (time(NULL) - begTime) << "\t"  << (clock() - begClock)/CLOCKS_PER_SEC << endl;;
1090
1091                 return fDistFileName;
1092         }
1093         catch(exception& e) {
1094                 m->errorOut(e, "ShhherCommand", "createDistFile");
1095                 exit(1);
1096         }
1097         
1098 }
1099
1100 /**************************************************************************************************/
1101
1102 string ShhherCommand::createNamesFile(){
1103         try{
1104                 
1105                 vector<string> duplicateNames(numUniques, "");
1106                 for(int i=0;i<numSeqs;i++){
1107                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
1108                 }
1109                 
1110                 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.names";
1111                 
1112                 ofstream nameFile;
1113                 m->openOutputFile(nameFileName, nameFile);
1114                 
1115                 for(int i=0;i<numUniques;i++){
1116 //                      nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
1117                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
1118                 }
1119                 
1120                 nameFile.close();
1121                 return  nameFileName;
1122         }
1123         catch(exception& e) {
1124                 m->errorOut(e, "ShhherCommand", "createNamesFile");
1125                 exit(1);
1126         }
1127 }
1128
1129 //**********************************************************************************************************************
1130
1131 string ShhherCommand::cluster(string distFileName, string namesFileName){
1132         try {
1133                 
1134                 SparseMatrix* matrix;
1135                 ListVector* list;
1136                 RAbundVector* rabund;
1137                 
1138                 globaldata->setNameFile(namesFileName);
1139                 globaldata->setColumnFile(distFileName);
1140                 globaldata->setFormat("column");
1141                 
1142                 ReadMatrix* read = new ReadColumnMatrix(distFileName);  
1143                 read->setCutoff(cutoff);
1144                 
1145                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1146                 clusterNameMap->readMap();
1147                 read->read(clusterNameMap);
1148                 
1149                 list = read->getListVector();
1150                 matrix = read->getMatrix();
1151                 
1152                 delete read; 
1153                 delete clusterNameMap; 
1154                                 
1155                 rabund = new RAbundVector(list->getRAbundVector());
1156                 
1157                 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
1158                 string tag = cluster->getTag();
1159                 
1160                 double clusterCutoff = cutoff;
1161                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1162                         cluster->update(clusterCutoff);
1163                         float dist = matrix->getSmallDist();
1164                 }
1165                 
1166                 list->setLabel(toString(cutoff));
1167                 
1168                 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.list";
1169                 ofstream listFile;
1170                 m->openOutputFile(listFileName, listFile);
1171                 list->print(listFile);
1172                 listFile.close();
1173                 
1174                 delete matrix;  delete cluster; delete rabund; delete list;
1175         
1176                 return listFileName;
1177         }
1178         catch(exception& e) {
1179                 m->errorOut(e, "ShhherCommand", "cluster");
1180                 exit(1);        
1181         }               
1182 }
1183
1184 /**************************************************************************************************/
1185
1186 void ShhherCommand::getOTUData(string listFileName){
1187         try {
1188
1189                 ifstream listFile;
1190                 m->openInputFile(listFileName, listFile);
1191                 string label;
1192                 
1193                 listFile >> label >> numOTUs;
1194
1195                 otuData.assign(numSeqs, 0);
1196                 cumNumSeqs.assign(numOTUs, 0);
1197                 nSeqsPerOTU.assign(numOTUs, 0);
1198                 aaP.resize(numOTUs);
1199                 
1200                 string singleOTU = "";
1201                 
1202                 for(int i=0;i<numOTUs;i++){
1203
1204                         listFile >> singleOTU;
1205                         
1206                         istringstream otuString(singleOTU);
1207
1208                         while(otuString){
1209                                 
1210                                 string seqName = "";
1211                                 
1212                                 for(int j=0;j<singleOTU.length();j++){
1213                                         char letter = otuString.get();
1214                                         
1215                                         if(letter != ','){
1216                                                 seqName += letter;
1217                                         }
1218                                         else{
1219                                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
1220                                                 int index = nmIt->second;
1221                                                 
1222                                                 nameMap.erase(nmIt);
1223                                                 
1224                                                 otuData[index] = i;
1225                                                 nSeqsPerOTU[i]++;
1226                                                 aaP[i].push_back(index);
1227                                                 seqName = "";
1228                                         }
1229                                 }
1230                                 
1231                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
1232
1233                                 int index = nmIt->second;
1234                                 nameMap.erase(nmIt);
1235
1236                                 otuData[index] = i;
1237                                 nSeqsPerOTU[i]++;
1238                                 aaP[i].push_back(index);        
1239                                 
1240                                 otuString.get();
1241                         }
1242                         
1243                         sort(aaP[i].begin(), aaP[i].end());
1244                         for(int j=0;j<nSeqsPerOTU[i];j++){
1245                                 seqNumber.push_back(aaP[i][j]);
1246                         }
1247                         for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
1248                                 aaP[i].push_back(0);
1249                         }
1250                 }
1251                 
1252                 for(int i=1;i<numOTUs;i++){
1253                         cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
1254                 }
1255                 aaI = aaP;
1256                 seqIndex = seqNumber;
1257                 
1258                 listFile.close();       
1259         }
1260         catch(exception& e) {
1261                 m->errorOut(e, "ShhherCommand", "getOTUData");
1262                 exit(1);        
1263         }               
1264 }
1265
1266 /**************************************************************************************************/
1267
1268 void ShhherCommand::initPyroCluster(){                          
1269         try{
1270                 dist.assign(numSeqs * numOTUs, 0);
1271                 change.assign(numOTUs, 1);
1272                 centroids.assign(numOTUs, -1);
1273                 weight.assign(numOTUs, 0);
1274                 singleTau.assign(numSeqs, 1.0);
1275                 
1276                 nSeqsBreaks.assign(processors+1, 0);
1277                 nOTUsBreaks.assign(processors+1, 0);
1278
1279                 nSeqsBreaks[0] = 0;
1280                 for(int i=0;i<processors;i++){
1281                         nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
1282                         nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
1283                 }
1284                 nSeqsBreaks[processors] = numSeqs;
1285                 nOTUsBreaks[processors] = numOTUs;
1286         }
1287         catch(exception& e) {
1288                 m->errorOut(e, "ShhherCommand", "initPyroCluster");
1289                 exit(1);        
1290         }
1291 }
1292
1293 /**************************************************************************************************/
1294
1295 void ShhherCommand::fill(){
1296         try {
1297                 int index = 0;
1298                 for(int i=0;i<numOTUs;i++){
1299                         cumNumSeqs[i] = index;
1300                         for(int j=0;j<nSeqsPerOTU[i];j++){
1301                                 seqNumber[index] = aaP[i][j];
1302                                 seqIndex[index] = aaI[i][j];
1303                                 
1304                                 index++;
1305                         }
1306                 }
1307         }
1308         catch(exception& e) {
1309                 m->errorOut(e, "ShhherCommand", "fill");
1310                 exit(1);        
1311         }               
1312 }
1313
1314 /**************************************************************************************************/
1315
1316 void ShhherCommand::calcCentroids(){                          
1317         try{
1318                 
1319 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1320
1321                 if(processors == 1)     {
1322                         calcCentroidsDriver(0, numOTUs);                
1323                 }
1324                 else{ //you have multiple processors
1325                         if (numOTUs < processors){      processors = 1; }
1326                         
1327                         int process = 1;
1328                         vector<int> processIDs;
1329                         
1330                         //loop through and create all the processes you want
1331                         while (process != processors) {
1332                                 int pid = vfork();
1333                                 
1334                                 if (pid > 0) {
1335                                         processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1336                                         process++;
1337                                 }else if (pid == 0){
1338                                         calcCentroidsDriver(nOTUsBreaks[process], nOTUsBreaks[process+1]);
1339                                         exit(0);
1340                                 }else { 
1341                                         m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
1342                                         perror(" : ");
1343                                         for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
1344                                         exit(0);
1345                                 }
1346                         }
1347                         
1348                         //parent does its part
1349                         calcCentroidsDriver(nOTUsBreaks[0], nOTUsBreaks[1]);
1350
1351                         //force parent to wait until all the processes are done
1352                         for (int i=0;i<processIDs.size();i++) { 
1353                                 int temp = processIDs[i];
1354                                 wait(&temp);
1355                         }
1356                 }
1357                 
1358 #else
1359                 calcCentroidsDriver(0, numOTUs);
1360 #endif          
1361         }
1362         catch(exception& e) {
1363                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1364                 exit(1);        
1365         }               
1366 }
1367
1368 /**************************************************************************************************/
1369
1370 void ShhherCommand::calcCentroidsDriver(int start, int finish){                          
1371         
1372         //this function gets the most likely homopolymer length at a flow position for a group of sequences
1373         //within an otu
1374         
1375         try{
1376                 
1377                 for(int i=start;i<finish;i++){
1378                         
1379                         double count = 0;
1380                         int position = 0;
1381                         int minFlowGram = 100000000;
1382                         double minFlowValue = 1e8;
1383                         change[i] = 0; //FALSE
1384                         
1385                         for(int j=0;j<nSeqsPerOTU[i];j++){
1386                                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1387                         }
1388                         
1389                         if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1390                                 vector<double> adF(nSeqsPerOTU[i]);
1391                                 vector<int> anL(nSeqsPerOTU[i]);
1392                                 
1393                                 for(int j=0;j<nSeqsPerOTU[i];j++){
1394                                         int index = cumNumSeqs[i] + j;
1395                                         int nI = seqIndex[index];
1396                                         int nIU = mapSeqToUnique[nI];
1397                                         
1398                                         int k;
1399                                         for(k=0;k<position;k++){
1400                                                 if(nIU == anL[k]){
1401                                                         break;
1402                                                 }
1403                                         }
1404                                         if(k == position){
1405                                                 anL[position] = nIU;
1406                                                 adF[position] = 0.0000;
1407                                                 position++;
1408                                         }                                               
1409                                 }
1410                                 
1411                                 for(int j=0;j<nSeqsPerOTU[i];j++){
1412                                         int index = cumNumSeqs[i] + j;
1413                                         int nI = seqIndex[index];
1414                                         int nIU = mapSeqToUnique[nI];
1415                                         
1416                                         double tauValue = singleTau[seqNumber[index]];
1417                                         
1418                                         for(int k=0;k<position;k++){
1419                                                 double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1420                                                 adF[k] += dist * tauValue;
1421                                         }
1422                                 }
1423                                 
1424                                 for(int j=0;j<position;j++){
1425                                         if(adF[j] < minFlowValue){
1426                                                 minFlowGram = j;
1427                                                 minFlowValue = adF[j];
1428                                         }
1429                                 }
1430                                 
1431                                 if(centroids[i] != anL[minFlowGram]){
1432                                         change[i] = 1;
1433                                         centroids[i] = anL[minFlowGram];
1434                                 }
1435                         }
1436                         else if(centroids[i] != -1){
1437                                 change[i] = 1;
1438                                 centroids[i] = -1;                      
1439                         }
1440                 }
1441         }
1442         catch(exception& e) {
1443                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1444                 exit(1);        
1445         }               
1446 }
1447
1448 /**************************************************************************************************/
1449
1450 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1451         try{
1452                 
1453                 int flowAValue = cent * numFlowCells;
1454                 int flowBValue = flow * numFlowCells;
1455                 
1456                 double dist = 0;
1457                 
1458                 for(int i=0;i<length;i++){
1459                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1460                         flowAValue++;
1461                         flowBValue++;
1462                 }
1463                 
1464                 return dist / (double)length;
1465         }
1466         catch(exception& e) {
1467                 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1468                 exit(1);        
1469         }               
1470 }
1471
1472 /**************************************************************************************************/
1473
1474 double ShhherCommand::getNewWeights(){
1475         try{
1476                 
1477                 double maxChange = 0;
1478                 
1479                 for(int i=0;i<numOTUs;i++){
1480                         
1481                         double difference = weight[i];
1482                         weight[i] = 0;
1483                         
1484                         for(int j=0;j<nSeqsPerOTU[i];j++){
1485                                 int index = cumNumSeqs[i] + j;
1486                                 int nI = seqIndex[index];
1487                                 double tauValue = singleTau[seqNumber[index]];
1488                                 weight[i] += tauValue;
1489                         }
1490                         
1491                         difference = fabs(weight[i] - difference);
1492                         if(difference > maxChange){     maxChange = difference; }
1493                 }
1494                 return maxChange;
1495         }
1496         catch(exception& e) {
1497                 m->errorOut(e, "ShhherCommand", "getNewWeights");
1498                 exit(1);        
1499         }               
1500 }
1501
1502 /**************************************************************************************************/
1503
1504 double ShhherCommand::getLikelihood(){
1505         
1506         try{
1507                 
1508                 vector<double> P(numSeqs, 0);
1509                 int effNumOTUs = 0;
1510                 
1511                 for(int i=0;i<numOTUs;i++){
1512                         if(weight[i] > MIN_WEIGHT){
1513                                 effNumOTUs++;
1514                         }
1515                 }
1516                 
1517                 for(int i=0;i<numOTUs;i++){
1518                         for(int j=0;j<nSeqsPerOTU[i];j++){
1519                                 int index = cumNumSeqs[i] + j;
1520                                 int nI = seqIndex[index];
1521                                 double singleDist = dist[seqNumber[index]];
1522                                 
1523                                 P[nI] += weight[i] * exp(-singleDist * sigma);
1524                         }
1525                 }
1526                 
1527                 double nLL = 0.00;
1528                 for(int i=0;i<numSeqs;i++){
1529                         nLL += -log(P[i]);
1530                 }
1531                 
1532                 nLL = nLL -(double)numSeqs * log(sigma);
1533                 
1534                 return nLL; 
1535         }
1536         catch(exception& e) {
1537                 m->errorOut(e, "ShhherCommand", "getNewWeights");
1538                 exit(1);        
1539         }               
1540 }
1541
1542 /**************************************************************************************************/
1543
1544 void ShhherCommand::checkCentroids(){
1545         try{
1546                 vector<int> unique(numOTUs, 1);
1547                 
1548                 for(int i=0;i<numOTUs;i++){
1549                         if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1550                                 unique[i] = -1;
1551                         }
1552                 }
1553                 
1554                 for(int i=0;i<numOTUs;i++){
1555                         if(unique[i] == 1){
1556                                 for(int j=i+1;j<numOTUs;j++){
1557                                         if(unique[j] == 1){
1558                                                 
1559                                                 if(centroids[j] == centroids[i]){
1560                                                         unique[j] = 0;
1561                                                         centroids[j] = -1;
1562                                                         
1563                                                         weight[i] += weight[j];
1564                                                         weight[j] = 0.0;
1565                                                 }
1566                                         }
1567                                 }
1568                         }
1569                 }
1570         }
1571         catch(exception& e) {
1572                 m->errorOut(e, "ShhherCommand", "checkCentroids");
1573                 exit(1);        
1574         }               
1575 }
1576
1577 /**************************************************************************************************/
1578
1579 void ShhherCommand::calcNewDistances(){                          
1580         try{
1581                 
1582 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1583                 
1584                 if(processors == 1)     {
1585                         calcNewDistancesParent(0, numSeqs);             
1586                 }
1587                 else{ //you have multiple processors
1588                         if (numSeqs < processors){      processors = 1; }
1589                         
1590                         vector<vector<int> > child_otuIndex(processors);
1591                         vector<vector<int> > child_seqIndex(processors);
1592                         vector<vector<double> > child_singleTau(processors);                    
1593                         vector<int> totals(processors);
1594                         
1595                         int process = 1;
1596                         vector<int> processIDs;
1597
1598                         //loop through and create all the processes you want
1599                         while (process != processors) {
1600                                 int pid = vfork();
1601                                 
1602                                 if (pid > 0) {
1603                                         processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1604                                         process++;
1605                                 }else if (pid == 0){
1606                                         calcNewDistancesChild(nSeqsBreaks[process], nSeqsBreaks[process+1], child_otuIndex[process], child_seqIndex[process], child_singleTau[process]);
1607                                         totals[process] = child_otuIndex[process].size();
1608
1609                                         exit(0);
1610                                 }else { 
1611                                         m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
1612                                         perror(" : ");
1613                                         for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
1614                                         exit(0);
1615                                 }
1616                         }
1617                         
1618                         //parent does its part
1619                         calcNewDistancesParent(nSeqsBreaks[0], nSeqsBreaks[1]);
1620                         int total = seqIndex.size();
1621
1622                         //force parent to wait until all the processes are done
1623                         for (int i=0;i<processIDs.size();i++) { 
1624                                 int temp = processIDs[i];
1625                                 wait(&temp);
1626                         }
1627
1628                         for(int i=1;i<processors;i++){
1629                                 int oldTotal = total;
1630                                 total += totals[i];
1631
1632                                 singleTau.resize(total, 0);
1633                                 seqIndex.resize(total, 0);
1634                                 seqNumber.resize(total, 0);
1635                                 
1636                                 int childIndex = 0;
1637                                 
1638                                 for(int j=oldTotal;j<total;j++){
1639                                         int otuI = child_otuIndex[i][childIndex];
1640                                         int seqI = child_seqIndex[i][childIndex];
1641
1642                                         singleTau[j] = child_singleTau[i][childIndex];
1643                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
1644                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
1645                                         nSeqsPerOTU[otuI]++;
1646
1647                                         childIndex++;
1648                                 }
1649                         }
1650                 }
1651 #else
1652                 calcNewDistancesParent(0, numSeqs);             
1653 #endif          
1654         }
1655         catch(exception& e) {
1656                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
1657                 exit(1);        
1658         }               
1659 }
1660
1661 /**************************************************************************************************/
1662 #ifdef USE_MPI
1663 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1664         
1665         try{
1666                 vector<double> newTau(numOTUs,0);
1667                 vector<double> norms(numSeqs, 0);
1668                 otuIndex.resize(0);
1669                 seqIndex.resize(0);
1670                 singleTau.resize(0);
1671                 
1672                 
1673                 
1674                 for(int i=startSeq;i<stopSeq;i++){
1675                         double offset = 1e8;
1676                         int indexOffset = i * numOTUs;
1677                         
1678                         for(int j=0;j<numOTUs;j++){
1679                                 
1680                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1681                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1682                                 }
1683                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1684                                         offset = dist[indexOffset + j];
1685                                 }
1686                         }
1687                         
1688                         for(int j=0;j<numOTUs;j++){
1689                                 if(weight[j] > MIN_WEIGHT){
1690                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1691                                         norms[i] += newTau[j];
1692                                 }
1693                                 else{
1694                                         newTau[j] = 0.0;
1695                                 }
1696                         }
1697                         
1698                         for(int j=0;j<numOTUs;j++){
1699
1700                                 newTau[j] /= norms[i];
1701                                 
1702                                 if(newTau[j] > MIN_TAU){
1703                                         otuIndex.push_back(j);
1704                                         seqIndex.push_back(i);
1705                                         singleTau.push_back(newTau[j]);
1706                                 }
1707                         }
1708                         
1709                 }
1710         }
1711         catch(exception& e) {
1712                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1713                 exit(1);        
1714         }               
1715 }
1716 #endif
1717 /**************************************************************************************************/
1718
1719 void ShhherCommand::calcNewDistancesChild(int startSeq, int stopSeq, vector<int>& child_otuIndex, vector<int>& child_seqIndex, vector<double>& child_singleTau){
1720         
1721         try{
1722                 vector<double> newTau(numOTUs,0);
1723                 vector<double> norms(numSeqs, 0);
1724                 child_otuIndex.resize(0);
1725                 child_seqIndex.resize(0);
1726                 child_singleTau.resize(0);
1727                 
1728                 for(int i=startSeq;i<stopSeq;i++){
1729                         double offset = 1e8;
1730                         int indexOffset = i * numOTUs;
1731                         
1732                         
1733                         for(int j=0;j<numOTUs;j++){
1734                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1735                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1736                                 }
1737                                 
1738                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1739                                         offset = dist[indexOffset + j];
1740                                 }
1741                         }
1742                         
1743                         for(int j=0;j<numOTUs;j++){
1744                                 if(weight[j] > MIN_WEIGHT){
1745                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1746                                         norms[i] += newTau[j];
1747                                 }
1748                                 else{
1749                                         newTau[j] = 0.0;
1750                                 }
1751                         }
1752                         
1753                         for(int j=0;j<numOTUs;j++){
1754                                 newTau[j] /= norms[i];
1755                                 
1756                                 if(newTau[j] > MIN_TAU){
1757                                         child_otuIndex.push_back(j);
1758                                         child_seqIndex.push_back(i);
1759                                         child_singleTau.push_back(newTau[j]);
1760                                 }
1761                         }
1762                 }
1763         }
1764         catch(exception& e) {
1765                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChild");
1766                 exit(1);        
1767         }               
1768 }
1769
1770 /**************************************************************************************************/
1771
1772 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1773         
1774         try{
1775                 
1776                 int total = 0;
1777                 vector<double> newTau(numOTUs,0);
1778                 vector<double> norms(numSeqs, 0);
1779                 nSeqsPerOTU.assign(numOTUs, 0);
1780                 
1781                 for(int i=startSeq;i<stopSeq;i++){
1782                         int indexOffset = i * numOTUs;
1783                         
1784                         double offset = 1e8;
1785                         
1786                         for(int j=0;j<numOTUs;j++){
1787                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1788                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1789                                 }
1790                                 
1791                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1792                                         offset = dist[indexOffset + j];
1793                                 }
1794                         }
1795                         
1796                         for(int j=0;j<numOTUs;j++){
1797                                 if(weight[j] > MIN_WEIGHT){
1798                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1799                                         norms[i] += newTau[j];
1800                                 }
1801                                 else{
1802                                         newTau[j] = 0.0;
1803                                 }
1804                         }
1805                         
1806                         for(int j=0;j<numOTUs;j++){
1807                                 newTau[j] /= norms[i];
1808                         }
1809                         
1810                         for(int j=0;j<numOTUs;j++){
1811                                 if(newTau[j] > MIN_TAU){
1812                                         
1813                                         int oldTotal = total;
1814                                         
1815                                         total++;
1816                                         
1817                                         singleTau.resize(total, 0);
1818                                         seqNumber.resize(total, 0);
1819                                         seqIndex.resize(total, 0);
1820                                         
1821                                         singleTau[oldTotal] = newTau[j];
1822                                         
1823                                         aaP[j][nSeqsPerOTU[j]] = oldTotal;
1824                                         aaI[j][nSeqsPerOTU[j]] = i;
1825                                         nSeqsPerOTU[j]++;
1826                                 }
1827                         }
1828                 }
1829         }
1830         catch(exception& e) {
1831                 m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1832                 exit(1);        
1833         }               
1834 }
1835
1836 /**************************************************************************************************/
1837
1838 void ShhherCommand::setOTUs(){
1839         
1840         try {
1841                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1842                 
1843                 for(int i=0;i<numOTUs;i++){
1844                         for(int j=0;j<nSeqsPerOTU[i];j++){
1845                                 int index = cumNumSeqs[i] + j;
1846                                 double tauValue = singleTau[seqNumber[index]];
1847                                 int sIndex = seqIndex[index];
1848                                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
1849                         }
1850                 }
1851                 
1852                 for(int i=0;i<numSeqs;i++){
1853                         double maxTau = -1.0000;
1854                         int maxOTU = -1;
1855                         for(int j=0;j<numOTUs;j++){
1856                                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1857                                         maxTau = bigTauMatrix[i * numOTUs + j];
1858                                         maxOTU = j;
1859                                 }
1860                         }
1861                         
1862                         otuData[i] = maxOTU;
1863                 }
1864                 
1865                 nSeqsPerOTU.assign(numOTUs, 0);         
1866                 
1867                 for(int i=0;i<numSeqs;i++){
1868                         int index = otuData[i];
1869                         
1870                         singleTau[i] = 1.0000;
1871                         dist[i] = 0.0000;
1872                         
1873                         aaP[index][nSeqsPerOTU[index]] = i;
1874                         aaI[index][nSeqsPerOTU[index]] = i;
1875                         
1876                         nSeqsPerOTU[index]++;
1877                 }
1878                 fill(); 
1879         }
1880         catch(exception& e) {
1881                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
1882                 exit(1);        
1883         }               
1884 }
1885
1886 /**************************************************************************************************/
1887
1888 void ShhherCommand::writeQualities(vector<int> otuCounts){
1889         
1890         try {
1891                 string qualityFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.qual";
1892
1893                 ofstream qualityFile;
1894                 m->openOutputFile(qualityFileName, qualityFile);
1895
1896                 qualityFile.setf(ios::fixed, ios::floatfield);
1897                 qualityFile.setf(ios::showpoint);
1898                 qualityFile << setprecision(6);
1899                 
1900                 vector<vector<int> > qualities(numOTUs);
1901                 vector<double> pr(HOMOPS, 0);
1902                 
1903                 int index = 0;
1904                 
1905                 for(int i=0;i<numOTUs;i++){
1906                         int index = 0;
1907                         int base = 0;
1908                         
1909                         if(nSeqsPerOTU[i] > 0){
1910                                 qualities[i].assign(1024, -1);
1911                                 
1912                                 while(index < numFlowCells){
1913                                         double maxPrValue = 1e8;
1914                                         short maxPrIndex = -1;
1915                                         double count = 0.0000;
1916                                         
1917                                         pr.assign(HOMOPS, 0);
1918                                         
1919                                         for(int j=0;j<nSeqsPerOTU[i];j++){
1920                                                 int lIndex = cumNumSeqs[i] + j;
1921                                                 double tauValue = singleTau[seqNumber[lIndex]];
1922                                                 int sequenceIndex = aaI[i][j];
1923                                                 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1924                                                 
1925                                                 count += tauValue;
1926                                                 
1927                                                 for(int s=0;s<HOMOPS;s++){
1928                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1929                                                 }
1930                                         }
1931                                         
1932                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1933                                         maxPrValue = pr[maxPrIndex];
1934                                         
1935                                         if(count > MIN_COUNT){
1936                                                 double U = 0.0000;
1937                                                 double norm = 0.0000;
1938                                                 
1939                                                 for(int s=0;s<HOMOPS;s++){
1940                                                         norm += exp(-(pr[s] - maxPrValue));
1941                                                 }
1942                                                 
1943                                                 for(int s=1;s<=maxPrIndex;s++){
1944                                                         int value = 0;
1945                                                         double temp = 0.0000;
1946                                                         
1947                                                         U += exp(-(pr[s-1]-maxPrValue))/norm;
1948                                                         
1949                                                         if(U>0.00){
1950                                                                 temp = log10(U);
1951                                                         }
1952                                                         else{
1953                                                                 temp = -10.1;
1954                                                         }
1955                                                         temp = floor(-10 * temp);
1956                                                         value = (int)floor(temp);
1957                                                         if(value > 100){        value = 100;    }
1958                                                         
1959                                                         qualities[i][base] = (int)value;
1960                                                         base++;
1961                                                 }
1962                                         }
1963                                         
1964                                         index++;
1965                                 }
1966                         }
1967                         
1968                         
1969                         if(otuCounts[i] > 0){
1970                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1971                                 
1972                                 int j=4;        //need to get past the first four bases
1973                                 while(qualities[i][j] != -1){
1974                                         qualityFile << qualities[i][j] << ' ';
1975                                         j++;
1976                                 }
1977                                 qualityFile << endl;
1978                         }
1979                 }
1980                 qualityFile.close();
1981                 
1982         }
1983         catch(exception& e) {
1984                 m->errorOut(e, "ShhherCommand", "writeQualities");
1985                 exit(1);        
1986         }               
1987 }
1988
1989 /**************************************************************************************************/
1990
1991 void ShhherCommand::writeSequences(vector<int> otuCounts){
1992         try {
1993                 
1994                 string bases = "TACG";
1995                 
1996                 string fastaFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.fasta";
1997                 ofstream fastaFile;
1998                 m->openOutputFile(fastaFileName, fastaFile);
1999                 
2000                 vector<string> names(numOTUs, "");
2001                 
2002                 for(int i=0;i<numOTUs;i++){
2003                         int index = centroids[i];
2004                         
2005                         if(otuCounts[i] > 0){
2006                                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
2007                                 
2008                                 for(int j=8;j<numFlowCells;j++){
2009                                         
2010                                         char base = bases[j % 4];
2011                                         for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
2012                                                 fastaFile << base;
2013                                         }
2014                                 }
2015                                 fastaFile << endl;
2016                         }
2017                 }
2018                 fastaFile.close();
2019         }
2020         catch(exception& e) {
2021                 m->errorOut(e, "ShhherCommand", "writeSequences");
2022                 exit(1);        
2023         }               
2024 }
2025
2026 /**************************************************************************************************/
2027
2028 void ShhherCommand::writeNames(vector<int> otuCounts){
2029         try {
2030                 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.final.names";
2031                 ofstream nameFile;
2032                 m->openOutputFile(nameFileName, nameFile);
2033                 
2034                 for(int i=0;i<numOTUs;i++){
2035                         if(otuCounts[i] > 0){
2036                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
2037                                 
2038                                 for(int j=1;j<nSeqsPerOTU[i];j++){
2039                                         nameFile << ',' << seqNameVector[aaI[i][j]];
2040                                 }
2041                                 
2042                                 nameFile << endl;
2043                         }
2044                 }
2045                 nameFile.close();
2046         }
2047         catch(exception& e) {
2048                 m->errorOut(e, "ShhherCommand", "writeNames");
2049                 exit(1);        
2050         }               
2051 }
2052
2053 /**************************************************************************************************/
2054
2055 void ShhherCommand::writeGroups(){
2056         try {
2057                 string fileRoot = flowFileName.substr(0,flowFileName.find_last_of('.'));
2058                 string groupFileName = fileRoot + ".pn.groups";
2059                 ofstream groupFile;
2060                 m->openOutputFile(groupFileName, groupFile);
2061                 
2062                 for(int i=0;i<numSeqs;i++){
2063                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
2064                 }
2065                 groupFile.close();
2066         }
2067         catch(exception& e) {
2068                 m->errorOut(e, "ShhherCommand", "writeGroups");
2069                 exit(1);        
2070         }               
2071 }
2072
2073 /**************************************************************************************************/
2074
2075 void ShhherCommand::writeClusters(vector<int> otuCounts){
2076         try {
2077                 string otuCountsFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.counts";
2078                 ofstream otuCountsFile;
2079                 m->openOutputFile(otuCountsFileName, otuCountsFile);
2080                 
2081                 string bases = "TACG";
2082                 
2083                 for(int i=0;i<numOTUs;i++){
2084                         //output the translated version of the centroid sequence for the otu
2085                         if(otuCounts[i] > 0){
2086                                 int index = centroids[i];
2087                                 
2088                                 otuCountsFile << "ideal\t";
2089                                 for(int j=8;j<numFlowCells;j++){
2090                                         char base = bases[j % 4];
2091                                         for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
2092                                                 otuCountsFile << base;
2093                                         }
2094                                 }
2095                                 otuCountsFile << endl;
2096                                 
2097                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2098                                         int sequence = aaI[i][j];
2099                                         otuCountsFile << seqNameVector[sequence] << '\t';
2100                                         
2101                                         for(int k=8;k<lengths[sequence];k++){
2102                                                 char base = bases[k % 4];
2103                                                 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
2104                                                 
2105                                                 for(int s=0;s<freq;s++){
2106                                                         otuCountsFile << base;
2107                                                 }
2108                                         }
2109                                         otuCountsFile << endl;
2110                                 }
2111                                 otuCountsFile << endl;
2112                         }
2113                 }
2114                 otuCountsFile.close();
2115         }
2116         catch(exception& e) {
2117                 m->errorOut(e, "ShhherCommand", "writeClusters");
2118                 exit(1);        
2119         }               
2120 }
2121
2122 //**********************************************************************************************************************