]> git.donarmstrong.com Git - mothur.git/blob - shhhercommand.cpp
modified shhhercommand to make buildable with windows
[mothur.git] / shhhercommand.cpp
1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "shhhercommand.h"
11
12 #include "readcolumn.h"
13 #include "readmatrix.hpp"
14 #include "rabundvector.hpp"
15 #include "sabundvector.hpp"
16 #include "listvector.hpp"
17 #include "cluster.hpp"
18 #include "sparsematrix.hpp"
19 #include <cfloat>
20
21 //**********************************************************************************************************************
22
23 #define NUMBINS 1000
24 #define HOMOPS 10
25 #define MIN_COUNT 0.1
26 #define MIN_WEIGHT 0.1
27 #define MIN_TAU 0.0001
28 #define MIN_ITER 10
29
30 //**********************************************************************************************************************
31
32 vector<string> ShhherCommand::getValidParameters(){     
33         try {
34                 string Array[] =  {     
35                         "file", "flow", "lookup", "cutoff", "sigma", "outputdir","inputdir", "processors"       
36                 };
37                 
38                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
39                 return myArray;
40         }
41         catch(exception& e) {
42                 m->errorOut(e, "ShhherCommand", "getValidParameters");
43                 exit(1);
44         }
45 }
46
47 //**********************************************************************************************************************
48
49 ShhherCommand::ShhherCommand(){ 
50         try {
51                 abort = true;
52                 
53                 //initialize outputTypes
54                 vector<string> tempOutNames;
55                 outputTypes["pn.dist"] = tempOutNames;
56
57         }
58         catch(exception& e) {
59                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
60                 exit(1);
61         }
62 }
63
64 //**********************************************************************************************************************
65
66 vector<string> ShhherCommand::getRequiredParameters(){  
67         try {
68                 string Array[] =  {"flow"};
69                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
70                 return myArray;
71         }
72         catch(exception& e) {
73                 m->errorOut(e, "ShhherCommand", "getRequiredParameters");
74                 exit(1);
75         }
76 }
77
78 //**********************************************************************************************************************
79
80 vector<string> ShhherCommand::getRequiredFiles(){       
81         try {
82                 vector<string> myArray;
83                 return myArray;
84         }
85         catch(exception& e) {
86                 m->errorOut(e, "ShhherCommand", "getRequiredFiles");
87                 exit(1);
88         }
89 }
90
91 //**********************************************************************************************************************
92
93 ShhherCommand::ShhherCommand(string option) {
94         try {
95
96 #ifdef USE_MPI
97                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
98                 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
99
100                 if(pid == 0){
101 #endif
102                 
103                 
104                 abort = false;
105                 
106                 
107                 //allow user to run help
108                 if(option == "help") { help(); abort = true; }
109                 
110                 else {
111                         
112                         //valid paramters for this command
113                         string AlignArray[] =  {
114                                 "file", "flow", "lookup", "cutoff", "sigma", "outputdir","inputdir", "processors"       
115                         };
116                         
117                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
118                         
119                         OptionParser parser(option);
120                         map<string,string> parameters = parser.getParameters();
121                         
122                         ValidParameters validParameter;
123                         map<string,string>::iterator it;
124                         
125                         //check to make sure all parameters are valid for command
126                         for (it = parameters.begin(); it != parameters.end(); it++) { 
127                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
128                         }
129                         
130                         //initialize outputTypes
131                         vector<string> tempOutNames;
132                         outputTypes["pn.dist"] = tempOutNames;
133                         //                      outputTypes["fasta"] = tempOutNames;
134                         
135                         //if the user changes the input directory command factory will send this info to us in the output parameter 
136                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
137                         if (inputDir == "not found"){   inputDir = "";          }
138                         else {
139                                 string path;
140                                 it = parameters.find("flow");
141                                 //user has given a template file
142                                 if(it != parameters.end()){ 
143                                         path = m->hasPath(it->second);
144                                         //if the user has not given a path then, add inputdir. else leave path alone.
145                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
146                                 }
147                                 
148                                 it = parameters.find("lookup");
149                                 //user has given a template file
150                                 if(it != parameters.end()){ 
151                                         path = m->hasPath(it->second);
152                                         //if the user has not given a path then, add inputdir. else leave path alone.
153                                         if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
154                                 }
155
156                                 it = parameters.find("file");
157                                 //user has given a template file
158                                 if(it != parameters.end()){ 
159                                         path = m->hasPath(it->second);
160                                         //if the user has not given a path then, add inputdir. else leave path alone.
161                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
162                                 }
163                         }
164                         
165                         
166                         //check for required parameters
167                         flowFileName = validParameter.validFile(parameters, "flow", true);
168                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
169                         if (flowFileName == "not found" && flowFilesFileName == "not found") {
170                                 m->mothurOut("values for either flow or file must be provided for the shhh.seqs command.");
171                                 m->mothurOutEndLine();
172                                 abort = true; 
173                         }
174                         else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }       
175                         
176                         //if the user changes the output directory command factory will send this info to us in the output parameter 
177                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
178                                 outputDir = ""; 
179                                 outputDir += m->hasPath(flowFileName); //if user entered a file with a path then preserve it    
180                         }
181                         
182                         
183                         //check for optional parameter and set defaults
184                         // ...at some point should added some additional type checking...
185                         string temp;
186                         temp = validParameter.validFile(parameters, "lookup", true);
187                         if (temp == "not found")        {       lookupFileName = "LookUp_Titanium.pat"; }
188                         else if(temp == "not open")     {       abort = true;                   } 
189                         else                                            {       lookupFileName = temp;  }
190                         
191                         temp = validParameter.validFile(parameters, "processors", false);if (temp == "not found"){      temp = "1";                     }
192                         convert(temp, processors); 
193
194                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
195                         convert(temp, cutoff); 
196                         
197                         temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
198                         convert(temp, minDelta); 
199
200                         temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
201                         convert(temp, maxIters); 
202
203                         temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
204                         convert(temp, sigma); 
205                         
206                         globaldata = GlobalData::getInstance();
207                 }
208                         
209 #ifdef USE_MPI
210                 }                               
211 #endif
212                                 
213         }
214         catch(exception& e) {
215                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
216                 exit(1);
217         }
218 }
219
220 //**********************************************************************************************************************
221
222 ShhherCommand::~ShhherCommand(){}
223
224 //**********************************************************************************************************************
225
226 void ShhherCommand::help(){
227         try {
228                 m->mothurOut("The shhher command reads a file containing flowgrams and creates a file of corrected sequences.\n");
229         }
230         catch(exception& e) {
231                 m->errorOut(e, "ShhherCommand", "help");
232                 exit(1);
233         }
234 }
235
236 //**********************************************************************************************************************
237 #ifdef USE_MPI
238 int ShhherCommand::execute(){
239         try {
240                 int tag = 1976;
241                 MPI_Status status; 
242
243                 double begClock = clock();
244                 unsigned long int begTime = time(NULL);
245
246                 cout.setf(ios::fixed, ios::floatfield);
247                 cout.setf(ios::showpoint);
248                 cout << setprecision(2);
249                 
250                 
251                 if(pid == 0){
252
253                         for(int i=1;i<ncpus;i++){
254                                 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
255                         }
256                         if(abort == 1){ return 0;       }
257
258                         processors = ncpus;
259                         
260                         cout << "\nGetting preliminary data..." << endl;
261                         getSingleLookUp();
262                         getJointLookUp();
263                         
264                         vector<string> flowFileVector;
265                         if(flowFilesFileName != "not found"){
266                                 string fName;
267
268                                 ifstream flowFilesFile;
269                                 m->openInputFile(flowFilesFileName, flowFilesFile);
270                                 while(flowFilesFile){
271                                         flowFilesFile >> fName;
272                                         flowFileVector.push_back(fName);
273                                         m->gobble(flowFilesFile);
274                                 }
275                         }
276                         else{
277                                 flowFileVector.push_back(flowFileName);
278                         }
279                         int numFiles = flowFileVector.size();
280
281                         for(int i=1;i<ncpus;i++){
282                                 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
283                         }
284                         
285                         for(int i=0;i<numFiles;i++){
286                                 flowFileName = flowFileVector[i];
287                         
288                                 cout << "\n>>>>>\tProcessing " << flowFileName << " (file " << i+1 << " of " << numFiles << ")\t<<<<<" << endl;
289                                 cout << "Reading flowgrams..." << endl;
290                                 getFlowData();
291                                 cout << "Identifying unique flowgrams..." << endl;
292                                 getUniques();
293
294                                 cout << "Calculating distances between flowgrams..." << endl;
295                                 char fileName[1024];
296                                 strcpy(fileName, flowFileName.c_str());
297
298                                 for(int i=1;i<ncpus;i++){
299                                         MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
300
301                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
302                                         MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
303                                         MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
304                                         MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
305                                         MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
306                                         MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
307                                         MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
308                                         MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
309                                         MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
310                                         MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
311                                 }                               
312                                                         
313                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
314                                 
315                                 int done;
316                                 for(int i=1;i<ncpus;i++){
317                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
318                                         
319                                         m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
320                                         remove((distFileName + ".temp." + toString(i)).c_str());
321                                 }
322
323                                 string namesFileName = createNamesFile();
324                                 
325                                 cout << "\nClustering flowgrams..." << endl;
326                                 string listFileName = cluster(distFileName, namesFileName);
327         //                      string listFileName = "PriestPot_C7.pn.list";
328         //                      string listFileName = "test.mock_rep3.v69.pn.list";
329                         
330                                 getOTUData(listFileName);
331                                 initPyroCluster();
332
333                                 for(int i=1;i<ncpus;i++){
334                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
335                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
336                                         MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
337                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
338                                 }
339                                 
340                                 
341                                 double maxDelta = 0;
342                                 int iter = 0;
343                                 
344                                 int numOTUsOnCPU = numOTUs / ncpus;
345                                 int numSeqsOnCPU = numSeqs / ncpus;
346                                 
347                                 cout << "\nDenoising flowgrams..." << endl;
348                                 cout << "iter\tmaxDelta\tnLL\t\tcycletime" << endl;
349                                 
350                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
351
352                                         double cycClock = clock();
353                                         unsigned long int cycTime = time(NULL);
354                                         fill();
355
356                                         int total = singleTau.size();
357                                         for(int i=1;i<ncpus;i++){
358                                                 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
359                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
360                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
361                                                 
362                                                 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
363                                                 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
364                                                 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
365                                                 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
366                                                 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
367                                         }
368                                 
369                                         calcCentroidsDriver(0, numOTUsOnCPU);
370                                         
371                                         for(int i=1;i<ncpus;i++){
372                                                 int otuStart = i * numOTUs / ncpus;
373                                                 int otuStop = (i + 1) * numOTUs / ncpus;
374                                                 
375                                                 vector<int> tempCentroids(numOTUs, 0);
376                                                 vector<short> tempChange(numOTUs, 0);
377                                                 
378                                                 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
379                                                 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
380                                                 
381                                                 for(int j=otuStart;j<otuStop;j++){
382                                                         centroids[j] = tempCentroids[j];
383                                                         change[j] = tempChange[j];
384                                                 }
385                                         }
386                                                                         
387                                         maxDelta = getNewWeights();
388                                         double nLL = getLikelihood();
389                                         checkCentroids();
390                                         
391                                         for(int i=1;i<ncpus;i++){
392                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
393                                                 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
394                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
395                                         }
396                                         
397                                         calcNewDistancesParent(0, numSeqsOnCPU);
398
399                                         total = singleTau.size();
400
401                                         for(int i=1;i<ncpus;i++){
402                                                 int childTotal;
403                                                 int seqStart = i * numSeqs / ncpus;
404                                                 int seqStop = (i + 1) * numSeqs / ncpus;
405                                                 
406                                                 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
407
408                                                 vector<int> childSeqIndex(childTotal, 0);
409                                                 vector<double> childSingleTau(childTotal, 0);
410                                                 vector<double> childDist(numSeqs * numOTUs, 0);
411                                                 vector<int> otuIndex(childTotal, 0);
412                                                 
413                                                 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
414                                                 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
415                                                 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
416                                                 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
417
418                                                 int oldTotal = total;
419                                                 total += childTotal;
420                                                 singleTau.resize(total, 0);
421                                                 seqIndex.resize(total, 0);
422                                                 seqNumber.resize(total, 0);
423                                                 
424                                                 int childIndex = 0;
425                                                 
426                                                 for(int j=oldTotal;j<total;j++){
427                                                         int otuI = otuIndex[childIndex];
428                                                         int seqI = childSeqIndex[childIndex];
429                                                         
430                                                         singleTau[j] = childSingleTau[childIndex];
431                                                         
432                                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
433                                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
434                                                         nSeqsPerOTU[otuI]++;
435                                                         childIndex++;
436                                                 }
437                                                 
438                                                 int index = seqStart * numOTUs;
439                                                 for(int j=seqStart;j<seqStop;j++){
440                                                         for(int k=0;k<numOTUs;k++){
441                                                                 dist[index] = childDist[index];
442                                                                 index++;
443                                                         }
444                                                 }                                       
445                                         }
446                                         
447                                         iter++;
448                                         
449                                         cout << iter << '\t' << maxDelta << '\t' << setprecision(2) << nLL << '\t' << time(NULL) - cycTime << '\t' << setprecision(6) << (clock() - cycClock)/(double)CLOCKS_PER_SEC << endl;                   
450
451                                         if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
452                                                 int live = 1;
453                                                 for(int i=1;i<ncpus;i++){
454                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
455                                                 }
456                                         }
457                                         else{
458                                                 int live = 0;
459                                                 for(int i=1;i<ncpus;i++){
460                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
461                                                 }
462                                         }
463                                         
464                                 }       
465                                 
466                                 cout << "\nFinalizing..." << endl;
467                                 fill();
468                                 setOTUs();
469                                 vector<int> otuCounts(numOTUs, 0);
470                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
471                                 calcCentroidsDriver(0, numOTUs);
472                                 writeQualities(otuCounts);
473                                 writeSequences(otuCounts);
474                                 writeNames(otuCounts);
475                                 writeClusters(otuCounts);
476                                 writeGroups();
477                                 
478                                 remove(distFileName.c_str());
479                                 remove(namesFileName.c_str());
480                                 remove(listFileName.c_str());
481                                 
482                                 cout << "Total time to process " << flowFileName << ":\t" << time(NULL) - begTime << '\t' << setprecision(6) << (clock() - begClock)/(double)CLOCKS_PER_SEC << endl;                    
483                         }
484
485                 }
486                 else{
487                         int abort = 1;
488                         bool live = 1;
489
490                         MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
491                         if(abort){      return 0;       }
492
493                         int numFiles;
494                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
495
496                         for(int i=0;i<numFiles;i++){
497                                 //Now into the pyrodist part
498                                 char fileName[1024];
499                                 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
500                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
501                                 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
502                                 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
503                                 
504                                 flowDataIntI.resize(numSeqs * numFlowCells);
505                                 flowDataPrI.resize(numSeqs * numFlowCells);
506                                 mapUniqueToSeq.resize(numSeqs);
507                                 mapSeqToUnique.resize(numSeqs);
508                                 lengths.resize(numSeqs);
509                                 jointLookUp.resize(NUMBINS * NUMBINS);
510
511                                 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
512                                 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
513                                 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
514                                 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
515                                 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
516                                 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
517                                 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
518
519                                 flowFileName = string(fileName);
520                                 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
521                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
522                                 
523                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
524
525                                 int done = 1;
526                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
527
528                                 //Now into the pyronoise part
529                                 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
530                                 
531                                 singleLookUp.resize(HOMOPS * NUMBINS);
532                                 uniqueFlowgrams.resize(numUniques * numFlowCells);
533                                 weight.resize(numOTUs);
534                                 centroids.resize(numOTUs);
535                                 change.resize(numOTUs);
536                                 dist.assign(numOTUs * numSeqs, 0);
537                                 nSeqsPerOTU.resize(numOTUs);
538                                 cumNumSeqs.resize(numOTUs);
539
540                                 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
541                                 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
542                                 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
543                                 
544                                 int startOTU = pid * numOTUs / ncpus;
545                                 int endOTU = (pid + 1) * numOTUs / ncpus;
546
547                                 int startSeq = pid * numSeqs / ncpus;
548                                 int endSeq = (pid + 1) * numSeqs /ncpus;
549                                 
550                                 int total;
551
552                                 while(live){
553
554                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
555                                         singleTau.assign(total, 0.0000);
556                                         seqNumber.assign(total, 0);
557                                         seqIndex.assign(total, 0);
558                                         
559                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
560                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
561                                         MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
562                                         MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
563                                         MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
564                                         MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
565                                         MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
566                                         
567                                         calcCentroidsDriver(startOTU, endOTU);
568
569                                         MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
570                                         MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
571
572                                         
573                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
574                                         MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
575                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
576
577                                         vector<int> otuIndex(total, 0);
578                                         calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
579                                         total = otuIndex.size();
580                                         
581                                         MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
582                                         MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
583                                         MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
584                                         MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
585                                         MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
586                                 
587                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
588                                 }
589                         }
590                 }               
591
592                 MPI_Barrier(MPI_COMM_WORLD);
593                 return 0;
594
595         }
596         catch(exception& e) {
597                 m->errorOut(e, "ShhherCommand", "execute");
598                 exit(1);
599         }
600 }
601
602 /**************************************************************************************************/
603
604 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
605         try{            
606                 ostringstream outStream;
607                 outStream.setf(ios::fixed, ios::floatfield);
608                 outStream.setf(ios::dec, ios::basefield);
609                 outStream.setf(ios::showpoint);
610                 outStream.precision(6);
611                 
612                 int begTime = time(NULL);
613                 double begClock = clock();
614                 
615                 for(int i=startSeq;i<stopSeq;i++){
616                         for(int j=0;j<i;j++){
617                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
618                                 
619                                 if(flowDistance < 1e-6){
620                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
621                                 }
622                                 else if(flowDistance <= cutoff){
623                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
624                                 }
625                         }
626                         if(i % 100 == 0){
627                                 cout << i << "\t" << (time(NULL) - begTime) << "\t" << (clock()-begClock)/CLOCKS_PER_SEC << endl;
628                         }
629                 }
630                 cout << stopSeq << "\t" << (time(NULL) - begTime) << "\t" << (clock()-begClock)/CLOCKS_PER_SEC << endl;
631                 
632                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.dist";
633                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
634
635                 ofstream distFile(fDistFileName.c_str());
636                 distFile << outStream.str();            
637                 distFile.close();
638                 
639                 return fDistFileName;
640         }
641         catch(exception& e) {
642                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
643                 exit(1);
644         }
645 }
646
647 #else
648 //**********************************************************************************************************************
649
650 int ShhherCommand::execute(){
651         try {
652                 if (abort == true) { return 0; }
653                 
654                 cout.setf(ios::fixed, ios::floatfield);
655                 cout.setf(ios::showpoint);
656                 
657                 getSingleLookUp();
658                 getJointLookUp();
659                                 
660                 vector<string> flowFileVector;
661                 if(flowFilesFileName != "not found"){
662                         string fName;
663                         
664                         ifstream flowFilesFile;
665                         m->openInputFile(flowFilesFileName, flowFilesFile);
666                         while(flowFilesFile){
667                                 flowFilesFile >> fName;
668                                 flowFileVector.push_back(fName);
669                                 m->gobble(flowFilesFile);
670                         }
671                 }
672                 else{
673                         flowFileVector.push_back(flowFileName);
674                 }
675                 int numFiles = flowFileVector.size();
676                 
677                 
678                 for(int i=0;i<numFiles;i++){
679                         flowFileName = flowFileVector[i];
680
681                         cout << "\n>>>>>\tProcessing " << flowFileName << " (file " << i+1 << " of " << numFiles << ")\t<<<<<" << endl;
682                         cout << "Reading flowgrams..." << endl;
683                         getFlowData();
684                         cout << "Identifying unique flowgrams..." << endl;
685                         getUniques();
686                         
687                         
688                         cout << "Calculating distances between flowgrams..." << endl;                   
689                         string distFileName = createDistFile(processors);
690                         string namesFileName = createNamesFile();
691                         
692                         cout << "\nClustering flowgrams..." << endl;
693                         string listFileName = cluster(distFileName, namesFileName);
694                         getOTUData(listFileName);
695                         
696                         initPyroCluster();
697                         
698                         double maxDelta = 0;
699                         int iter = 0;
700                         
701                         double begClock = clock();
702                         unsigned long int begTime = time(NULL);
703
704                         cout << "\nDenoising flowgrams..." << endl;
705                         cout << "iter\tmaxDelta\tnLL\t\tcycletime" << endl;
706                         
707                         while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
708                                 
709                                 double cycClock = clock();
710                                 unsigned long int cycTime = time(NULL);
711                                 fill();
712                                 
713                                 calcCentroids();
714                                 
715                                 maxDelta = getNewWeights();
716                                 double nLL = getLikelihood();
717                                 checkCentroids();
718                                 
719                                 calcNewDistances();
720
721                                 iter++;
722                                 
723                                 cout << iter << '\t' << maxDelta << '\t' << setprecision(2) << nLL << '\t' << time(NULL) - cycTime << '\t' << setprecision(6) << (clock() - cycClock)/(double)CLOCKS_PER_SEC << endl;                   
724                         }       
725                         
726                         cout << "\nFinalizing..." << endl;
727                         fill();
728                         setOTUs();
729                         
730                         vector<int> otuCounts(numOTUs, 0);
731                         for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
732                         
733                         calcCentroidsDriver(0, numOTUs);
734                         writeQualities(otuCounts);
735                         writeSequences(otuCounts);
736                         writeNames(otuCounts);
737                         writeClusters(otuCounts);
738                         writeGroups();
739                         
740                         remove(distFileName.c_str());
741                         remove(namesFileName.c_str());
742                         remove(listFileName.c_str());
743                         
744                         cout << "Total time to process " << flowFileName << ":\t" << time(NULL) - begTime << '\t' << setprecision(6) << (clock() - begClock)/(double)CLOCKS_PER_SEC << endl;                    
745                 }
746                 return 0;
747         }
748         catch(exception& e) {
749                 m->errorOut(e, "ShhherCommand", "execute");
750                 exit(1);
751         }
752 }
753 #endif
754 /**************************************************************************************************/
755
756 void ShhherCommand::getFlowData(){
757         try{
758                 ifstream flowFile;
759                 m->openInputFile(flowFileName, flowFile);
760                 
761                 string seqName;
762                 
763                 int currentNumFlowCells;
764                 
765                 float intensity;
766                 
767                 flowFile >> numFlowCells;
768                 int index = 0;//pcluster
769                 while(!flowFile.eof()){
770                         flowFile >> seqName >> currentNumFlowCells;
771                         lengths.push_back(currentNumFlowCells);
772
773                         seqNameVector.push_back(seqName);
774                         nameMap[seqName] = index++;//pcluster
775
776                         for(int i=0;i<numFlowCells;i++){
777                                 flowFile >> intensity;
778                                 if(intensity > 9.99)    {       intensity = 9.99;       }
779                                 int intI = int(100 * intensity + 0.0001);
780                                 flowDataIntI.push_back(intI);
781                         }
782                         m->gobble(flowFile);
783                 }
784                 flowFile.close();
785                 
786                 numSeqs = seqNameVector.size();         
787                 
788                 for(int i=0;i<numSeqs;i++){
789                         int iNumFlowCells = i * numFlowCells;
790                         for(int j=lengths[i];j<numFlowCells;j++){
791                                 flowDataIntI[iNumFlowCells + j] = 0;
792                         }
793                 }
794                 
795         }
796         catch(exception& e) {
797                 m->errorOut(e, "ShhherCommand", "getFlowData");
798                 exit(1);
799         }
800 }
801
802 /**************************************************************************************************/
803
804 void ShhherCommand::getSingleLookUp(){
805         try{
806                 //      these are the -log probabilities that a signal corresponds to a particular homopolymer length
807                 singleLookUp.assign(HOMOPS * NUMBINS, 0);
808                 
809                 int index = 0;
810                 ifstream lookUpFile;
811                 m->openInputFile(lookupFileName, lookUpFile);
812                 
813                 for(int i=0;i<HOMOPS;i++){
814                         float logFracFreq;
815                         lookUpFile >> logFracFreq;
816                         
817                         for(int j=0;j<NUMBINS;j++)      {
818                                 lookUpFile >> singleLookUp[index];
819                                 index++;
820                         }
821                 }       
822                 lookUpFile.close();
823         }
824         catch(exception& e) {
825                 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
826                 exit(1);
827         }
828 }
829
830 /**************************************************************************************************/
831
832 void ShhherCommand::getJointLookUp(){
833         try{
834                 
835                 //      the most likely joint probability (-log) that two intenities have the same polymer length
836                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
837                 
838                 for(int i=0;i<NUMBINS;i++){
839                         for(int j=0;j<NUMBINS;j++){             
840                                 
841                                 double minSum = 100000000;
842                                 
843                                 for(int k=0;k<HOMOPS;k++){
844                                         double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
845                                         
846                                         if(sum < minSum)        {       minSum = sum;           }
847                                 }       
848                                 jointLookUp[i * NUMBINS + j] = minSum;
849                         }
850                 }
851         }
852         catch(exception& e) {
853                 m->errorOut(e, "ShhherCommand", "getJointLookUp");
854                 exit(1);
855         }
856 }
857
858 /**************************************************************************************************/
859
860 double ShhherCommand::getProbIntensity(int intIntensity){                          
861         try{
862
863                 double minNegLogProb = 100000000; 
864
865                 
866                 for(int i=0;i<HOMOPS;i++){//loop signal strength
867                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
868                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
869                 }
870                 
871                 return minNegLogProb;
872         }
873         catch(exception& e) {
874                 m->errorOut(e, "ShhherCommand", "getProbIntensity");
875                 exit(1);
876         }
877 }
878
879 /**************************************************************************************************/
880
881 void ShhherCommand::getUniques(){
882         try{
883                 
884                 
885                 numUniques = 0;
886                 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
887                 uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
888                 uniqueLengths.assign(numSeqs, 0);
889                 mapSeqToUnique.assign(numSeqs, -1);
890                 mapUniqueToSeq.assign(numSeqs, -1);
891                 
892                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
893                 
894                 for(int i=0;i<numSeqs;i++){
895                         int index = 0;
896                         
897                         vector<short> current(numFlowCells);
898                         for(int j=0;j<numFlowCells;j++){        current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));        }
899                                                 
900                         for(int j=0;j<numUniques;j++){
901                                 int offset = j * numFlowCells;
902                                 bool toEnd = 1;
903                                 
904                                 for(int k=0;k<numFlowCells;k++){
905                                         if(current[k] != uniqueFlowgrams[offset + k]){
906                                                 toEnd = 0;
907                                                 break;
908                                         }
909                                 }
910                                 
911                                 if(toEnd){
912                                         mapSeqToUnique[i] = j;
913                                         uniqueCount[j]++;
914                                         index = j;
915                                         break;
916                                 }
917                                 index++;
918                         }
919                         
920                         if(index == numUniques){
921                                 uniqueLengths[numUniques] = lengths[i];
922                                 uniqueCount[numUniques] = 1;
923                                 mapSeqToUnique[i] = numUniques;//anMap
924                                 mapUniqueToSeq[numUniques] = i;//anF
925                                 
926                                 for(int k=0;k<numFlowCells;k++){
927                                         uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
928                                         uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
929                                 }
930                                 
931                                 numUniques++;
932                         }
933                 }
934                 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
935                 uniqueLengths.resize(numUniques);       
936                 
937                 flowDataPrI.assign(numSeqs * numFlowCells, 0);
938                 for(int i=0;i<flowDataPrI.size();i++)   {       flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);             }
939         }
940         catch(exception& e) {
941                 m->errorOut(e, "ShhherCommand", "getUniques");
942                 exit(1);
943         }
944 }
945
946 /**************************************************************************************************/
947
948 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
949         try{
950                 int minLength = lengths[mapSeqToUnique[seqA]];
951                 if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
952                 
953                 int ANumFlowCells = seqA * numFlowCells;
954                 int BNumFlowCells = seqB * numFlowCells;
955                 
956                 float dist = 0;
957                 
958                 for(int i=0;i<minLength;i++){
959                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
960                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
961                         
962                         int flowBIntI = flowDataIntI[BNumFlowCells + i];
963                         float flowBPrI = flowDataPrI[BNumFlowCells + i];
964                         dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
965                 }
966                 
967                 dist /= (float) minLength;
968                 return dist;
969         }
970         catch(exception& e) {
971                 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
972                 exit(1);
973         }
974 }
975
976 /**************************************************************************************************/
977
978 void ShhherCommand::flowDistParentFork(string distFileName, int startSeq, int stopSeq){
979         try{            
980
981                 ostringstream outStream;
982                 outStream.setf(ios::fixed, ios::floatfield);
983                 outStream.setf(ios::dec, ios::basefield);
984                 outStream.setf(ios::showpoint);
985                 outStream.precision(6);
986                 
987                 int begTime = time(NULL);
988                 double begClock = clock();
989
990                 for(int i=startSeq;i<stopSeq;i++){
991                         for(int j=0;j<i;j++){
992                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
993
994                                 if(flowDistance < 1e-6){
995                                         outStream << seqNameVector[mapUniqueToSeq[i]] << '\t' << seqNameVector[mapUniqueToSeq[j]] << '\t' << 0.000000 << endl;
996                                 }
997                                 else if(flowDistance <= cutoff){
998                                         outStream << seqNameVector[mapUniqueToSeq[i]] << '\t' << seqNameVector[mapUniqueToSeq[j]] << '\t' << flowDistance << endl;
999                                 }
1000                         }
1001                         if(i % 100 == 0){
1002                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
1003                                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
1004                                 m->mothurOutEndLine();
1005                         }
1006                 }
1007                 m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
1008                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
1009                 m->mothurOutEndLine();
1010                 
1011                 ofstream distFile(distFileName.c_str());
1012                 distFile << outStream.str();            
1013                 distFile.close();
1014         }
1015         catch(exception& e) {
1016                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
1017                 exit(1);
1018         }
1019 }
1020
1021 /**************************************************************************************************/
1022
1023 string ShhherCommand::createDistFile(int processors){
1024         try{
1025                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.dist";
1026                                 
1027                 unsigned long int begTime = time(NULL);
1028                 double begClock = clock();
1029
1030                 vector<int> start;
1031                 vector<int> end;
1032                 
1033 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1034                 if(processors == 1)     {       flowDistParentFork(fDistFileName, 0, numUniques);               }
1035                 else{ //you have multiple processors
1036                         
1037                         if (numSeqs < processors){      processors = 1; }
1038                         
1039                         vector<int> start(processors, 0);
1040                         vector<int> end(processors, 0);
1041                         
1042                         for (int i = 0; i < processors; i++) {
1043                                 start[i] = int(sqrt(float(i)/float(processors)) * numUniques);
1044                                 end[i] = int(sqrt(float(i+1)/float(processors)) * numUniques);
1045                         }
1046                         
1047                         int process = 1;
1048                         vector<int> processIDs;
1049                         
1050                         //loop through and create all the processes you want
1051                         while (process != processors) {
1052                                 int pid = fork();
1053                                 
1054                                 if (pid > 0) {
1055                                         processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1056                                         process++;
1057                                 }else if (pid == 0){
1058                                         flowDistParentFork(fDistFileName + toString(getpid()) + ".temp", start[process], end[process]);
1059                                         exit(0);
1060                                 }else { 
1061                                         m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
1062                                         perror(" : ");
1063                                         for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
1064                                         exit(0);
1065                                 }
1066                         }
1067                         
1068                         //parent does its part
1069                         flowDistParentFork(fDistFileName, start[0], end[0]);
1070                         
1071                         //force parent to wait until all the processes are done
1072                         for (int i=0;i<processIDs.size();i++) { 
1073                                 int temp = processIDs[i];
1074                                 wait(&temp);
1075                         }
1076                         
1077                         //append and remove temp files
1078                         for (int i=0;i<processIDs.size();i++) { 
1079                                 m->appendFiles((fDistFileName + toString(processIDs[i]) + ".temp"), fDistFileName);
1080                                 remove((fDistFileName + toString(processIDs[i]) + ".temp").c_str());
1081                         }
1082                         
1083                 }
1084                 
1085 #else
1086                 flowDistParentFork(fDistFileName, 0, numUniques);
1087 #endif
1088
1089                 m->mothurOutEndLine();
1090                 
1091                 cout << "Total time: " << (time(NULL) - begTime) << "\t"  << (clock() - begClock)/CLOCKS_PER_SEC << endl;;
1092
1093                 return fDistFileName;
1094         }
1095         catch(exception& e) {
1096                 m->errorOut(e, "ShhherCommand", "createDistFile");
1097                 exit(1);
1098         }
1099         
1100 }
1101
1102 /**************************************************************************************************/
1103
1104 string ShhherCommand::createNamesFile(){
1105         try{
1106                 
1107                 vector<string> duplicateNames(numUniques, "");
1108                 for(int i=0;i<numSeqs;i++){
1109                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
1110                 }
1111                 
1112                 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.names";
1113                 
1114                 ofstream nameFile;
1115                 m->openOutputFile(nameFileName, nameFile);
1116                 
1117                 for(int i=0;i<numUniques;i++){
1118 //                      nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
1119                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
1120                 }
1121                 
1122                 nameFile.close();
1123                 return  nameFileName;
1124         }
1125         catch(exception& e) {
1126                 m->errorOut(e, "ShhherCommand", "createNamesFile");
1127                 exit(1);
1128         }
1129 }
1130
1131 //**********************************************************************************************************************
1132
1133 string ShhherCommand::cluster(string distFileName, string namesFileName){
1134         try {
1135                 
1136                 SparseMatrix* matrix;
1137                 ListVector* list;
1138                 RAbundVector* rabund;
1139                 
1140                 globaldata->setNameFile(namesFileName);
1141                 globaldata->setColumnFile(distFileName);
1142                 globaldata->setFormat("column");
1143                 
1144                 ReadMatrix* read = new ReadColumnMatrix(distFileName);  
1145                 read->setCutoff(cutoff);
1146                 
1147                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1148                 clusterNameMap->readMap();
1149                 read->read(clusterNameMap);
1150                 
1151                 list = read->getListVector();
1152                 matrix = read->getMatrix();
1153                 
1154                 delete read; 
1155                 delete clusterNameMap; 
1156                                 
1157                 rabund = new RAbundVector(list->getRAbundVector());
1158                 
1159                 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
1160                 string tag = cluster->getTag();
1161                 
1162                 double clusterCutoff = cutoff;
1163                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1164                         cluster->update(clusterCutoff);
1165                 }
1166                 
1167                 list->setLabel(toString(cutoff));
1168                 
1169                 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.list";
1170                 ofstream listFile;
1171                 m->openOutputFile(listFileName, listFile);
1172                 list->print(listFile);
1173                 listFile.close();
1174                 
1175                 delete matrix;  delete cluster; delete rabund; delete list;
1176         
1177                 return listFileName;
1178         }
1179         catch(exception& e) {
1180                 m->errorOut(e, "ShhherCommand", "cluster");
1181                 exit(1);        
1182         }               
1183 }
1184
1185 /**************************************************************************************************/
1186
1187 void ShhherCommand::getOTUData(string listFileName){
1188         try {
1189
1190                 ifstream listFile;
1191                 m->openInputFile(listFileName, listFile);
1192                 string label;
1193                 
1194                 listFile >> label >> numOTUs;
1195
1196                 otuData.assign(numSeqs, 0);
1197                 cumNumSeqs.assign(numOTUs, 0);
1198                 nSeqsPerOTU.assign(numOTUs, 0);
1199                 aaP.resize(numOTUs);
1200                 
1201                 string singleOTU = "";
1202                 
1203                 for(int i=0;i<numOTUs;i++){
1204
1205                         listFile >> singleOTU;
1206                         
1207                         istringstream otuString(singleOTU);
1208
1209                         while(otuString){
1210                                 
1211                                 string seqName = "";
1212                                 
1213                                 for(int j=0;j<singleOTU.length();j++){
1214                                         char letter = otuString.get();
1215                                         
1216                                         if(letter != ','){
1217                                                 seqName += letter;
1218                                         }
1219                                         else{
1220                                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
1221                                                 int index = nmIt->second;
1222                                                 
1223                                                 nameMap.erase(nmIt);
1224                                                 
1225                                                 otuData[index] = i;
1226                                                 nSeqsPerOTU[i]++;
1227                                                 aaP[i].push_back(index);
1228                                                 seqName = "";
1229                                         }
1230                                 }
1231                                 
1232                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
1233
1234                                 int index = nmIt->second;
1235                                 nameMap.erase(nmIt);
1236
1237                                 otuData[index] = i;
1238                                 nSeqsPerOTU[i]++;
1239                                 aaP[i].push_back(index);        
1240                                 
1241                                 otuString.get();
1242                         }
1243                         
1244                         sort(aaP[i].begin(), aaP[i].end());
1245                         for(int j=0;j<nSeqsPerOTU[i];j++){
1246                                 seqNumber.push_back(aaP[i][j]);
1247                         }
1248                         for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
1249                                 aaP[i].push_back(0);
1250                         }
1251                 }
1252                 
1253                 for(int i=1;i<numOTUs;i++){
1254                         cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
1255                 }
1256                 aaI = aaP;
1257                 seqIndex = seqNumber;
1258                 
1259                 listFile.close();       
1260         }
1261         catch(exception& e) {
1262                 m->errorOut(e, "ShhherCommand", "getOTUData");
1263                 exit(1);        
1264         }               
1265 }
1266
1267 /**************************************************************************************************/
1268
1269 void ShhherCommand::initPyroCluster(){                          
1270         try{
1271                 dist.assign(numSeqs * numOTUs, 0);
1272                 change.assign(numOTUs, 1);
1273                 centroids.assign(numOTUs, -1);
1274                 weight.assign(numOTUs, 0);
1275                 singleTau.assign(numSeqs, 1.0);
1276                 
1277                 nSeqsBreaks.assign(processors+1, 0);
1278                 nOTUsBreaks.assign(processors+1, 0);
1279
1280                 nSeqsBreaks[0] = 0;
1281                 for(int i=0;i<processors;i++){
1282                         nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
1283                         nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
1284                 }
1285                 nSeqsBreaks[processors] = numSeqs;
1286                 nOTUsBreaks[processors] = numOTUs;
1287         }
1288         catch(exception& e) {
1289                 m->errorOut(e, "ShhherCommand", "initPyroCluster");
1290                 exit(1);        
1291         }
1292 }
1293
1294 /**************************************************************************************************/
1295
1296 void ShhherCommand::fill(){
1297         try {
1298                 int index = 0;
1299                 for(int i=0;i<numOTUs;i++){
1300                         cumNumSeqs[i] = index;
1301                         for(int j=0;j<nSeqsPerOTU[i];j++){
1302                                 seqNumber[index] = aaP[i][j];
1303                                 seqIndex[index] = aaI[i][j];
1304                                 
1305                                 index++;
1306                         }
1307                 }
1308         }
1309         catch(exception& e) {
1310                 m->errorOut(e, "ShhherCommand", "fill");
1311                 exit(1);        
1312         }               
1313 }
1314
1315 /**************************************************************************************************/
1316
1317 void ShhherCommand::calcCentroids(){                          
1318         try{
1319                 
1320 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1321
1322                 if(processors == 1)     {
1323                         calcCentroidsDriver(0, numOTUs);                
1324                 }
1325                 else{ //you have multiple processors
1326                         if (numOTUs < processors){      processors = 1; }
1327                         
1328                         int process = 1;
1329                         vector<int> processIDs;
1330                         
1331                         //loop through and create all the processes you want
1332                         while (process != processors) {
1333                                 int pid = vfork();
1334                                 
1335                                 if (pid > 0) {
1336                                         processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1337                                         process++;
1338                                 }else if (pid == 0){
1339                                         calcCentroidsDriver(nOTUsBreaks[process], nOTUsBreaks[process+1]);
1340                                         exit(0);
1341                                 }else { 
1342                                         m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
1343                                         perror(" : ");
1344                                         for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
1345                                         exit(0);
1346                                 }
1347                         }
1348                         
1349                         //parent does its part
1350                         calcCentroidsDriver(nOTUsBreaks[0], nOTUsBreaks[1]);
1351
1352                         //force parent to wait until all the processes are done
1353                         for (int i=0;i<processIDs.size();i++) { 
1354                                 int temp = processIDs[i];
1355                                 wait(&temp);
1356                         }
1357                 }
1358                 
1359 #else
1360                 calcCentroidsDriver(0, numOTUs);
1361 #endif          
1362         }
1363         catch(exception& e) {
1364                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1365                 exit(1);        
1366         }               
1367 }
1368
1369 /**************************************************************************************************/
1370
1371 void ShhherCommand::calcCentroidsDriver(int start, int finish){                          
1372         
1373         //this function gets the most likely homopolymer length at a flow position for a group of sequences
1374         //within an otu
1375         
1376         try{
1377                 
1378                 for(int i=start;i<finish;i++){
1379                         
1380                         double count = 0;
1381                         int position = 0;
1382                         int minFlowGram = 100000000;
1383                         double minFlowValue = 1e8;
1384                         change[i] = 0; //FALSE
1385                         
1386                         for(int j=0;j<nSeqsPerOTU[i];j++){
1387                                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1388                         }
1389                         
1390                         if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1391                                 vector<double> adF(nSeqsPerOTU[i]);
1392                                 vector<int> anL(nSeqsPerOTU[i]);
1393                                 
1394                                 for(int j=0;j<nSeqsPerOTU[i];j++){
1395                                         int index = cumNumSeqs[i] + j;
1396                                         int nI = seqIndex[index];
1397                                         int nIU = mapSeqToUnique[nI];
1398                                         
1399                                         int k;
1400                                         for(k=0;k<position;k++){
1401                                                 if(nIU == anL[k]){
1402                                                         break;
1403                                                 }
1404                                         }
1405                                         if(k == position){
1406                                                 anL[position] = nIU;
1407                                                 adF[position] = 0.0000;
1408                                                 position++;
1409                                         }                                               
1410                                 }
1411                                 
1412                                 for(int j=0;j<nSeqsPerOTU[i];j++){
1413                                         int index = cumNumSeqs[i] + j;
1414                                         int nI = seqIndex[index];
1415                                         
1416                                         double tauValue = singleTau[seqNumber[index]];
1417                                         
1418                                         for(int k=0;k<position;k++){
1419                                                 double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1420                                                 adF[k] += dist * tauValue;
1421                                         }
1422                                 }
1423                                 
1424                                 for(int j=0;j<position;j++){
1425                                         if(adF[j] < minFlowValue){
1426                                                 minFlowGram = j;
1427                                                 minFlowValue = adF[j];
1428                                         }
1429                                 }
1430                                 
1431                                 if(centroids[i] != anL[minFlowGram]){
1432                                         change[i] = 1;
1433                                         centroids[i] = anL[minFlowGram];
1434                                 }
1435                         }
1436                         else if(centroids[i] != -1){
1437                                 change[i] = 1;
1438                                 centroids[i] = -1;                      
1439                         }
1440                 }
1441         }
1442         catch(exception& e) {
1443                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1444                 exit(1);        
1445         }               
1446 }
1447
1448 /**************************************************************************************************/
1449
1450 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1451         try{
1452                 
1453                 int flowAValue = cent * numFlowCells;
1454                 int flowBValue = flow * numFlowCells;
1455                 
1456                 double dist = 0;
1457                 
1458                 for(int i=0;i<length;i++){
1459                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1460                         flowAValue++;
1461                         flowBValue++;
1462                 }
1463                 
1464                 return dist / (double)length;
1465         }
1466         catch(exception& e) {
1467                 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1468                 exit(1);        
1469         }               
1470 }
1471
1472 /**************************************************************************************************/
1473
1474 double ShhherCommand::getNewWeights(){
1475         try{
1476                 
1477                 double maxChange = 0;
1478                 
1479                 for(int i=0;i<numOTUs;i++){
1480                         
1481                         double difference = weight[i];
1482                         weight[i] = 0;
1483                         
1484                         for(int j=0;j<nSeqsPerOTU[i];j++){
1485                                 int index = cumNumSeqs[i] + j;
1486                                 double tauValue = singleTau[seqNumber[index]];
1487                                 weight[i] += tauValue;
1488                         }
1489                         
1490                         difference = fabs(weight[i] - difference);
1491                         if(difference > maxChange){     maxChange = difference; }
1492                 }
1493                 return maxChange;
1494         }
1495         catch(exception& e) {
1496                 m->errorOut(e, "ShhherCommand", "getNewWeights");
1497                 exit(1);        
1498         }               
1499 }
1500
1501 /**************************************************************************************************/
1502
1503 double ShhherCommand::getLikelihood(){
1504         
1505         try{
1506                 
1507                 vector<long double> P(numSeqs, 0);
1508                 int effNumOTUs = 0;
1509                 
1510                 for(int i=0;i<numOTUs;i++){
1511                         if(weight[i] > MIN_WEIGHT){
1512                                 effNumOTUs++;
1513                         }
1514                 }
1515                 
1516                 string hold;
1517                 for(int i=0;i<numOTUs;i++){
1518                         for(int j=0;j<nSeqsPerOTU[i];j++){
1519                                 int index = cumNumSeqs[i] + j;
1520                                 int nI = seqIndex[index];
1521                                 double singleDist = dist[seqNumber[index]];
1522                                 
1523                                 P[nI] += weight[i] * exp(-singleDist * sigma);
1524                         }
1525                 }
1526                 double nLL = 0.00;
1527                 for(int i=0;i<numSeqs;i++){
1528                         if(P[i] == 0){  P[i] = DBL_EPSILON;     }
1529
1530                         nLL += -log(P[i]);
1531                 }
1532                 
1533                 nLL = nLL -(double)numSeqs * log(sigma);
1534
1535                 return nLL; 
1536         }
1537         catch(exception& e) {
1538                 m->errorOut(e, "ShhherCommand", "getNewWeights");
1539                 exit(1);        
1540         }               
1541 }
1542
1543 /**************************************************************************************************/
1544
1545 void ShhherCommand::checkCentroids(){
1546         try{
1547                 vector<int> unique(numOTUs, 1);
1548                 
1549                 for(int i=0;i<numOTUs;i++){
1550                         if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1551                                 unique[i] = -1;
1552                         }
1553                 }
1554                 
1555                 for(int i=0;i<numOTUs;i++){
1556                         if(unique[i] == 1){
1557                                 for(int j=i+1;j<numOTUs;j++){
1558                                         if(unique[j] == 1){
1559                                                 
1560                                                 if(centroids[j] == centroids[i]){
1561                                                         unique[j] = 0;
1562                                                         centroids[j] = -1;
1563                                                         
1564                                                         weight[i] += weight[j];
1565                                                         weight[j] = 0.0;
1566                                                 }
1567                                         }
1568                                 }
1569                         }
1570                 }
1571         }
1572         catch(exception& e) {
1573                 m->errorOut(e, "ShhherCommand", "checkCentroids");
1574                 exit(1);        
1575         }               
1576 }
1577
1578 /**************************************************************************************************/
1579
1580 void ShhherCommand::calcNewDistances(){                          
1581         try{
1582                 
1583 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1584                 
1585                 if(processors == 1)     {
1586                         calcNewDistancesParent(0, numSeqs);             
1587                 }
1588                 else{ //you have multiple processors
1589                         if (numSeqs < processors){      processors = 1; }
1590                         
1591                         vector<vector<int> > child_otuIndex(processors);
1592                         vector<vector<int> > child_seqIndex(processors);
1593                         vector<vector<double> > child_singleTau(processors);                    
1594                         vector<int> totals(processors);
1595                         
1596                         int process = 1;
1597                         vector<int> processIDs;
1598
1599                         //loop through and create all the processes you want
1600                         while (process != processors) {
1601                                 int pid = vfork();
1602                                 
1603                                 if (pid > 0) {
1604                                         processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1605                                         process++;
1606                                 }else if (pid == 0){
1607                                         calcNewDistancesChild(nSeqsBreaks[process], nSeqsBreaks[process+1], child_otuIndex[process], child_seqIndex[process], child_singleTau[process]);
1608                                         totals[process] = child_otuIndex[process].size();
1609
1610                                         exit(0);
1611                                 }else { 
1612                                         m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
1613                                         perror(" : ");
1614                                         for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
1615                                         exit(0);
1616                                 }
1617                         }
1618                         
1619                         //parent does its part
1620                         calcNewDistancesParent(nSeqsBreaks[0], nSeqsBreaks[1]);
1621                         int total = seqIndex.size();
1622
1623                         //force parent to wait until all the processes are done
1624                         for (int i=0;i<processIDs.size();i++) { 
1625                                 int temp = processIDs[i];
1626                                 wait(&temp);
1627                         }
1628
1629                         for(int i=1;i<processors;i++){
1630                                 int oldTotal = total;
1631                                 total += totals[i];
1632
1633                                 singleTau.resize(total, 0);
1634                                 seqIndex.resize(total, 0);
1635                                 seqNumber.resize(total, 0);
1636                                 
1637                                 int childIndex = 0;
1638                                 
1639                                 for(int j=oldTotal;j<total;j++){
1640                                         int otuI = child_otuIndex[i][childIndex];
1641                                         int seqI = child_seqIndex[i][childIndex];
1642
1643                                         singleTau[j] = child_singleTau[i][childIndex];
1644                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
1645                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
1646                                         nSeqsPerOTU[otuI]++;
1647
1648                                         childIndex++;
1649                                 }
1650                         }
1651                 }
1652 #else
1653                 calcNewDistancesParent(0, numSeqs);             
1654 #endif          
1655         }
1656         catch(exception& e) {
1657                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
1658                 exit(1);        
1659         }               
1660 }
1661
1662 /**************************************************************************************************/
1663 #ifdef USE_MPI
1664 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1665         
1666         try{
1667                 vector<double> newTau(numOTUs,0);
1668                 vector<double> norms(numSeqs, 0);
1669                 otuIndex.resize(0);
1670                 seqIndex.resize(0);
1671                 singleTau.resize(0);
1672                 
1673                 
1674                 
1675                 for(int i=startSeq;i<stopSeq;i++){
1676                         double offset = 1e8;
1677                         int indexOffset = i * numOTUs;
1678                         
1679                         for(int j=0;j<numOTUs;j++){
1680                                 
1681                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1682                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1683                                 }
1684                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1685                                         offset = dist[indexOffset + j];
1686                                 }
1687                         }
1688                         
1689                         for(int j=0;j<numOTUs;j++){
1690                                 if(weight[j] > MIN_WEIGHT){
1691                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1692                                         norms[i] += newTau[j];
1693                                 }
1694                                 else{
1695                                         newTau[j] = 0.0;
1696                                 }
1697                         }
1698                         
1699                         for(int j=0;j<numOTUs;j++){
1700
1701                                 newTau[j] /= norms[i];
1702                                 
1703                                 if(newTau[j] > MIN_TAU){
1704                                         otuIndex.push_back(j);
1705                                         seqIndex.push_back(i);
1706                                         singleTau.push_back(newTau[j]);
1707                                 }
1708                         }
1709                         
1710                 }
1711         }
1712         catch(exception& e) {
1713                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1714                 exit(1);        
1715         }               
1716 }
1717 #endif
1718 /**************************************************************************************************/
1719
1720 void ShhherCommand::calcNewDistancesChild(int startSeq, int stopSeq, vector<int>& child_otuIndex, vector<int>& child_seqIndex, vector<double>& child_singleTau){
1721         
1722         try{
1723                 vector<double> newTau(numOTUs,0);
1724                 vector<double> norms(numSeqs, 0);
1725                 child_otuIndex.resize(0);
1726                 child_seqIndex.resize(0);
1727                 child_singleTau.resize(0);
1728                 
1729                 for(int i=startSeq;i<stopSeq;i++){
1730                         double offset = 1e8;
1731                         int indexOffset = i * numOTUs;
1732                         
1733                         
1734                         for(int j=0;j<numOTUs;j++){
1735                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1736                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1737                                 }
1738                                 
1739                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1740                                         offset = dist[indexOffset + j];
1741                                 }
1742                         }
1743                         
1744                         for(int j=0;j<numOTUs;j++){
1745                                 if(weight[j] > MIN_WEIGHT){
1746                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1747                                         norms[i] += newTau[j];
1748                                 }
1749                                 else{
1750                                         newTau[j] = 0.0;
1751                                 }
1752                         }
1753                         
1754                         for(int j=0;j<numOTUs;j++){
1755                                 newTau[j] /= norms[i];
1756                                 
1757                                 if(newTau[j] > MIN_TAU){
1758                                         child_otuIndex.push_back(j);
1759                                         child_seqIndex.push_back(i);
1760                                         child_singleTau.push_back(newTau[j]);
1761                                 }
1762                         }
1763                 }
1764         }
1765         catch(exception& e) {
1766                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChild");
1767                 exit(1);        
1768         }               
1769 }
1770
1771 /**************************************************************************************************/
1772
1773 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1774         
1775         try{
1776                 
1777                 int total = 0;
1778                 vector<double> newTau(numOTUs,0);
1779                 vector<double> norms(numSeqs, 0);
1780                 nSeqsPerOTU.assign(numOTUs, 0);
1781                 
1782                 for(int i=startSeq;i<stopSeq;i++){
1783                         int indexOffset = i * numOTUs;
1784                         
1785                         double offset = 1e8;
1786                         
1787                         for(int j=0;j<numOTUs;j++){
1788                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1789                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1790                                 }
1791                                 
1792                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1793                                         offset = dist[indexOffset + j];
1794                                 }
1795                         }
1796                         
1797                         for(int j=0;j<numOTUs;j++){
1798                                 if(weight[j] > MIN_WEIGHT){
1799                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1800                                         norms[i] += newTau[j];
1801                                 }
1802                                 else{
1803                                         newTau[j] = 0.0;
1804                                 }
1805                         }
1806                         
1807                         for(int j=0;j<numOTUs;j++){
1808                                 newTau[j] /= norms[i];
1809                         }
1810                         
1811                         for(int j=0;j<numOTUs;j++){
1812                                 if(newTau[j] > MIN_TAU){
1813                                         
1814                                         int oldTotal = total;
1815                                         
1816                                         total++;
1817                                         
1818                                         singleTau.resize(total, 0);
1819                                         seqNumber.resize(total, 0);
1820                                         seqIndex.resize(total, 0);
1821                                         
1822                                         singleTau[oldTotal] = newTau[j];
1823                                         
1824                                         aaP[j][nSeqsPerOTU[j]] = oldTotal;
1825                                         aaI[j][nSeqsPerOTU[j]] = i;
1826                                         nSeqsPerOTU[j]++;
1827                                 }
1828                         }
1829                 }
1830         }
1831         catch(exception& e) {
1832                 m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1833                 exit(1);        
1834         }               
1835 }
1836
1837 /**************************************************************************************************/
1838
1839 void ShhherCommand::setOTUs(){
1840         
1841         try {
1842                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1843                 
1844                 for(int i=0;i<numOTUs;i++){
1845                         for(int j=0;j<nSeqsPerOTU[i];j++){
1846                                 int index = cumNumSeqs[i] + j;
1847                                 double tauValue = singleTau[seqNumber[index]];
1848                                 int sIndex = seqIndex[index];
1849                                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
1850                         }
1851                 }
1852                 
1853                 for(int i=0;i<numSeqs;i++){
1854                         double maxTau = -1.0000;
1855                         int maxOTU = -1;
1856                         for(int j=0;j<numOTUs;j++){
1857                                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1858                                         maxTau = bigTauMatrix[i * numOTUs + j];
1859                                         maxOTU = j;
1860                                 }
1861                         }
1862                         
1863                         otuData[i] = maxOTU;
1864                 }
1865                 
1866                 nSeqsPerOTU.assign(numOTUs, 0);         
1867                 
1868                 for(int i=0;i<numSeqs;i++){
1869                         int index = otuData[i];
1870                         
1871                         singleTau[i] = 1.0000;
1872                         dist[i] = 0.0000;
1873                         
1874                         aaP[index][nSeqsPerOTU[index]] = i;
1875                         aaI[index][nSeqsPerOTU[index]] = i;
1876                         
1877                         nSeqsPerOTU[index]++;
1878                 }
1879                 fill(); 
1880         }
1881         catch(exception& e) {
1882                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
1883                 exit(1);        
1884         }               
1885 }
1886
1887 /**************************************************************************************************/
1888
1889 void ShhherCommand::writeQualities(vector<int> otuCounts){
1890         
1891         try {
1892                 string qualityFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.qual";
1893
1894                 ofstream qualityFile;
1895                 m->openOutputFile(qualityFileName, qualityFile);
1896
1897                 qualityFile.setf(ios::fixed, ios::floatfield);
1898                 qualityFile.setf(ios::showpoint);
1899                 qualityFile << setprecision(6);
1900                 
1901                 vector<vector<int> > qualities(numOTUs);
1902                 vector<double> pr(HOMOPS, 0);
1903                 
1904                 
1905                 for(int i=0;i<numOTUs;i++){
1906                         int index = 0;
1907                         int base = 0;
1908                         
1909                         if(nSeqsPerOTU[i] > 0){
1910                                 qualities[i].assign(1024, -1);
1911                                 
1912                                 while(index < numFlowCells){
1913                                         double maxPrValue = 1e8;
1914                                         short maxPrIndex = -1;
1915                                         double count = 0.0000;
1916                                         
1917                                         pr.assign(HOMOPS, 0);
1918                                         
1919                                         for(int j=0;j<nSeqsPerOTU[i];j++){
1920                                                 int lIndex = cumNumSeqs[i] + j;
1921                                                 double tauValue = singleTau[seqNumber[lIndex]];
1922                                                 int sequenceIndex = aaI[i][j];
1923                                                 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1924                                                 
1925                                                 count += tauValue;
1926                                                 
1927                                                 for(int s=0;s<HOMOPS;s++){
1928                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1929                                                 }
1930                                         }
1931                                         
1932                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1933                                         maxPrValue = pr[maxPrIndex];
1934                                         
1935                                         if(count > MIN_COUNT){
1936                                                 double U = 0.0000;
1937                                                 double norm = 0.0000;
1938                                                 
1939                                                 for(int s=0;s<HOMOPS;s++){
1940                                                         norm += exp(-(pr[s] - maxPrValue));
1941                                                 }
1942                                                 
1943                                                 for(int s=1;s<=maxPrIndex;s++){
1944                                                         int value = 0;
1945                                                         double temp = 0.0000;
1946                                                         
1947                                                         U += exp(-(pr[s-1]-maxPrValue))/norm;
1948                                                         
1949                                                         if(U>0.00){
1950                                                                 temp = log10(U);
1951                                                         }
1952                                                         else{
1953                                                                 temp = -10.1;
1954                                                         }
1955                                                         temp = floor(-10 * temp);
1956                                                         value = (int)floor(temp);
1957                                                         if(value > 100){        value = 100;    }
1958                                                         
1959                                                         qualities[i][base] = (int)value;
1960                                                         base++;
1961                                                 }
1962                                         }
1963                                         
1964                                         index++;
1965                                 }
1966                         }
1967                         
1968                         
1969                         if(otuCounts[i] > 0){
1970                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1971                                 
1972                                 int j=4;        //need to get past the first four bases
1973                                 while(qualities[i][j] != -1){
1974                                         qualityFile << qualities[i][j] << ' ';
1975                                         j++;
1976                                 }
1977                                 qualityFile << endl;
1978                         }
1979                 }
1980                 qualityFile.close();
1981                 
1982         }
1983         catch(exception& e) {
1984                 m->errorOut(e, "ShhherCommand", "writeQualities");
1985                 exit(1);        
1986         }               
1987 }
1988
1989 /**************************************************************************************************/
1990
1991 void ShhherCommand::writeSequences(vector<int> otuCounts){
1992         try {
1993                 
1994                 string bases = "TACG";
1995                 
1996                 string fastaFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.fasta";
1997                 ofstream fastaFile;
1998                 m->openOutputFile(fastaFileName, fastaFile);
1999                 
2000                 vector<string> names(numOTUs, "");
2001                 
2002                 for(int i=0;i<numOTUs;i++){
2003                         int index = centroids[i];
2004                         
2005                         if(otuCounts[i] > 0){
2006                                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
2007                                 
2008                                 for(int j=8;j<numFlowCells;j++){
2009                                         
2010                                         char base = bases[j % 4];
2011                                         for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
2012                                                 fastaFile << base;
2013                                         }
2014                                 }
2015                                 fastaFile << endl;
2016                         }
2017                 }
2018                 fastaFile.close();
2019         }
2020         catch(exception& e) {
2021                 m->errorOut(e, "ShhherCommand", "writeSequences");
2022                 exit(1);        
2023         }               
2024 }
2025
2026 /**************************************************************************************************/
2027
2028 void ShhherCommand::writeNames(vector<int> otuCounts){
2029         try {
2030                 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.final.names";
2031                 ofstream nameFile;
2032                 m->openOutputFile(nameFileName, nameFile);
2033                 
2034                 for(int i=0;i<numOTUs;i++){
2035                         if(otuCounts[i] > 0){
2036                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
2037                                 
2038                                 for(int j=1;j<nSeqsPerOTU[i];j++){
2039                                         nameFile << ',' << seqNameVector[aaI[i][j]];
2040                                 }
2041                                 
2042                                 nameFile << endl;
2043                         }
2044                 }
2045                 nameFile.close();
2046         }
2047         catch(exception& e) {
2048                 m->errorOut(e, "ShhherCommand", "writeNames");
2049                 exit(1);        
2050         }               
2051 }
2052
2053 /**************************************************************************************************/
2054
2055 void ShhherCommand::writeGroups(){
2056         try {
2057                 string fileRoot = flowFileName.substr(0,flowFileName.find_last_of('.'));
2058                 string groupFileName = fileRoot + ".pn.groups";
2059                 ofstream groupFile;
2060                 m->openOutputFile(groupFileName, groupFile);
2061                 
2062                 for(int i=0;i<numSeqs;i++){
2063                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
2064                 }
2065                 groupFile.close();
2066         }
2067         catch(exception& e) {
2068                 m->errorOut(e, "ShhherCommand", "writeGroups");
2069                 exit(1);        
2070         }               
2071 }
2072
2073 /**************************************************************************************************/
2074
2075 void ShhherCommand::writeClusters(vector<int> otuCounts){
2076         try {
2077                 string otuCountsFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.counts";
2078                 ofstream otuCountsFile;
2079                 m->openOutputFile(otuCountsFileName, otuCountsFile);
2080                 
2081                 string bases = "TACG";
2082                 
2083                 for(int i=0;i<numOTUs;i++){
2084                         //output the translated version of the centroid sequence for the otu
2085                         if(otuCounts[i] > 0){
2086                                 int index = centroids[i];
2087                                 
2088                                 otuCountsFile << "ideal\t";
2089                                 for(int j=8;j<numFlowCells;j++){
2090                                         char base = bases[j % 4];
2091                                         for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
2092                                                 otuCountsFile << base;
2093                                         }
2094                                 }
2095                                 otuCountsFile << endl;
2096                                 
2097                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2098                                         int sequence = aaI[i][j];
2099                                         otuCountsFile << seqNameVector[sequence] << '\t';
2100                                         
2101                                         for(int k=8;k<lengths[sequence];k++){
2102                                                 char base = bases[k % 4];
2103                                                 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
2104                                                 
2105                                                 for(int s=0;s<freq;s++){
2106                                                         otuCountsFile << base;
2107                                                 }
2108                                         }
2109                                         otuCountsFile << endl;
2110                                 }
2111                                 otuCountsFile << endl;
2112                         }
2113                 }
2114                 otuCountsFile.close();
2115         }
2116         catch(exception& e) {
2117                 m->errorOut(e, "ShhherCommand", "writeClusters");
2118                 exit(1);        
2119         }               
2120 }
2121
2122 //**********************************************************************************************************************