]> git.donarmstrong.com Git - mothur.git/blob - shhhercommand.cpp
took cout's from shhh.seqs
[mothur.git] / shhhercommand.cpp
1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "shhhercommand.h"
11
12 #include "readcolumn.h"
13 #include "readmatrix.hpp"
14 #include "rabundvector.hpp"
15 #include "sabundvector.hpp"
16 #include "listvector.hpp"
17 #include "cluster.hpp"
18 #include "sparsematrix.hpp"
19 #include <cfloat>
20
21 //**********************************************************************************************************************
22
23 #define NUMBINS 1000
24 #define HOMOPS 10
25 #define MIN_COUNT 0.1
26 #define MIN_WEIGHT 0.1
27 #define MIN_TAU 0.0001
28 #define MIN_ITER 10
29
30 //**********************************************************************************************************************
31
32 vector<string> ShhherCommand::getValidParameters(){     
33         try {
34                 string Array[] =  {     
35                         "file", "flow", "lookup", "cutoff", "sigma", "outputdir","inputdir", "processors", "maxiter", "mindelta"        
36                 };
37                 
38                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
39                 return myArray;
40         }
41         catch(exception& e) {
42                 m->errorOut(e, "ShhherCommand", "getValidParameters");
43                 exit(1);
44         }
45 }
46
47 //**********************************************************************************************************************
48
49 ShhherCommand::ShhherCommand(){ 
50         try {
51                 abort = true; calledHelp = true;
52                 
53                 //initialize outputTypes
54                 vector<string> tempOutNames;
55                 outputTypes["pn.dist"] = tempOutNames;
56
57         }
58         catch(exception& e) {
59                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
60                 exit(1);
61         }
62 }
63
64 //**********************************************************************************************************************
65
66 vector<string> ShhherCommand::getRequiredParameters(){  
67         try {
68                 string Array[] =  {"flow"};
69                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
70                 return myArray;
71         }
72         catch(exception& e) {
73                 m->errorOut(e, "ShhherCommand", "getRequiredParameters");
74                 exit(1);
75         }
76 }
77
78 //**********************************************************************************************************************
79
80 vector<string> ShhherCommand::getRequiredFiles(){       
81         try {
82                 vector<string> myArray;
83                 return myArray;
84         }
85         catch(exception& e) {
86                 m->errorOut(e, "ShhherCommand", "getRequiredFiles");
87                 exit(1);
88         }
89 }
90
91 //**********************************************************************************************************************
92
93 ShhherCommand::ShhherCommand(string option) {
94         try {
95
96 #ifdef USE_MPI
97                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
98                 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
99
100                 if(pid == 0){
101 #endif
102                 
103                 
104                 abort = false; calledHelp = false;   
105                 
106                 
107                 //allow user to run help
108                 if(option == "help") { help(); abort = true; calledHelp = true; }
109                 
110                 else {
111                         
112                         //valid paramters for this command
113                         string AlignArray[] =  {
114                                 "file", "flow", "lookup", "cutoff", "sigma", "outputdir","inputdir", "processors", "maxiter", "mindelta"        
115                         };
116                         
117                         vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
118                         
119                         OptionParser parser(option);
120                         map<string,string> parameters = parser.getParameters();
121                         
122                         ValidParameters validParameter;
123                         map<string,string>::iterator it;
124                         
125                         //check to make sure all parameters are valid for command
126                         for (it = parameters.begin(); it != parameters.end(); it++) { 
127                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
128                         }
129                         
130                         //initialize outputTypes
131                         vector<string> tempOutNames;
132                         outputTypes["pn.dist"] = tempOutNames;
133                         //                      outputTypes["fasta"] = tempOutNames;
134                         
135                         //if the user changes the input directory command factory will send this info to us in the output parameter 
136                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
137                         if (inputDir == "not found"){   inputDir = "";          }
138                         else {
139                                 string path;
140                                 it = parameters.find("flow");
141                                 //user has given a template file
142                                 if(it != parameters.end()){ 
143                                         path = m->hasPath(it->second);
144                                         //if the user has not given a path then, add inputdir. else leave path alone.
145                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
146                                 }
147                                 
148                                 it = parameters.find("lookup");
149                                 //user has given a template file
150                                 if(it != parameters.end()){ 
151                                         path = m->hasPath(it->second);
152                                         //if the user has not given a path then, add inputdir. else leave path alone.
153                                         if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
154                                 }
155
156                                 it = parameters.find("file");
157                                 //user has given a template file
158                                 if(it != parameters.end()){ 
159                                         path = m->hasPath(it->second);
160                                         //if the user has not given a path then, add inputdir. else leave path alone.
161                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
162                                 }
163                         }
164                         
165                         
166                         //check for required parameters
167                         flowFileName = validParameter.validFile(parameters, "flow", true);
168                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
169                         if (flowFileName == "not found" && flowFilesFileName == "not found") {
170                                 m->mothurOut("values for either flow or file must be provided for the shhh.seqs command.");
171                                 m->mothurOutEndLine();
172                                 abort = true; 
173                         }
174                         else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
175                         
176                         if(flowFileName != "not found"){        compositeFASTAFileName = "";    }
177                         else{
178                                 compositeFASTAFileName = flowFilesFileName.substr(0, flowFilesFileName.length()-10) + "pn.fasta";
179                                 ofstream temp;
180                                 m->openOutputFile(compositeFASTAFileName, temp);
181                                 temp.close();
182                         }
183                         
184                         //if the user changes the output directory command factory will send this info to us in the output parameter 
185                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
186                                 outputDir = ""; 
187                                 outputDir += m->hasPath(flowFileName); //if user entered a file with a path then preserve it    
188                         }
189                         
190                         
191                         //check for optional parameter and set defaults
192                         // ...at some point should added some additional type checking...
193                         string temp;
194                         temp = validParameter.validFile(parameters, "lookup", true);
195                         if (temp == "not found")        {       lookupFileName = "LookUp_Titanium.pat"; }
196                         else if(temp == "not open")     {       abort = true;                   } 
197                         else                                            {       lookupFileName = temp;  }
198                         
199                         temp = validParameter.validFile(parameters, "processors", false);if (temp == "not found"){      temp = "1";                     }
200                         convert(temp, processors); 
201
202                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
203                         convert(temp, cutoff); 
204                         
205                         temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
206                         convert(temp, minDelta); 
207
208                         temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
209                         convert(temp, maxIters); 
210
211                         temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
212                         convert(temp, sigma); 
213                         
214                         globaldata = GlobalData::getInstance();
215                 }
216                         
217 #ifdef USE_MPI
218                 }                               
219 #endif
220                                 
221         }
222         catch(exception& e) {
223                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
224                 exit(1);
225         }
226 }
227
228 //**********************************************************************************************************************
229
230 ShhherCommand::~ShhherCommand(){}
231
232 //**********************************************************************************************************************
233
234 void ShhherCommand::help(){
235         try {
236                 m->mothurOut("The shhher command reads a file containing flowgrams and creates a file of corrected sequences.\n");
237         }
238         catch(exception& e) {
239                 m->errorOut(e, "ShhherCommand", "help");
240                 exit(1);
241         }
242 }
243
244 //**********************************************************************************************************************
245 #ifdef USE_MPI
246 int ShhherCommand::execute(){
247         try {
248                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
249                 
250                 int tag = 1976;
251                 MPI_Status status; 
252
253                 double begClock = clock();
254                 unsigned long int begTime = time(NULL);
255                 
256                 if(pid == 0){
257
258                         for(int i=1;i<ncpus;i++){
259                                 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
260                         }
261                         if(abort == 1){ return 0;       }
262
263                         processors = ncpus;
264                         
265                         m->mothurOut("\nGetting preliminary data...\n");
266                         getSingleLookUp();
267                         getJointLookUp();
268                         
269                         vector<string> flowFileVector;
270                         if(flowFilesFileName != "not found"){
271                                 string fName;
272
273                                 ifstream flowFilesFile;
274                                 m->openInputFile(flowFilesFileName, flowFilesFile);
275                                 while(flowFilesFile){
276                                         flowFilesFile >> fName;
277                                         flowFileVector.push_back(fName);
278                                         m->gobble(flowFilesFile);
279                                 }
280                         }
281                         else{
282                                 flowFileVector.push_back(flowFileName);
283                         }
284                         int numFiles = flowFileVector.size();
285
286                         for(int i=1;i<ncpus;i++){
287                                 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
288                         }
289                         
290                         for(int i=0;i<numFiles;i++){
291                                 flowFileName = flowFileVector[i];
292
293                                 
294                                 
295                                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
296                                 m->mothurOut("Reading flowgrams...\n");
297                                 getFlowData();
298
299                                 m->mothurOut("Identifying unique flowgrams...\n");
300                                 getUniques();
301
302                                 m->mothurOut("Calculating distances between flowgrams...\n");
303                                 char fileName[1024];
304                                 strcpy(fileName, flowFileName.c_str());
305
306                                 for(int i=1;i<ncpus;i++){
307                                         MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
308
309                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
310                                         MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
311                                         MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
312                                         MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
313                                         MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
314                                         MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
315                                         MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
316                                         MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
317                                         MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
318                                         MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
319                                 }                               
320                                                         
321                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
322                                 
323                                 int done;
324                                 for(int i=1;i<ncpus;i++){
325                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
326                                         
327                                         m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
328                                         remove((distFileName + ".temp." + toString(i)).c_str());
329                                 }
330
331                                 string namesFileName = createNamesFile();
332                                 
333                                 m->mothurOut("\nClustering flowgrams...\n");
334                                 string listFileName = cluster(distFileName, namesFileName);
335
336                                 getOTUData(listFileName);
337                                 initPyroCluster();
338
339                                 for(int i=1;i<ncpus;i++){
340                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
341                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
342                                         MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
343                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
344                                 }
345                                 
346                                 
347                                 double maxDelta = 0;
348                                 int iter = 0;
349                                 
350                                 int numOTUsOnCPU = numOTUs / ncpus;
351                                 int numSeqsOnCPU = numSeqs / ncpus;
352                                 m->mothurOut("\nDenoising flowgrams...\n");
353                                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
354                                 
355                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
356
357                                         double cycClock = clock();
358                                         unsigned long int cycTime = time(NULL);
359                                         fill();
360
361                                         int total = singleTau.size();
362                                         for(int i=1;i<ncpus;i++){
363                                                 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
364                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
365                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
366                                                 
367                                                 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
368                                                 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
369                                                 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
370                                                 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
371                                                 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
372                                         }
373                                 
374                                         calcCentroidsDriver(0, numOTUsOnCPU);
375                                         
376                                         for(int i=1;i<ncpus;i++){
377                                                 int otuStart = i * numOTUs / ncpus;
378                                                 int otuStop = (i + 1) * numOTUs / ncpus;
379                                                 
380                                                 vector<int> tempCentroids(numOTUs, 0);
381                                                 vector<short> tempChange(numOTUs, 0);
382                                                 
383                                                 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
384                                                 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
385                                                 
386                                                 for(int j=otuStart;j<otuStop;j++){
387                                                         centroids[j] = tempCentroids[j];
388                                                         change[j] = tempChange[j];
389                                                 }
390                                         }
391                                                                         
392                                         maxDelta = getNewWeights();
393                                         double nLL = getLikelihood();
394                                         checkCentroids();
395                                         
396                                         for(int i=1;i<ncpus;i++){
397                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
398                                                 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
399                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
400                                         }
401                                         
402                                         calcNewDistancesParent(0, numSeqsOnCPU);
403
404                                         total = singleTau.size();
405
406                                         for(int i=1;i<ncpus;i++){
407                                                 int childTotal;
408                                                 int seqStart = i * numSeqs / ncpus;
409                                                 int seqStop = (i + 1) * numSeqs / ncpus;
410                                                 
411                                                 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
412
413                                                 vector<int> childSeqIndex(childTotal, 0);
414                                                 vector<double> childSingleTau(childTotal, 0);
415                                                 vector<double> childDist(numSeqs * numOTUs, 0);
416                                                 vector<int> otuIndex(childTotal, 0);
417                                                 
418                                                 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
419                                                 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
420                                                 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
421                                                 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
422
423                                                 int oldTotal = total;
424                                                 total += childTotal;
425                                                 singleTau.resize(total, 0);
426                                                 seqIndex.resize(total, 0);
427                                                 seqNumber.resize(total, 0);
428                                                 
429                                                 int childIndex = 0;
430                                                 
431                                                 for(int j=oldTotal;j<total;j++){
432                                                         int otuI = otuIndex[childIndex];
433                                                         int seqI = childSeqIndex[childIndex];
434                                                         
435                                                         singleTau[j] = childSingleTau[childIndex];
436                                                         
437                                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
438                                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
439                                                         nSeqsPerOTU[otuI]++;
440                                                         childIndex++;
441                                                 }
442                                                 
443                                                 int index = seqStart * numOTUs;
444                                                 for(int j=seqStart;j<seqStop;j++){
445                                                         for(int k=0;k<numOTUs;k++){
446                                                                 dist[index] = childDist[index];
447                                                                 index++;
448                                                         }
449                                                 }                                       
450                                         }
451                                         
452                                         iter++;
453                                         
454                                         m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');                  
455
456                                         if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
457                                                 int live = 1;
458                                                 for(int i=1;i<ncpus;i++){
459                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
460                                                 }
461                                         }
462                                         else{
463                                                 int live = 0;
464                                                 for(int i=1;i<ncpus;i++){
465                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
466                                                 }
467                                         }
468                                         
469                                 }       
470                                 
471                                 m->mothurOut("\nFinalizing...\n");
472                                 fill();
473                                 setOTUs();
474                                 vector<int> otuCounts(numOTUs, 0);
475                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
476                                 calcCentroidsDriver(0, numOTUs);
477                                 writeQualities(otuCounts);
478                                 writeSequences(otuCounts);
479                                 writeNames(otuCounts);
480                                 writeClusters(otuCounts);
481                                 writeGroups();
482                                 
483                                 remove(distFileName.c_str());
484                                 remove(namesFileName.c_str());
485                                 remove(listFileName.c_str());
486                                                                  
487                                 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');                 
488                         }
489                 }
490                 else{
491                         int abort = 1;
492                         bool live = 1;
493
494                         MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
495                         if(abort){      return 0;       }
496
497                         int numFiles;
498                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
499
500                         for(int i=0;i<numFiles;i++){
501                                 //Now into the pyrodist part
502                                 char fileName[1024];
503                                 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
504                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
505                                 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
506                                 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
507                                 
508                                 flowDataIntI.resize(numSeqs * numFlowCells);
509                                 flowDataPrI.resize(numSeqs * numFlowCells);
510                                 mapUniqueToSeq.resize(numSeqs);
511                                 mapSeqToUnique.resize(numSeqs);
512                                 lengths.resize(numSeqs);
513                                 jointLookUp.resize(NUMBINS * NUMBINS);
514
515                                 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
516                                 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
517                                 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
518                                 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
519                                 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
520                                 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
521                                 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
522
523                                 flowFileName = string(fileName);
524                                 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
525                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
526                                 
527                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
528
529                                 int done = 1;
530                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
531
532                                 //Now into the pyronoise part
533                                 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
534                                 
535                                 singleLookUp.resize(HOMOPS * NUMBINS);
536                                 uniqueFlowgrams.resize(numUniques * numFlowCells);
537                                 weight.resize(numOTUs);
538                                 centroids.resize(numOTUs);
539                                 change.resize(numOTUs);
540                                 dist.assign(numOTUs * numSeqs, 0);
541                                 nSeqsPerOTU.resize(numOTUs);
542                                 cumNumSeqs.resize(numOTUs);
543
544                                 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
545                                 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
546                                 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
547                                 
548                                 int startOTU = pid * numOTUs / ncpus;
549                                 int endOTU = (pid + 1) * numOTUs / ncpus;
550
551                                 int startSeq = pid * numSeqs / ncpus;
552                                 int endSeq = (pid + 1) * numSeqs /ncpus;
553                                 
554                                 int total;
555
556                                 while(live){
557
558                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
559                                         singleTau.assign(total, 0.0000);
560                                         seqNumber.assign(total, 0);
561                                         seqIndex.assign(total, 0);
562                                         
563                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
564                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
565                                         MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
566                                         MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
567                                         MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
568                                         MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
569                                         MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
570                                         
571                                         calcCentroidsDriver(startOTU, endOTU);
572
573                                         MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
574                                         MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
575
576                                         
577                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
578                                         MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
579                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
580
581                                         vector<int> otuIndex(total, 0);
582                                         calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
583                                         total = otuIndex.size();
584                                         
585                                         MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
586                                         MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
587                                         MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
588                                         MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
589                                         MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
590                                 
591                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
592                                 }
593                         }
594                 }               
595
596                 MPI_Barrier(MPI_COMM_WORLD);
597                 return 0;
598
599         }
600         catch(exception& e) {
601                 m->errorOut(e, "ShhherCommand", "execute");
602                 exit(1);
603         }
604 }
605
606 /**************************************************************************************************/
607
608 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
609         try{            
610                 ostringstream outStream;
611                 outStream.setf(ios::fixed, ios::floatfield);
612                 outStream.setf(ios::dec, ios::basefield);
613                 outStream.setf(ios::showpoint);
614                 outStream.precision(6);
615                 
616                 int begTime = time(NULL);
617                 double begClock = clock();
618                 
619                 for(int i=startSeq;i<stopSeq;i++){
620                         for(int j=0;j<i;j++){
621                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
622                                 
623                                 if(flowDistance < 1e-6){
624                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
625                                 }
626                                 else if(flowDistance <= cutoff){
627                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
628                                 }
629                         }
630                         if(i % 100 == 0){
631                                 m->mothurOut(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
632                         }
633                 }
634                 
635                 m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
636                 
637                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.dist";
638                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
639
640                 ofstream distFile(fDistFileName.c_str());
641                 distFile << outStream.str();            
642                 distFile.close();
643                 
644                 return fDistFileName;
645         }
646         catch(exception& e) {
647                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
648                 exit(1);
649         }
650 }
651
652 #else
653 //**********************************************************************************************************************
654
655 int ShhherCommand::execute(){
656         try {
657                 if (abort == true) { return 0; }
658                 
659                 getSingleLookUp();
660                 getJointLookUp();
661                                 
662                 vector<string> flowFileVector;
663                 if(flowFilesFileName != "not found"){
664                         string fName;
665                         
666                         ifstream flowFilesFile;
667                         m->openInputFile(flowFilesFileName, flowFilesFile);
668                         while(flowFilesFile){
669                                 flowFilesFile >> fName;
670                                 flowFileVector.push_back(fName);
671                                 m->gobble(flowFilesFile);
672                         }
673                 }
674                 else{
675                         flowFileVector.push_back(flowFileName);
676                 }
677                 int numFiles = flowFileVector.size();
678                 
679                 
680                 for(int i=0;i<numFiles;i++){
681                         flowFileName = flowFileVector[i];
682
683                         m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
684                         m->mothurOut("Reading flowgrams...\n");
685                         getFlowData();
686                         
687                         m->mothurOut("Identifying unique flowgrams...\n");
688                         getUniques();
689                         
690                         
691                         m->mothurOut("Calculating distances between flowgrams...\n");
692                         string distFileName = createDistFile(processors);
693                         string namesFileName = createNamesFile();
694                                 
695                         m->mothurOut("\nClustering flowgrams...\n");
696                         string listFileName = cluster(distFileName, namesFileName);
697                         getOTUData(listFileName);
698                         
699                         initPyroCluster();
700                         
701                         double maxDelta = 0;
702                         int iter = 0;
703                         
704                         double begClock = clock();
705                         unsigned long int begTime = time(NULL);
706
707                         
708                         m->mothurOut("\nDenoising flowgrams...\n");
709                         m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
710                         
711                         while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
712                                 
713                                 double cycClock = clock();
714                                 unsigned long int cycTime = time(NULL);
715                                 fill();
716                                 
717                                 calcCentroids();
718                                 
719                                 maxDelta = getNewWeights();
720                                 double nLL = getLikelihood();
721                                 checkCentroids();
722                                 
723                                 calcNewDistances();
724
725                                 iter++;
726                                 
727                                 m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
728
729                         }       
730                         
731                         m->mothurOut("\nFinalizing...\n");
732                         fill();
733                         setOTUs();
734                         
735                         vector<int> otuCounts(numOTUs, 0);
736                         for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
737                         
738                         calcCentroidsDriver(0, numOTUs);
739                         writeQualities(otuCounts);
740                         writeSequences(otuCounts);
741                         writeNames(otuCounts);
742                         writeClusters(otuCounts);
743                         writeGroups();
744                         
745                         remove(distFileName.c_str());
746                         remove(namesFileName.c_str());
747                         remove(listFileName.c_str());
748                         
749                         m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
750                 }
751                 return 0;
752         }
753         catch(exception& e) {
754                 m->errorOut(e, "ShhherCommand", "execute");
755                 exit(1);
756         }
757 }
758 #endif
759 /**************************************************************************************************/
760
761 void ShhherCommand::getFlowData(){
762         try{
763                 ifstream flowFile;
764                 m->openInputFile(flowFileName, flowFile);
765                 
766                 string seqName;
767                 seqNameVector.clear();
768                 lengths.clear();
769                 flowDataIntI.clear();
770                 nameMap.clear();
771                 
772                 
773                 int currentNumFlowCells;
774                 
775                 float intensity;
776                 
777                 flowFile >> numFlowCells;
778                 int index = 0;//pcluster
779                 while(!flowFile.eof()){
780                         flowFile >> seqName >> currentNumFlowCells;
781                         lengths.push_back(currentNumFlowCells);
782
783                         seqNameVector.push_back(seqName);
784                         nameMap[seqName] = index++;//pcluster
785
786                         for(int i=0;i<numFlowCells;i++){
787                                 flowFile >> intensity;
788                                 if(intensity > 9.99)    {       intensity = 9.99;       }
789                                 int intI = int(100 * intensity + 0.0001);
790                                 flowDataIntI.push_back(intI);
791                         }
792                         m->gobble(flowFile);
793                 }
794                 flowFile.close();
795                 
796                 numSeqs = seqNameVector.size();         
797                 
798                 for(int i=0;i<numSeqs;i++){
799                         int iNumFlowCells = i * numFlowCells;
800                         for(int j=lengths[i];j<numFlowCells;j++){
801                                 flowDataIntI[iNumFlowCells + j] = 0;
802                         }
803                 }
804                 
805         }
806         catch(exception& e) {
807                 m->errorOut(e, "ShhherCommand", "getFlowData");
808                 exit(1);
809         }
810 }
811
812 /**************************************************************************************************/
813
814 void ShhherCommand::getSingleLookUp(){
815         try{
816                 //      these are the -log probabilities that a signal corresponds to a particular homopolymer length
817                 singleLookUp.assign(HOMOPS * NUMBINS, 0);
818                 
819                 int index = 0;
820                 ifstream lookUpFile;
821                 m->openInputFile(lookupFileName, lookUpFile);
822                 
823                 for(int i=0;i<HOMOPS;i++){
824                         float logFracFreq;
825                         lookUpFile >> logFracFreq;
826                         
827                         for(int j=0;j<NUMBINS;j++)      {
828                                 lookUpFile >> singleLookUp[index];
829                                 index++;
830                         }
831                 }       
832                 lookUpFile.close();
833         }
834         catch(exception& e) {
835                 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
836                 exit(1);
837         }
838 }
839
840 /**************************************************************************************************/
841
842 void ShhherCommand::getJointLookUp(){
843         try{
844                 
845                 //      the most likely joint probability (-log) that two intenities have the same polymer length
846                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
847                 
848                 for(int i=0;i<NUMBINS;i++){
849                         for(int j=0;j<NUMBINS;j++){             
850                                 
851                                 double minSum = 100000000;
852                                 
853                                 for(int k=0;k<HOMOPS;k++){
854                                         double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
855                                         
856                                         if(sum < minSum)        {       minSum = sum;           }
857                                 }       
858                                 jointLookUp[i * NUMBINS + j] = minSum;
859                         }
860                 }
861         }
862         catch(exception& e) {
863                 m->errorOut(e, "ShhherCommand", "getJointLookUp");
864                 exit(1);
865         }
866 }
867
868 /**************************************************************************************************/
869
870 double ShhherCommand::getProbIntensity(int intIntensity){                          
871         try{
872
873                 double minNegLogProb = 100000000; 
874
875                 
876                 for(int i=0;i<HOMOPS;i++){//loop signal strength
877                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
878                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
879                 }
880                 
881                 return minNegLogProb;
882         }
883         catch(exception& e) {
884                 m->errorOut(e, "ShhherCommand", "getProbIntensity");
885                 exit(1);
886         }
887 }
888
889 /**************************************************************************************************/
890
891 void ShhherCommand::getUniques(){
892         try{
893                 
894                 
895                 numUniques = 0;
896                 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
897                 uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
898                 uniqueLengths.assign(numSeqs, 0);
899                 mapSeqToUnique.assign(numSeqs, -1);
900                 mapUniqueToSeq.assign(numSeqs, -1);
901                 
902                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
903                 
904                 for(int i=0;i<numSeqs;i++){
905                         int index = 0;
906                         
907                         vector<short> current(numFlowCells);
908                         for(int j=0;j<numFlowCells;j++){
909                                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
910                         }
911                                                 
912                         for(int j=0;j<numUniques;j++){
913                                 int offset = j * numFlowCells;
914                                 bool toEnd = 1;
915                                 
916                                 int shorterLength;
917                                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
918                                 else                                                            {       shorterLength = uniqueLengths[j];       }
919
920                                 for(int k=0;k<shorterLength;k++){
921                                         if(current[k] != uniqueFlowgrams[offset + k]){
922                                                 toEnd = 0;
923                                                 break;
924                                         }
925                                 }
926                                 
927                                 if(toEnd){
928                                         mapSeqToUnique[i] = j;
929                                         uniqueCount[j]++;
930                                         index = j;
931                                         if(lengths[i] > uniqueLengths[j])       {       uniqueLengths[j] = lengths[i];  }
932                                         break;
933                                 }
934                                 index++;
935                         }
936                         
937                         if(index == numUniques){
938                                 uniqueLengths[numUniques] = lengths[i];
939                                 uniqueCount[numUniques] = 1;
940                                 mapSeqToUnique[i] = numUniques;//anMap
941                                 mapUniqueToSeq[numUniques] = i;//anF
942                                 
943                                 for(int k=0;k<numFlowCells;k++){
944                                         uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
945                                         uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
946                                 }
947                                 
948                                 numUniques++;
949                         }
950                 }
951                 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
952                 uniqueLengths.resize(numUniques);       
953                 
954                 flowDataPrI.resize(numSeqs * numFlowCells, 0);
955                 for(int i=0;i<flowDataPrI.size();i++)   {       flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);             }
956         }
957         catch(exception& e) {
958                 m->errorOut(e, "ShhherCommand", "getUniques");
959                 exit(1);
960         }
961 }
962
963 /**************************************************************************************************/
964
965 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
966         try{
967                 int minLength = lengths[mapSeqToUnique[seqA]];
968                 if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
969                 
970                 int ANumFlowCells = seqA * numFlowCells;
971                 int BNumFlowCells = seqB * numFlowCells;
972                 
973                 float dist = 0;
974                 
975                 for(int i=0;i<minLength;i++){
976                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
977                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
978                         
979                         int flowBIntI = flowDataIntI[BNumFlowCells + i];
980                         float flowBPrI = flowDataPrI[BNumFlowCells + i];
981                         dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
982                 }
983                 
984                 dist /= (float) minLength;
985                 return dist;
986         }
987         catch(exception& e) {
988                 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
989                 exit(1);
990         }
991 }
992
993 /**************************************************************************************************/
994
995 void ShhherCommand::flowDistParentFork(string distFileName, int startSeq, int stopSeq){
996         try{            
997
998                 ostringstream outStream;
999                 outStream.setf(ios::fixed, ios::floatfield);
1000                 outStream.setf(ios::dec, ios::basefield);
1001                 outStream.setf(ios::showpoint);
1002                 outStream.precision(6);
1003                 
1004                 int begTime = time(NULL);
1005                 double begClock = clock();
1006
1007                 for(int i=startSeq;i<stopSeq;i++){
1008                         for(int j=0;j<i;j++){
1009                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
1010
1011                                 if(flowDistance < 1e-6){
1012                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
1013                                 }
1014                                 else if(flowDistance <= cutoff){
1015                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
1016                                 }
1017                         }
1018                         if(i % 100 == 0){
1019                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
1020                                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
1021                                 m->mothurOutEndLine();
1022                         }
1023                 }
1024                 m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
1025                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
1026                 m->mothurOutEndLine();
1027                 
1028                 ofstream distFile(distFileName.c_str());
1029                 distFile << outStream.str();            
1030                 distFile.close();
1031         }
1032         catch(exception& e) {
1033                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
1034                 exit(1);
1035         }
1036 }
1037
1038 /**************************************************************************************************/
1039
1040 string ShhherCommand::createDistFile(int processors){
1041         try{
1042                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.dist";
1043                                 
1044                 unsigned long int begTime = time(NULL);
1045                 double begClock = clock();
1046
1047                 vector<int> start;
1048                 vector<int> end;
1049                 
1050 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1051                 if(processors == 1)     {       flowDistParentFork(fDistFileName, 0, numUniques);               }
1052                 else{ //you have multiple processors
1053                         
1054                         if (numSeqs < processors){      processors = 1; }
1055                         
1056                         vector<int> start(processors, 0);
1057                         vector<int> end(processors, 0);
1058                         
1059                         for (int i = 0; i < processors; i++) {
1060                                 start[i] = int(sqrt(float(i)/float(processors)) * numUniques);
1061                                 end[i] = int(sqrt(float(i+1)/float(processors)) * numUniques);
1062                         }
1063                         
1064                         int process = 1;
1065                         vector<int> processIDs;
1066                         
1067                         //loop through and create all the processes you want
1068                         while (process != processors) {
1069                                 int pid = fork();
1070                                 
1071                                 if (pid > 0) {
1072                                         processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1073                                         process++;
1074                                 }else if (pid == 0){
1075                                         flowDistParentFork(fDistFileName + toString(getpid()) + ".temp", start[process], end[process]);
1076                                         exit(0);
1077                                 }else { 
1078                                         m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
1079                                         perror(" : ");
1080                                         for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
1081                                         exit(0);
1082                                 }
1083                         }
1084                         
1085                         //parent does its part
1086                         flowDistParentFork(fDistFileName, start[0], end[0]);
1087                         
1088                         //force parent to wait until all the processes are done
1089                         for (int i=0;i<processIDs.size();i++) { 
1090                                 int temp = processIDs[i];
1091                                 wait(&temp);
1092                         }
1093                         
1094                         //append and remove temp files
1095                         for (int i=0;i<processIDs.size();i++) { 
1096                                 m->appendFiles((fDistFileName + toString(processIDs[i]) + ".temp"), fDistFileName);
1097                                 remove((fDistFileName + toString(processIDs[i]) + ".temp").c_str());
1098                         }
1099                         
1100                 }
1101                 
1102 #else
1103                 flowDistParentFork(fDistFileName, 0, numUniques);
1104 #endif
1105
1106                 m->mothurOutEndLine();
1107                 
1108                 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
1109                 
1110
1111                 return fDistFileName;
1112         }
1113         catch(exception& e) {
1114                 m->errorOut(e, "ShhherCommand", "createDistFile");
1115                 exit(1);
1116         }
1117         
1118 }
1119
1120 /**************************************************************************************************/
1121
1122 string ShhherCommand::createNamesFile(){
1123         try{
1124                 
1125                 vector<string> duplicateNames(numUniques, "");
1126                 for(int i=0;i<numSeqs;i++){
1127                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
1128                 }
1129                 
1130                 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.names";
1131                 
1132                 ofstream nameFile;
1133                 m->openOutputFile(nameFileName, nameFile);
1134                 
1135                 for(int i=0;i<numUniques;i++){
1136 //                      nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
1137                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
1138                 }
1139                 
1140                 nameFile.close();
1141                 return  nameFileName;
1142         }
1143         catch(exception& e) {
1144                 m->errorOut(e, "ShhherCommand", "createNamesFile");
1145                 exit(1);
1146         }
1147 }
1148
1149 //**********************************************************************************************************************
1150
1151 string ShhherCommand::cluster(string distFileName, string namesFileName){
1152         try {
1153                 
1154                 
1155                 globaldata->setNameFile(namesFileName);
1156                 globaldata->setColumnFile(distFileName);
1157                 globaldata->setFormat("column");
1158                 
1159                 ReadMatrix* read = new ReadColumnMatrix(distFileName);  
1160                 read->setCutoff(cutoff);
1161                 
1162                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1163                 clusterNameMap->readMap();
1164                 read->read(clusterNameMap);
1165                 
1166                 ListVector* list = read->getListVector();
1167                 SparseMatrix* matrix = read->getMatrix();
1168                 
1169                 delete read; 
1170                 delete clusterNameMap; 
1171                                 
1172                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1173                 
1174                 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
1175                 string tag = cluster->getTag();
1176                 
1177                 double clusterCutoff = cutoff;
1178                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1179                         cluster->update(clusterCutoff);
1180                 }
1181                 
1182                 list->setLabel(toString(cutoff));
1183                 
1184                 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.list";
1185                 ofstream listFile;
1186                 m->openOutputFile(listFileName, listFile);
1187                 list->print(listFile);
1188                 listFile.close();
1189                 
1190                 delete matrix;  delete cluster; delete rabund; delete list;
1191         
1192                 return listFileName;
1193         }
1194         catch(exception& e) {
1195                 m->errorOut(e, "ShhherCommand", "cluster");
1196                 exit(1);        
1197         }               
1198 }
1199
1200 /**************************************************************************************************/
1201
1202 void ShhherCommand::getOTUData(string listFileName){
1203         try {
1204
1205                 ifstream listFile;
1206                 m->openInputFile(listFileName, listFile);
1207                 string label;
1208                 
1209                 listFile >> label >> numOTUs;
1210
1211                 otuData.assign(numSeqs, 0);
1212                 cumNumSeqs.assign(numOTUs, 0);
1213                 nSeqsPerOTU.assign(numOTUs, 0);
1214                 aaP.clear();aaP.resize(numOTUs);
1215                 
1216                 seqNumber.clear();
1217                 aaI.clear();
1218                 seqIndex.clear();
1219                 
1220                 string singleOTU = "";
1221                 
1222                 for(int i=0;i<numOTUs;i++){
1223
1224                         listFile >> singleOTU;
1225                         
1226                         istringstream otuString(singleOTU);
1227
1228                         while(otuString){
1229                                 
1230                                 string seqName = "";
1231                                 
1232                                 for(int j=0;j<singleOTU.length();j++){
1233                                         char letter = otuString.get();
1234                                         
1235                                         if(letter != ','){
1236                                                 seqName += letter;
1237                                         }
1238                                         else{
1239                                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
1240                                                 int index = nmIt->second;
1241                                                 
1242                                                 nameMap.erase(nmIt);
1243                                                 
1244                                                 otuData[index] = i;
1245                                                 nSeqsPerOTU[i]++;
1246                                                 aaP[i].push_back(index);
1247                                                 seqName = "";
1248                                         }
1249                                 }
1250                                 
1251                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
1252
1253                                 int index = nmIt->second;
1254                                 nameMap.erase(nmIt);
1255
1256                                 otuData[index] = i;
1257                                 nSeqsPerOTU[i]++;
1258                                 aaP[i].push_back(index);        
1259                                 
1260                                 otuString.get();
1261                         }
1262                         
1263                         sort(aaP[i].begin(), aaP[i].end());
1264                         for(int j=0;j<nSeqsPerOTU[i];j++){
1265                                 seqNumber.push_back(aaP[i][j]);
1266                         }
1267                         for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
1268                                 aaP[i].push_back(0);
1269                         }
1270                         
1271                         
1272                 }
1273                 
1274                 for(int i=1;i<numOTUs;i++){
1275                         cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
1276                 }
1277                 aaI = aaP;
1278                 seqIndex = seqNumber;
1279                 
1280                 listFile.close();       
1281                 
1282         }
1283         catch(exception& e) {
1284                 m->errorOut(e, "ShhherCommand", "getOTUData");
1285                 exit(1);        
1286         }               
1287 }
1288
1289 /**************************************************************************************************/
1290
1291 void ShhherCommand::initPyroCluster(){                          
1292         try{
1293                 dist.assign(numSeqs * numOTUs, 0);
1294                 change.assign(numOTUs, 1);
1295                 centroids.assign(numOTUs, -1);
1296                 weight.assign(numOTUs, 0);
1297                 singleTau.assign(numSeqs, 1.0);
1298                 
1299                 nSeqsBreaks.assign(processors+1, 0);
1300                 nOTUsBreaks.assign(processors+1, 0);
1301
1302                 nSeqsBreaks[0] = 0;
1303                 for(int i=0;i<processors;i++){
1304                         nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
1305                         nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
1306                 }
1307                 nSeqsBreaks[processors] = numSeqs;
1308                 nOTUsBreaks[processors] = numOTUs;
1309         }
1310         catch(exception& e) {
1311                 m->errorOut(e, "ShhherCommand", "initPyroCluster");
1312                 exit(1);        
1313         }
1314 }
1315
1316 /**************************************************************************************************/
1317
1318 void ShhherCommand::fill(){
1319         try {
1320                 int index = 0;
1321                 for(int i=0;i<numOTUs;i++){
1322                         cumNumSeqs[i] = index;
1323                         for(int j=0;j<nSeqsPerOTU[i];j++){
1324                                 seqNumber[index] = aaP[i][j];
1325                                 seqIndex[index] = aaI[i][j];
1326                                 
1327                                 index++;
1328                         }
1329                 }
1330         }
1331         catch(exception& e) {
1332                 m->errorOut(e, "ShhherCommand", "fill");
1333                 exit(1);        
1334         }               
1335 }
1336
1337 /**************************************************************************************************/
1338
1339 void ShhherCommand::calcCentroids(){                          
1340         try{
1341                 
1342 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1343
1344                 if(processors == 1)     {
1345                         calcCentroidsDriver(0, numOTUs);                
1346                 }
1347                 else{ //you have multiple processors
1348                         if (numOTUs < processors){      processors = 1; }
1349                         
1350                         int process = 1;
1351                         vector<int> processIDs;
1352                         
1353                         //loop through and create all the processes you want
1354                         while (process != processors) {
1355                                 int pid = vfork();
1356                                 
1357                                 if (pid > 0) {
1358                                         processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1359                                         process++;
1360                                 }else if (pid == 0){
1361                                         calcCentroidsDriver(nOTUsBreaks[process], nOTUsBreaks[process+1]);
1362                                         exit(0);
1363                                 }else { 
1364                                         m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
1365                                         perror(" : ");
1366                                         for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
1367                                         exit(0);
1368                                 }
1369                         }
1370                         
1371                         //parent does its part
1372                         calcCentroidsDriver(nOTUsBreaks[0], nOTUsBreaks[1]);
1373
1374                         //force parent to wait until all the processes are done
1375                         for (int i=0;i<processIDs.size();i++) { 
1376                                 int temp = processIDs[i];
1377                                 wait(&temp);
1378                         }
1379                 }
1380                 
1381 #else
1382                 calcCentroidsDriver(0, numOTUs);
1383 #endif          
1384         }
1385         catch(exception& e) {
1386                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1387                 exit(1);        
1388         }               
1389 }
1390
1391 /**************************************************************************************************/
1392
1393 void ShhherCommand::calcCentroidsDriver(int start, int finish){                          
1394         
1395         //this function gets the most likely homopolymer length at a flow position for a group of sequences
1396         //within an otu
1397         
1398         try{
1399                 
1400         
1401                 for(int i=start;i<finish;i++){
1402                         
1403                         double count = 0;
1404                         int position = 0;
1405                         int minFlowGram = 100000000;
1406                         double minFlowValue = 1e8;
1407                         change[i] = 0; //FALSE
1408                         
1409                         for(int j=0;j<nSeqsPerOTU[i];j++){
1410                                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1411                         }
1412
1413                         if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1414                                 vector<double> adF(nSeqsPerOTU[i]);
1415                                 vector<int> anL(nSeqsPerOTU[i]);
1416                                 
1417                                 for(int j=0;j<nSeqsPerOTU[i];j++){
1418                                         int index = cumNumSeqs[i] + j;
1419                                         int nI = seqIndex[index];
1420                                         int nIU = mapSeqToUnique[nI];
1421                                         
1422                                         int k;
1423                                         for(k=0;k<position;k++){
1424                                                 if(nIU == anL[k]){
1425                                                         break;
1426                                                 }
1427                                         }
1428                                         if(k == position){
1429                                                 anL[position] = nIU;
1430                                                 adF[position] = 0.0000;
1431                                                 position++;
1432                                         }                                               
1433                                 }
1434                                 
1435                                 for(int j=0;j<nSeqsPerOTU[i];j++){
1436                                         int index = cumNumSeqs[i] + j;
1437                                         int nI = seqIndex[index];
1438                                         
1439                                         double tauValue = singleTau[seqNumber[index]];
1440                                         
1441                                         for(int k=0;k<position;k++){
1442                                                 double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1443                                                 adF[k] += dist * tauValue;
1444                                         }
1445                                 }
1446                                 
1447                                 for(int j=0;j<position;j++){
1448                                         if(adF[j] < minFlowValue){
1449                                                 minFlowGram = j;
1450                                                 minFlowValue = adF[j];
1451                                         }
1452                                 }
1453                                 
1454                                 if(centroids[i] != anL[minFlowGram]){
1455                                         change[i] = 1;
1456                                         centroids[i] = anL[minFlowGram];
1457                                 }
1458                         }
1459                         else if(centroids[i] != -1){
1460                                 change[i] = 1;
1461                                 centroids[i] = -1;                      
1462                         }
1463                 }
1464         }
1465         catch(exception& e) {
1466                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1467                 exit(1);        
1468         }               
1469 }
1470
1471 /**************************************************************************************************/
1472
1473 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1474         try{
1475                 
1476                 int flowAValue = cent * numFlowCells;
1477                 int flowBValue = flow * numFlowCells;
1478                 
1479                 double dist = 0;
1480                 
1481                 for(int i=0;i<length;i++){
1482                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1483                         flowAValue++;
1484                         flowBValue++;
1485                 }
1486                 
1487                 return dist / (double)length;
1488         }
1489         catch(exception& e) {
1490                 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1491                 exit(1);        
1492         }               
1493 }
1494
1495 /**************************************************************************************************/
1496
1497 double ShhherCommand::getNewWeights(){
1498         try{
1499                 
1500                 double maxChange = 0;
1501                 
1502                 for(int i=0;i<numOTUs;i++){
1503                         
1504                         double difference = weight[i];
1505                         weight[i] = 0;
1506                         
1507                         for(int j=0;j<nSeqsPerOTU[i];j++){
1508                                 int index = cumNumSeqs[i] + j;
1509                                 double tauValue = singleTau[seqNumber[index]];
1510                                 weight[i] += tauValue;
1511                         }
1512                         
1513                         difference = fabs(weight[i] - difference);
1514                         if(difference > maxChange){     maxChange = difference; }
1515                 }
1516                 return maxChange;
1517         }
1518         catch(exception& e) {
1519                 m->errorOut(e, "ShhherCommand", "getNewWeights");
1520                 exit(1);        
1521         }               
1522 }
1523
1524 /**************************************************************************************************/
1525
1526 double ShhherCommand::getLikelihood(){
1527         
1528         try{
1529                 
1530                 vector<long double> P(numSeqs, 0);
1531                 int effNumOTUs = 0;
1532                 
1533                 for(int i=0;i<numOTUs;i++){
1534                         if(weight[i] > MIN_WEIGHT){
1535                                 effNumOTUs++;
1536                         }
1537                 }
1538                 
1539                 string hold;
1540                 for(int i=0;i<numOTUs;i++){
1541                         for(int j=0;j<nSeqsPerOTU[i];j++){
1542                                 int index = cumNumSeqs[i] + j;
1543                                 int nI = seqIndex[index];
1544                                 double singleDist = dist[seqNumber[index]];
1545                                 
1546                                 P[nI] += weight[i] * exp(-singleDist * sigma);
1547                         }
1548                 }
1549                 double nLL = 0.00;
1550                 for(int i=0;i<numSeqs;i++){
1551                         if(P[i] == 0){  P[i] = DBL_EPSILON;     }
1552
1553                         nLL += -log(P[i]);
1554                 }
1555                 
1556                 nLL = nLL -(double)numSeqs * log(sigma);
1557
1558                 return nLL; 
1559         }
1560         catch(exception& e) {
1561                 m->errorOut(e, "ShhherCommand", "getNewWeights");
1562                 exit(1);        
1563         }               
1564 }
1565
1566 /**************************************************************************************************/
1567
1568 void ShhherCommand::checkCentroids(){
1569         try{
1570                 vector<int> unique(numOTUs, 1);
1571                 
1572                 for(int i=0;i<numOTUs;i++){
1573                         if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1574                                 unique[i] = -1;
1575                         }
1576                 }
1577                 
1578                 for(int i=0;i<numOTUs;i++){
1579                         if(unique[i] == 1){
1580                                 for(int j=i+1;j<numOTUs;j++){
1581                                         if(unique[j] == 1){
1582                                                 
1583                                                 if(centroids[j] == centroids[i]){
1584                                                         unique[j] = 0;
1585                                                         centroids[j] = -1;
1586                                                         
1587                                                         weight[i] += weight[j];
1588                                                         weight[j] = 0.0;
1589                                                 }
1590                                         }
1591                                 }
1592                         }
1593                 }
1594         }
1595         catch(exception& e) {
1596                 m->errorOut(e, "ShhherCommand", "checkCentroids");
1597                 exit(1);        
1598         }               
1599 }
1600
1601 /**************************************************************************************************/
1602
1603 void ShhherCommand::calcNewDistances(){                          
1604         try{
1605                 
1606 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1607                 
1608                 if(processors == 1)     {
1609                         calcNewDistancesParent(0, numSeqs);             
1610                 }
1611                 else{ //you have multiple processors
1612                         if (numSeqs < processors){      processors = 1; }
1613                         
1614                         vector<vector<int> > child_otuIndex(processors);
1615                         vector<vector<int> > child_seqIndex(processors);
1616                         vector<vector<double> > child_singleTau(processors);                    
1617                         vector<int> totals(processors);
1618                         
1619                         int process = 1;
1620                         vector<int> processIDs;
1621
1622                         //loop through and create all the processes you want
1623                         while (process != processors) {
1624                                 int pid = vfork();
1625                                 
1626                                 if (pid > 0) {
1627                                         processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1628                                         process++;
1629                                 }else if (pid == 0){
1630                                         calcNewDistancesChild(nSeqsBreaks[process], nSeqsBreaks[process+1], child_otuIndex[process], child_seqIndex[process], child_singleTau[process]);
1631                                         totals[process] = child_otuIndex[process].size();
1632
1633                                         exit(0);
1634                                 }else { 
1635                                         m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
1636                                         perror(" : ");
1637                                         for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
1638                                         exit(0);
1639                                 }
1640                         }
1641                         
1642                         //parent does its part
1643                         calcNewDistancesParent(nSeqsBreaks[0], nSeqsBreaks[1]);
1644                         int total = seqIndex.size();
1645
1646                         //force parent to wait until all the processes are done
1647                         for (int i=0;i<processIDs.size();i++) { 
1648                                 int temp = processIDs[i];
1649                                 wait(&temp);
1650                         }
1651
1652                         for(int i=1;i<processors;i++){
1653                                 int oldTotal = total;
1654                                 total += totals[i];
1655
1656                                 singleTau.resize(total, 0);
1657                                 seqIndex.resize(total, 0);
1658                                 seqNumber.resize(total, 0);
1659                                 
1660                                 int childIndex = 0;
1661                                 
1662                                 for(int j=oldTotal;j<total;j++){
1663                                         int otuI = child_otuIndex[i][childIndex];
1664                                         int seqI = child_seqIndex[i][childIndex];
1665
1666                                         singleTau[j] = child_singleTau[i][childIndex];
1667                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
1668                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
1669                                         nSeqsPerOTU[otuI]++;
1670
1671                                         childIndex++;
1672                                 }
1673                         }
1674                 }
1675 #else
1676                 calcNewDistancesParent(0, numSeqs);             
1677 #endif          
1678         }
1679         catch(exception& e) {
1680                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
1681                 exit(1);        
1682         }               
1683 }
1684
1685 /**************************************************************************************************/
1686 #ifdef USE_MPI
1687 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1688         
1689         try{
1690                 vector<double> newTau(numOTUs,0);
1691                 vector<double> norms(numSeqs, 0);
1692                 otuIndex.clear();
1693                 seqIndex.clear();
1694                 singleTau.clear();
1695                 
1696                 
1697                 
1698                 for(int i=startSeq;i<stopSeq;i++){
1699                         double offset = 1e8;
1700                         int indexOffset = i * numOTUs;
1701                         
1702                         for(int j=0;j<numOTUs;j++){
1703                                 
1704                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1705                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1706                                 }
1707                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1708                                         offset = dist[indexOffset + j];
1709                                 }
1710                         }
1711                         
1712                         for(int j=0;j<numOTUs;j++){
1713                                 if(weight[j] > MIN_WEIGHT){
1714                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1715                                         norms[i] += newTau[j];
1716                                 }
1717                                 else{
1718                                         newTau[j] = 0.0;
1719                                 }
1720                         }
1721                         
1722                         for(int j=0;j<numOTUs;j++){
1723
1724                                 newTau[j] /= norms[i];
1725                                 
1726                                 if(newTau[j] > MIN_TAU){
1727                                         otuIndex.push_back(j);
1728                                         seqIndex.push_back(i);
1729                                         singleTau.push_back(newTau[j]);
1730                                 }
1731                         }
1732                         
1733                 }
1734         }
1735         catch(exception& e) {
1736                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1737                 exit(1);        
1738         }               
1739 }
1740 #endif
1741 /**************************************************************************************************/
1742
1743 void ShhherCommand::calcNewDistancesChild(int startSeq, int stopSeq, vector<int>& child_otuIndex, vector<int>& child_seqIndex, vector<double>& child_singleTau){
1744         
1745         try{
1746                 vector<double> newTau(numOTUs,0);
1747                 vector<double> norms(numSeqs, 0);
1748                 child_otuIndex.resize(0);
1749                 child_seqIndex.resize(0);
1750                 child_singleTau.resize(0);
1751                 
1752                 for(int i=startSeq;i<stopSeq;i++){
1753                         double offset = 1e8;
1754                         int indexOffset = i * numOTUs;
1755                         
1756                         
1757                         for(int j=0;j<numOTUs;j++){
1758                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1759                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1760                                 }
1761                                 
1762                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1763                                         offset = dist[indexOffset + j];
1764                                 }
1765                         }
1766                         
1767                         for(int j=0;j<numOTUs;j++){
1768                                 if(weight[j] > MIN_WEIGHT){
1769                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1770                                         norms[i] += newTau[j];
1771                                 }
1772                                 else{
1773                                         newTau[j] = 0.0;
1774                                 }
1775                         }
1776                         
1777                         for(int j=0;j<numOTUs;j++){
1778                                 newTau[j] /= norms[i];
1779                                 
1780                                 if(newTau[j] > MIN_TAU){
1781                                         child_otuIndex.push_back(j);
1782                                         child_seqIndex.push_back(i);
1783                                         child_singleTau.push_back(newTau[j]);
1784                                 }
1785                         }
1786                 }
1787         }
1788         catch(exception& e) {
1789                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChild");
1790                 exit(1);        
1791         }               
1792 }
1793
1794 /**************************************************************************************************/
1795
1796 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1797         
1798         try{
1799                 
1800                 int total = 0;
1801                 vector<double> newTau(numOTUs,0);
1802                 vector<double> norms(numSeqs, 0);
1803                 nSeqsPerOTU.assign(numOTUs, 0);
1804                 
1805                 for(int i=startSeq;i<stopSeq;i++){
1806                         int indexOffset = i * numOTUs;
1807                         
1808                         double offset = 1e8;
1809                         
1810                         for(int j=0;j<numOTUs;j++){
1811                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1812                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1813                                 }
1814                                 
1815                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1816                                         offset = dist[indexOffset + j];
1817                                 }
1818                         }
1819                         
1820                         for(int j=0;j<numOTUs;j++){
1821                                 if(weight[j] > MIN_WEIGHT){
1822                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1823                                         norms[i] += newTau[j];
1824                                 }
1825                                 else{
1826                                         newTau[j] = 0.0;
1827                                 }
1828                         }
1829                         
1830                         for(int j=0;j<numOTUs;j++){
1831                                 newTau[j] /= norms[i];
1832                         }
1833                         
1834                         for(int j=0;j<numOTUs;j++){
1835                                 if(newTau[j] > MIN_TAU){
1836                                         
1837                                         int oldTotal = total;
1838                                         
1839                                         total++;
1840                                         
1841                                         singleTau.resize(total, 0);
1842                                         seqNumber.resize(total, 0);
1843                                         seqIndex.resize(total, 0);
1844                                         
1845                                         singleTau[oldTotal] = newTau[j];
1846                                         
1847                                         aaP[j][nSeqsPerOTU[j]] = oldTotal;
1848                                         aaI[j][nSeqsPerOTU[j]] = i;
1849                                         nSeqsPerOTU[j]++;
1850                                 }
1851                         }
1852                 }
1853         }
1854         catch(exception& e) {
1855                 m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1856                 exit(1);        
1857         }               
1858 }
1859
1860 /**************************************************************************************************/
1861
1862 void ShhherCommand::setOTUs(){
1863         
1864         try {
1865                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1866                 
1867                 for(int i=0;i<numOTUs;i++){
1868                         for(int j=0;j<nSeqsPerOTU[i];j++){
1869                                 int index = cumNumSeqs[i] + j;
1870                                 double tauValue = singleTau[seqNumber[index]];
1871                                 int sIndex = seqIndex[index];
1872                                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
1873                         }
1874                 }
1875                 
1876                 for(int i=0;i<numSeqs;i++){
1877                         double maxTau = -1.0000;
1878                         int maxOTU = -1;
1879                         for(int j=0;j<numOTUs;j++){
1880                                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1881                                         maxTau = bigTauMatrix[i * numOTUs + j];
1882                                         maxOTU = j;
1883                                 }
1884                         }
1885                         
1886                         otuData[i] = maxOTU;
1887                 }
1888                 
1889                 nSeqsPerOTU.assign(numOTUs, 0);         
1890                 
1891                 for(int i=0;i<numSeqs;i++){
1892                         int index = otuData[i];
1893                         
1894                         singleTau[i] = 1.0000;
1895                         dist[i] = 0.0000;
1896                         
1897                         aaP[index][nSeqsPerOTU[index]] = i;
1898                         aaI[index][nSeqsPerOTU[index]] = i;
1899                         
1900                         nSeqsPerOTU[index]++;
1901                 }
1902                 fill(); 
1903         }
1904         catch(exception& e) {
1905                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
1906                 exit(1);        
1907         }               
1908 }
1909
1910 /**************************************************************************************************/
1911
1912 void ShhherCommand::writeQualities(vector<int> otuCounts){
1913         
1914         try {
1915                 string qualityFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.qual";
1916
1917                 ofstream qualityFile;
1918                 m->openOutputFile(qualityFileName, qualityFile);
1919
1920                 qualityFile.setf(ios::fixed, ios::floatfield);
1921                 qualityFile.setf(ios::showpoint);
1922                 qualityFile << setprecision(6);
1923                 
1924                 vector<vector<int> > qualities(numOTUs);
1925                 vector<double> pr(HOMOPS, 0);
1926                 
1927                 
1928                 for(int i=0;i<numOTUs;i++){
1929                         int index = 0;
1930                         int base = 0;
1931                         
1932                         if(nSeqsPerOTU[i] > 0){
1933                                 qualities[i].assign(1024, -1);
1934                                 
1935                                 while(index < numFlowCells){
1936                                         double maxPrValue = 1e8;
1937                                         short maxPrIndex = -1;
1938                                         double count = 0.0000;
1939                                         
1940                                         pr.assign(HOMOPS, 0);
1941                                         
1942                                         for(int j=0;j<nSeqsPerOTU[i];j++){
1943                                                 int lIndex = cumNumSeqs[i] + j;
1944                                                 double tauValue = singleTau[seqNumber[lIndex]];
1945                                                 int sequenceIndex = aaI[i][j];
1946                                                 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1947                                                 
1948                                                 count += tauValue;
1949                                                 
1950                                                 for(int s=0;s<HOMOPS;s++){
1951                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1952                                                 }
1953                                         }
1954                                         
1955                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1956                                         maxPrValue = pr[maxPrIndex];
1957                                         
1958                                         if(count > MIN_COUNT){
1959                                                 double U = 0.0000;
1960                                                 double norm = 0.0000;
1961                                                 
1962                                                 for(int s=0;s<HOMOPS;s++){
1963                                                         norm += exp(-(pr[s] - maxPrValue));
1964                                                 }
1965                                                 
1966                                                 for(int s=1;s<=maxPrIndex;s++){
1967                                                         int value = 0;
1968                                                         double temp = 0.0000;
1969                                                         
1970                                                         U += exp(-(pr[s-1]-maxPrValue))/norm;
1971                                                         
1972                                                         if(U>0.00){
1973                                                                 temp = log10(U);
1974                                                         }
1975                                                         else{
1976                                                                 temp = -10.1;
1977                                                         }
1978                                                         temp = floor(-10 * temp);
1979                                                         value = (int)floor(temp);
1980                                                         if(value > 100){        value = 100;    }
1981                                                         
1982                                                         qualities[i][base] = (int)value;
1983                                                         base++;
1984                                                 }
1985                                         }
1986                                         
1987                                         index++;
1988                                 }
1989                         }
1990                         
1991                         
1992                         if(otuCounts[i] > 0){
1993                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1994                                 
1995                                 int j=4;        //need to get past the first four bases
1996                                 while(qualities[i][j] != -1){
1997                                         qualityFile << qualities[i][j] << ' ';
1998                                         j++;
1999                                 }
2000                                 qualityFile << endl;
2001                         }
2002                 }
2003                 qualityFile.close();
2004                 
2005         }
2006         catch(exception& e) {
2007                 m->errorOut(e, "ShhherCommand", "writeQualities");
2008                 exit(1);        
2009         }               
2010 }
2011
2012 /**************************************************************************************************/
2013
2014 void ShhherCommand::writeSequences(vector<int> otuCounts){
2015         try {
2016                 string bases = "TACG";
2017                 
2018                 string fastaFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.fasta";
2019                 ofstream fastaFile;
2020                 m->openOutputFile(fastaFileName, fastaFile);
2021                 
2022                 vector<string> names(numOTUs, "");
2023                 
2024                 for(int i=0;i<numOTUs;i++){
2025                         int index = centroids[i];
2026                         
2027                         if(otuCounts[i] > 0){
2028                                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
2029                                 
2030                                 for(int j=8;j<numFlowCells;j++){
2031                                         
2032                                         char base = bases[j % 4];
2033                                         for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
2034                                                 fastaFile << base;
2035                                         }
2036                                 }
2037                                 fastaFile << endl;
2038                         }
2039                 }
2040                 fastaFile.close();
2041                 
2042                 if(compositeFASTAFileName != ""){
2043                         m->appendFiles(fastaFileName, compositeFASTAFileName);
2044                 }
2045         }
2046         catch(exception& e) {
2047                 m->errorOut(e, "ShhherCommand", "writeSequences");
2048                 exit(1);        
2049         }               
2050 }
2051
2052 /**************************************************************************************************/
2053
2054 void ShhherCommand::writeNames(vector<int> otuCounts){
2055         try {
2056                 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.final.names";
2057                 ofstream nameFile;
2058                 m->openOutputFile(nameFileName, nameFile);
2059                 
2060                 for(int i=0;i<numOTUs;i++){
2061                         if(otuCounts[i] > 0){
2062                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
2063                                 
2064                                 for(int j=1;j<nSeqsPerOTU[i];j++){
2065                                         nameFile << ',' << seqNameVector[aaI[i][j]];
2066                                 }
2067                                 
2068                                 nameFile << endl;
2069                         }
2070                 }
2071                 nameFile.close();
2072         }
2073         catch(exception& e) {
2074                 m->errorOut(e, "ShhherCommand", "writeNames");
2075                 exit(1);        
2076         }               
2077 }
2078
2079 /**************************************************************************************************/
2080
2081 void ShhherCommand::writeGroups(){
2082         try {
2083                 string fileRoot = flowFileName.substr(0,flowFileName.find_last_of('.'));
2084                 string groupFileName = fileRoot + ".pn.groups";
2085                 ofstream groupFile;
2086                 m->openOutputFile(groupFileName, groupFile);
2087                 
2088                 for(int i=0;i<numSeqs;i++){
2089                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
2090                 }
2091                 groupFile.close();
2092         }
2093         catch(exception& e) {
2094                 m->errorOut(e, "ShhherCommand", "writeGroups");
2095                 exit(1);        
2096         }               
2097 }
2098
2099 /**************************************************************************************************/
2100
2101 void ShhherCommand::writeClusters(vector<int> otuCounts){
2102         try {
2103                 string otuCountsFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.counts";
2104                 ofstream otuCountsFile;
2105                 m->openOutputFile(otuCountsFileName, otuCountsFile);
2106                 
2107                 string bases = "TACG";
2108                 
2109                 for(int i=0;i<numOTUs;i++){
2110                         //output the translated version of the centroid sequence for the otu
2111                         if(otuCounts[i] > 0){
2112                                 int index = centroids[i];
2113                                 
2114                                 otuCountsFile << "ideal\t";
2115                                 for(int j=8;j<numFlowCells;j++){
2116                                         char base = bases[j % 4];
2117                                         for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
2118                                                 otuCountsFile << base;
2119                                         }
2120                                 }
2121                                 otuCountsFile << endl;
2122                                 
2123                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2124                                         int sequence = aaI[i][j];
2125                                         otuCountsFile << seqNameVector[sequence] << '\t';
2126                                         
2127                                         for(int k=8;k<lengths[sequence];k++){
2128                                                 char base = bases[k % 4];
2129                                                 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
2130                                                 
2131                                                 for(int s=0;s<freq;s++){
2132                                                         otuCountsFile << base;
2133                                                 }
2134                                         }
2135                                         otuCountsFile << endl;
2136                                 }
2137                                 otuCountsFile << endl;
2138                         }
2139                 }
2140                 otuCountsFile.close();
2141         }
2142         catch(exception& e) {
2143                 m->errorOut(e, "ShhherCommand", "writeClusters");
2144                 exit(1);        
2145         }               
2146 }
2147
2148 //**********************************************************************************************************************