]> git.donarmstrong.com Git - mothur.git/blob - shhhercommand.cpp
removed read.dist, read.otu, read.tree and globaldata. added current to defaults...
[mothur.git] / shhhercommand.cpp
1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "shhhercommand.h"
11
12 #include "readcolumn.h"
13 #include "readmatrix.hpp"
14 #include "rabundvector.hpp"
15 #include "sabundvector.hpp"
16 #include "listvector.hpp"
17 #include "cluster.hpp"
18 #include "sparsematrix.hpp"
19 #include <cfloat>
20
21 //**********************************************************************************************************************
22
23 #define NUMBINS 1000
24 #define HOMOPS 10
25 #define MIN_COUNT 0.1
26 #define MIN_WEIGHT 0.1
27 #define MIN_TAU 0.0001
28 #define MIN_ITER 10
29 //**********************************************************************************************************************
30 vector<string> ShhherCommand::setParameters(){  
31         try {
32                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pflow);
33                 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pfile);
34                 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plookup);
35                 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff);
36                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
37                 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter);
38                 CommandParameter psigma("sigma", "Number", "", "60", "", "", "",false,false); parameters.push_back(psigma);
39                 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "",false,false); parameters.push_back(pmindelta);
40                 CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
41                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
42                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
43                 
44                 vector<string> myArray;
45                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
46                 return myArray;
47         }
48         catch(exception& e) {
49                 m->errorOut(e, "ShhherCommand", "setParameters");
50                 exit(1);
51         }
52 }
53 //**********************************************************************************************************************
54 string ShhherCommand::getHelpString(){  
55         try {
56                 string helpString = "";
57                 helpString += "The shhh.seqs command reads a file containing flowgrams and creates a file of corrected sequences.\n";
58                 return helpString;
59         }
60         catch(exception& e) {
61                 m->errorOut(e, "ShhherCommand", "getHelpString");
62                 exit(1);
63         }
64 }
65 //**********************************************************************************************************************
66
67 ShhherCommand::ShhherCommand(){ 
68         try {
69                 abort = true; calledHelp = true;
70                 setParameters();
71                 
72                 //initialize outputTypes
73                 vector<string> tempOutNames;
74                 outputTypes["pn.dist"] = tempOutNames;
75
76         }
77         catch(exception& e) {
78                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
79                 exit(1);
80         }
81 }
82
83 //**********************************************************************************************************************
84
85 ShhherCommand::ShhherCommand(string option) {
86         try {
87
88 #ifdef USE_MPI
89                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
90                 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
91
92                 if(pid == 0){
93 #endif
94                 
95                 
96                 abort = false; calledHelp = false;   
97                 
98                 
99                 //allow user to run help
100                 if(option == "help") { help(); abort = true; calledHelp = true; }
101                 
102                 else {
103                         vector<string> myArray = setParameters();
104                         
105                         OptionParser parser(option);
106                         map<string,string> parameters = parser.getParameters();
107                         
108                         ValidParameters validParameter;
109                         map<string,string>::iterator it;
110                         
111                         //check to make sure all parameters are valid for command
112                         for (it = parameters.begin(); it != parameters.end(); it++) { 
113                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
114                         }
115                         
116                         //initialize outputTypes
117                         vector<string> tempOutNames;
118                         outputTypes["pn.dist"] = tempOutNames;
119                         //                      outputTypes["fasta"] = tempOutNames;
120                         
121                         //if the user changes the input directory command factory will send this info to us in the output parameter 
122                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
123                         if (inputDir == "not found"){   inputDir = "";          }
124                         else {
125                                 string path;
126                                 it = parameters.find("flow");
127                                 //user has given a template file
128                                 if(it != parameters.end()){ 
129                                         path = m->hasPath(it->second);
130                                         //if the user has not given a path then, add inputdir. else leave path alone.
131                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
132                                 }
133                                 
134                                 it = parameters.find("lookup");
135                                 //user has given a template file
136                                 if(it != parameters.end()){ 
137                                         path = m->hasPath(it->second);
138                                         //if the user has not given a path then, add inputdir. else leave path alone.
139                                         if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
140                                 }
141
142                                 it = parameters.find("file");
143                                 //user has given a template file
144                                 if(it != parameters.end()){ 
145                                         path = m->hasPath(it->second);
146                                         //if the user has not given a path then, add inputdir. else leave path alone.
147                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
148                                 }
149                         }
150                         
151                         
152                         //check for required parameters
153                         flowFileName = validParameter.validFile(parameters, "flow", true);
154                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
155                         if (flowFileName == "not found" && flowFilesFileName == "not found") {
156                                 m->mothurOut("values for either flow or file must be provided for the shhh.seqs command.");
157                                 m->mothurOutEndLine();
158                                 abort = true; 
159                         }
160                         else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
161                         
162                         if(flowFileName != "not found"){        compositeFASTAFileName = "";    }
163                         else{
164                                 compositeFASTAFileName = flowFilesFileName.substr(0, flowFilesFileName.length()-10) + "pn.fasta";
165                                 ofstream temp;
166                                 m->openOutputFile(compositeFASTAFileName, temp);
167                                 temp.close();
168                         }
169                         
170                         //if the user changes the output directory command factory will send this info to us in the output parameter 
171                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
172                                 outputDir = ""; 
173                                 outputDir += m->hasPath(flowFileName); //if user entered a file with a path then preserve it    
174                         }
175                         
176                         
177                         //check for optional parameter and set defaults
178                         // ...at some point should added some additional type checking...
179                         string temp;
180                         temp = validParameter.validFile(parameters, "lookup", true);
181                         if (temp == "not found")        {       lookupFileName = "LookUp_Titanium.pat"; }
182                         else if(temp == "not open")     {       abort = true;                   } 
183                         else                                            {       lookupFileName = temp;  }
184                         
185                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
186                         m->setProcessors(temp);
187                         convert(temp, processors);
188
189                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
190                         convert(temp, cutoff); 
191                         
192                         temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
193                         convert(temp, minDelta); 
194
195                         temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
196                         convert(temp, maxIters); 
197
198                         temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
199                         convert(temp, sigma); 
200                         
201                         flowOrder = validParameter.validFile(parameters, "order", false);
202                         if (flowOrder == "not found"){ flowOrder = "TACG";              }
203                         else if(flowOrder.length() != 4){
204                                 m->mothurOut("The value of the order option must be four bases long\n");
205                         }
206                         
207                 }
208                         
209 #ifdef USE_MPI
210                 }                               
211 #endif
212                                 
213         }
214         catch(exception& e) {
215                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
216                 exit(1);
217         }
218 }
219 //**********************************************************************************************************************
220 #ifdef USE_MPI
221 int ShhherCommand::execute(){
222         try {
223                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
224                 
225                 int tag = 1976;
226                 MPI_Status status; 
227
228                 if(pid == 0){
229
230                         for(int i=1;i<ncpus;i++){
231                                 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
232                         }
233                         if(abort == 1){ return 0;       }
234
235                         processors = ncpus;
236                         
237                         m->mothurOut("\nGetting preliminary data...\n");
238                         getSingleLookUp();
239                         getJointLookUp();
240                         
241                         vector<string> flowFileVector;
242                         if(flowFilesFileName != "not found"){
243                                 string fName;
244
245                                 ifstream flowFilesFile;
246                                 m->openInputFile(flowFilesFileName, flowFilesFile);
247                                 while(flowFilesFile){
248                                         flowFilesFile >> fName;
249                                         flowFileVector.push_back(fName);
250                                         m->gobble(flowFilesFile);
251                                 }
252                         }
253                         else{
254                                 flowFileVector.push_back(flowFileName);
255                         }
256                         int numFiles = flowFileVector.size();
257
258                         for(int i=1;i<ncpus;i++){
259                                 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
260                         }
261                         
262                         for(int i=0;i<numFiles;i++){
263                                 double begClock = clock();
264                                 unsigned long int begTime = time(NULL);
265
266                                 flowFileName = flowFileVector[i];
267                                 
268                                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
269                                 m->mothurOut("Reading flowgrams...\n");
270                                 getFlowData();
271
272                                 m->mothurOut("Identifying unique flowgrams...\n");
273                                 getUniques();
274
275                                 m->mothurOut("Calculating distances between flowgrams...\n");
276                                 char fileName[1024];
277                                 strcpy(fileName, flowFileName.c_str());
278
279                                 for(int i=1;i<ncpus;i++){
280                                         MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
281
282                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
283                                         MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
284                                         MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
285                                         MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
286                                         MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
287                                         MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
288                                         MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
289                                         MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
290                                         MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
291                                         MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
292                                 }                               
293                                                         
294                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
295                                 
296                                 int done;
297                                 for(int i=1;i<ncpus;i++){
298                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
299                                         
300                                         m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
301                                         remove((distFileName + ".temp." + toString(i)).c_str());
302                                 }
303
304                                 string namesFileName = createNamesFile();
305                                 
306                                 m->mothurOut("\nClustering flowgrams...\n");
307                                 string listFileName = cluster(distFileName, namesFileName);
308
309                                 getOTUData(listFileName);
310                                 initPyroCluster();
311
312                                 for(int i=1;i<ncpus;i++){
313                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
314                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
315                                         MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
316                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
317                                 }
318                                 
319                                 
320                                 double maxDelta = 0;
321                                 int iter = 0;
322                                 
323                                 int numOTUsOnCPU = numOTUs / ncpus;
324                                 int numSeqsOnCPU = numSeqs / ncpus;
325                                 m->mothurOut("\nDenoising flowgrams...\n");
326                                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
327                                 
328                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
329
330                                         double cycClock = clock();
331                                         unsigned long int cycTime = time(NULL);
332                                         fill();
333
334                                         int total = singleTau.size();
335                                         for(int i=1;i<ncpus;i++){
336                                                 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
337                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
338                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
339                                                 
340                                                 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
341                                                 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
342                                                 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
343                                                 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
344                                                 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
345                                         }
346                                         
347                                         calcCentroidsDriver(0, numOTUsOnCPU);
348                                         
349                                         for(int i=1;i<ncpus;i++){
350                                                 int otuStart = i * numOTUs / ncpus;
351                                                 int otuStop = (i + 1) * numOTUs / ncpus;
352                                                 
353                                                 vector<int> tempCentroids(numOTUs, 0);
354                                                 vector<short> tempChange(numOTUs, 0);
355                                                 
356                                                 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
357                                                 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
358                                                 
359                                                 for(int j=otuStart;j<otuStop;j++){
360                                                         centroids[j] = tempCentroids[j];
361                                                         change[j] = tempChange[j];
362                                                 }
363                                         }
364                                                                         
365                                         maxDelta = getNewWeights();
366                                         double nLL = getLikelihood();
367                                         checkCentroids();
368                                         
369                                         for(int i=1;i<ncpus;i++){
370                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
371                                                 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
372                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
373                                         }
374                                         
375                                         calcNewDistancesParent(0, numSeqsOnCPU);
376
377                                         total = singleTau.size();
378
379                                         for(int i=1;i<ncpus;i++){
380                                                 int childTotal;
381                                                 int seqStart = i * numSeqs / ncpus;
382                                                 int seqStop = (i + 1) * numSeqs / ncpus;
383                                                 
384                                                 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
385
386                                                 vector<int> childSeqIndex(childTotal, 0);
387                                                 vector<double> childSingleTau(childTotal, 0);
388                                                 vector<double> childDist(numSeqs * numOTUs, 0);
389                                                 vector<int> otuIndex(childTotal, 0);
390                                                 
391                                                 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
392                                                 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
393                                                 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
394                                                 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
395
396                                                 int oldTotal = total;
397                                                 total += childTotal;
398                                                 singleTau.resize(total, 0);
399                                                 seqIndex.resize(total, 0);
400                                                 seqNumber.resize(total, 0);
401                                                 
402                                                 int childIndex = 0;
403                                                 
404                                                 for(int j=oldTotal;j<total;j++){
405                                                         int otuI = otuIndex[childIndex];
406                                                         int seqI = childSeqIndex[childIndex];
407                                                         
408                                                         singleTau[j] = childSingleTau[childIndex];
409                                                         
410                                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
411                                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
412                                                         nSeqsPerOTU[otuI]++;
413                                                         childIndex++;
414                                                 }
415                                                 
416                                                 int index = seqStart * numOTUs;
417                                                 for(int j=seqStart;j<seqStop;j++){
418                                                         for(int k=0;k<numOTUs;k++){
419                                                                 dist[index] = childDist[index];
420                                                                 index++;
421                                                         }
422                                                 }                                       
423                                         }
424                                         
425                                         iter++;
426                                         
427                                         m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');                  
428
429                                         if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
430                                                 int live = 1;
431                                                 for(int i=1;i<ncpus;i++){
432                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
433                                                 }
434                                         }
435                                         else{
436                                                 int live = 0;
437                                                 for(int i=1;i<ncpus;i++){
438                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
439                                                 }
440                                         }
441                                         
442                                 }       
443                                 
444                                 m->mothurOut("\nFinalizing...\n");
445                                 fill();
446                                 setOTUs();
447                                 vector<int> otuCounts(numOTUs, 0);
448                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
449                                 calcCentroidsDriver(0, numOTUs);
450                                 writeQualities(otuCounts);
451                                 writeSequences(otuCounts);
452                                 writeNames(otuCounts);
453                                 writeClusters(otuCounts);
454                                 writeGroups();
455                                 
456                                 remove(distFileName.c_str());
457                                 remove(namesFileName.c_str());
458                                 remove(listFileName.c_str());
459                                                                  
460                                 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');                 
461                         }
462                 }
463                 else{
464                         int abort = 1;
465
466                         MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
467                         if(abort){      return 0;       }
468
469                         int numFiles;
470                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
471
472                         for(int i=0;i<numFiles;i++){
473                                 //Now into the pyrodist part
474                                 bool live = 1;
475
476                                 char fileName[1024];
477                                 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
478                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
479                                 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
480                                 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
481                                 
482                                 flowDataIntI.resize(numSeqs * numFlowCells);
483                                 flowDataPrI.resize(numSeqs * numFlowCells);
484                                 mapUniqueToSeq.resize(numSeqs);
485                                 mapSeqToUnique.resize(numSeqs);
486                                 lengths.resize(numSeqs);
487                                 jointLookUp.resize(NUMBINS * NUMBINS);
488
489                                 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
490                                 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
491                                 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
492                                 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
493                                 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
494                                 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
495                                 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
496
497                                 flowFileName = string(fileName);
498                                 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
499                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
500                                 
501                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
502
503                                 int done = 1;
504                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
505
506                                 //Now into the pyronoise part
507                                 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
508                                 
509                                 singleLookUp.resize(HOMOPS * NUMBINS);
510                                 uniqueFlowgrams.resize(numUniques * numFlowCells);
511                                 weight.resize(numOTUs);
512                                 centroids.resize(numOTUs);
513                                 change.resize(numOTUs);
514                                 dist.assign(numOTUs * numSeqs, 0);
515                                 nSeqsPerOTU.resize(numOTUs);
516                                 cumNumSeqs.resize(numOTUs);
517
518                                 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
519                                 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
520                                 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
521                                 
522                                 int startOTU = pid * numOTUs / ncpus;
523                                 int endOTU = (pid + 1) * numOTUs / ncpus;
524
525                                 int startSeq = pid * numSeqs / ncpus;
526                                 int endSeq = (pid + 1) * numSeqs /ncpus;
527                                 
528                                 int total;
529
530                                 while(live){
531                                         
532                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
533                                         singleTau.assign(total, 0.0000);
534                                         seqNumber.assign(total, 0);
535                                         seqIndex.assign(total, 0);
536
537                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
538                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
539                                         MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
540                                         MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
541                                         MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
542                                         MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
543                                         MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
544
545                                         calcCentroidsDriver(startOTU, endOTU);
546
547                                         MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
548                                         MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
549
550                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
551                                         MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
552                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
553
554                                         vector<int> otuIndex(total, 0);
555                                         calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
556                                         total = otuIndex.size();
557                                         
558                                         MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
559                                         MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
560                                         MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
561                                         MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
562                                         MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
563                                 
564                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
565                                 }
566                         }
567                 }               
568                 MPI_Barrier(MPI_COMM_WORLD);
569
570                 return 0;
571
572         }
573         catch(exception& e) {
574                 m->errorOut(e, "ShhherCommand", "execute");
575                 exit(1);
576         }
577 }
578
579 /**************************************************************************************************/
580
581 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
582         try{            
583                 ostringstream outStream;
584                 outStream.setf(ios::fixed, ios::floatfield);
585                 outStream.setf(ios::dec, ios::basefield);
586                 outStream.setf(ios::showpoint);
587                 outStream.precision(6);
588                 
589                 int begTime = time(NULL);
590                 double begClock = clock();
591                 
592                 for(int i=startSeq;i<stopSeq;i++){
593                         for(int j=0;j<i;j++){
594                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
595                                 
596                                 if(flowDistance < 1e-6){
597                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
598                                 }
599                                 else if(flowDistance <= cutoff){
600                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
601                                 }
602                         }
603                         if(i % 100 == 0){
604                                 m->mothurOut(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
605                         }
606                 }
607                 
608                 m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
609                 
610                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.dist";
611                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
612
613                 ofstream distFile(fDistFileName.c_str());
614                 distFile << outStream.str();            
615                 distFile.close();
616                 
617                 return fDistFileName;
618         }
619         catch(exception& e) {
620                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
621                 exit(1);
622         }
623 }
624
625 #else
626 //**********************************************************************************************************************
627
628 int ShhherCommand::execute(){
629         try {
630                 if (abort == true) { return 0; }
631                 
632                 getSingleLookUp();
633                 getJointLookUp();
634                                 
635                 vector<string> flowFileVector;
636                 if(flowFilesFileName != "not found"){
637                         string fName;
638                         
639                         ifstream flowFilesFile;
640                         m->openInputFile(flowFilesFileName, flowFilesFile);
641                         while(flowFilesFile){
642                                 flowFilesFile >> fName;
643                                 flowFileVector.push_back(fName);
644                                 m->gobble(flowFilesFile);
645                         }
646                 }
647                 else{
648                         flowFileVector.push_back(flowFileName);
649                 }
650                 int numFiles = flowFileVector.size();
651                 
652                 
653                 for(int i=0;i<numFiles;i++){
654                         flowFileName = flowFileVector[i];
655
656                         m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
657                         m->mothurOut("Reading flowgrams...\n");
658                         getFlowData();
659                         
660                         m->mothurOut("Identifying unique flowgrams...\n");
661                         getUniques();
662                         
663                         
664                         m->mothurOut("Calculating distances between flowgrams...\n");
665                         string distFileName = createDistFile(processors);
666                         string namesFileName = createNamesFile();
667                                 
668                         m->mothurOut("\nClustering flowgrams...\n");
669                         string listFileName = cluster(distFileName, namesFileName);
670                         getOTUData(listFileName);
671                         
672                         initPyroCluster();
673                         
674                         double maxDelta = 0;
675                         int iter = 0;
676                         
677                         double begClock = clock();
678                         unsigned long int begTime = time(NULL);
679
680                         m->mothurOut("\nDenoising flowgrams...\n");
681                         m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
682                         
683                         while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
684                                 
685                                 double cycClock = clock();
686                                 unsigned long int cycTime = time(NULL);
687                                 fill();
688                                 
689                                 calcCentroids();
690                                 
691                                 maxDelta = getNewWeights();
692                                 double nLL = getLikelihood();
693                                 checkCentroids();
694                                 
695                                 calcNewDistances();
696
697                                 iter++;
698                                 
699                                 m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
700
701                         }       
702                         
703                         m->mothurOut("\nFinalizing...\n");
704                         fill();
705                         setOTUs();
706                         
707                         vector<int> otuCounts(numOTUs, 0);
708                         for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
709                         
710                         calcCentroidsDriver(0, numOTUs);
711                         writeQualities(otuCounts);
712                         writeSequences(otuCounts);
713                         writeNames(otuCounts);
714                         writeClusters(otuCounts);
715                         writeGroups();
716                         
717                         remove(distFileName.c_str());
718                         remove(namesFileName.c_str());
719                         remove(listFileName.c_str());
720                         
721                         m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
722                 }
723                 return 0;
724         }
725         catch(exception& e) {
726                 m->errorOut(e, "ShhherCommand", "execute");
727                 exit(1);
728         }
729 }
730 #endif
731 /**************************************************************************************************/
732
733 void ShhherCommand::getFlowData(){
734         try{
735                 ifstream flowFile;
736                 m->openInputFile(flowFileName, flowFile);
737                 
738                 string seqName;
739                 seqNameVector.clear();
740                 lengths.clear();
741                 flowDataIntI.clear();
742                 nameMap.clear();
743                 
744                 
745                 int currentNumFlowCells;
746                 
747                 float intensity;
748                 
749                 flowFile >> numFlowCells;
750                 int index = 0;//pcluster
751                 while(!flowFile.eof()){
752                         flowFile >> seqName >> currentNumFlowCells;
753                         lengths.push_back(currentNumFlowCells);
754
755                         seqNameVector.push_back(seqName);
756                         nameMap[seqName] = index++;//pcluster
757
758                         for(int i=0;i<numFlowCells;i++){
759                                 flowFile >> intensity;
760                                 if(intensity > 9.99)    {       intensity = 9.99;       }
761                                 int intI = int(100 * intensity + 0.0001);
762                                 flowDataIntI.push_back(intI);
763                         }
764                         m->gobble(flowFile);
765                 }
766                 flowFile.close();
767                 
768                 numSeqs = seqNameVector.size();         
769                 
770                 for(int i=0;i<numSeqs;i++){
771                         int iNumFlowCells = i * numFlowCells;
772                         for(int j=lengths[i];j<numFlowCells;j++){
773                                 flowDataIntI[iNumFlowCells + j] = 0;
774                         }
775                 }
776                 
777         }
778         catch(exception& e) {
779                 m->errorOut(e, "ShhherCommand", "getFlowData");
780                 exit(1);
781         }
782 }
783
784 /**************************************************************************************************/
785
786 void ShhherCommand::getSingleLookUp(){
787         try{
788                 //      these are the -log probabilities that a signal corresponds to a particular homopolymer length
789                 singleLookUp.assign(HOMOPS * NUMBINS, 0);
790                 
791                 int index = 0;
792                 ifstream lookUpFile;
793                 m->openInputFile(lookupFileName, lookUpFile);
794                 
795                 for(int i=0;i<HOMOPS;i++){
796                         float logFracFreq;
797                         lookUpFile >> logFracFreq;
798                         
799                         for(int j=0;j<NUMBINS;j++)      {
800                                 lookUpFile >> singleLookUp[index];
801                                 index++;
802                         }
803                 }       
804                 lookUpFile.close();
805         }
806         catch(exception& e) {
807                 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
808                 exit(1);
809         }
810 }
811
812 /**************************************************************************************************/
813
814 void ShhherCommand::getJointLookUp(){
815         try{
816                 
817                 //      the most likely joint probability (-log) that two intenities have the same polymer length
818                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
819                 
820                 for(int i=0;i<NUMBINS;i++){
821                         for(int j=0;j<NUMBINS;j++){             
822                                 
823                                 double minSum = 100000000;
824                                 
825                                 for(int k=0;k<HOMOPS;k++){
826                                         double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
827                                         
828                                         if(sum < minSum)        {       minSum = sum;           }
829                                 }       
830                                 jointLookUp[i * NUMBINS + j] = minSum;
831                         }
832                 }
833         }
834         catch(exception& e) {
835                 m->errorOut(e, "ShhherCommand", "getJointLookUp");
836                 exit(1);
837         }
838 }
839
840 /**************************************************************************************************/
841
842 double ShhherCommand::getProbIntensity(int intIntensity){                          
843         try{
844
845                 double minNegLogProb = 100000000; 
846
847                 
848                 for(int i=0;i<HOMOPS;i++){//loop signal strength
849                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
850                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
851                 }
852                 
853                 return minNegLogProb;
854         }
855         catch(exception& e) {
856                 m->errorOut(e, "ShhherCommand", "getProbIntensity");
857                 exit(1);
858         }
859 }
860
861 /**************************************************************************************************/
862
863 void ShhherCommand::getUniques(){
864         try{
865                 
866                 
867                 numUniques = 0;
868                 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
869                 uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
870                 uniqueLengths.assign(numSeqs, 0);
871                 mapSeqToUnique.assign(numSeqs, -1);
872                 mapUniqueToSeq.assign(numSeqs, -1);
873                 
874                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
875                 
876                 for(int i=0;i<numSeqs;i++){
877                         int index = 0;
878                         
879                         vector<short> current(numFlowCells);
880                         for(int j=0;j<numFlowCells;j++){
881                                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
882                         }
883                                                 
884                         for(int j=0;j<numUniques;j++){
885                                 int offset = j * numFlowCells;
886                                 bool toEnd = 1;
887                                 
888                                 int shorterLength;
889                                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
890                                 else                                                            {       shorterLength = uniqueLengths[j];       }
891
892                                 for(int k=0;k<shorterLength;k++){
893                                         if(current[k] != uniqueFlowgrams[offset + k]){
894                                                 toEnd = 0;
895                                                 break;
896                                         }
897                                 }
898                                 
899                                 if(toEnd){
900                                         mapSeqToUnique[i] = j;
901                                         uniqueCount[j]++;
902                                         index = j;
903                                         if(lengths[i] > uniqueLengths[j])       {       uniqueLengths[j] = lengths[i];  }
904                                         break;
905                                 }
906                                 index++;
907                         }
908                         
909                         if(index == numUniques){
910                                 uniqueLengths[numUniques] = lengths[i];
911                                 uniqueCount[numUniques] = 1;
912                                 mapSeqToUnique[i] = numUniques;//anMap
913                                 mapUniqueToSeq[numUniques] = i;//anF
914                                 
915                                 for(int k=0;k<numFlowCells;k++){
916                                         uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
917                                         uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
918                                 }
919                                 
920                                 numUniques++;
921                         }
922                 }
923                 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
924                 uniqueLengths.resize(numUniques);       
925                 
926                 flowDataPrI.resize(numSeqs * numFlowCells, 0);
927                 for(int i=0;i<flowDataPrI.size();i++)   {       flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);             }
928         }
929         catch(exception& e) {
930                 m->errorOut(e, "ShhherCommand", "getUniques");
931                 exit(1);
932         }
933 }
934
935 /**************************************************************************************************/
936
937 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
938         try{
939                 int minLength = lengths[mapSeqToUnique[seqA]];
940                 if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
941                 
942                 int ANumFlowCells = seqA * numFlowCells;
943                 int BNumFlowCells = seqB * numFlowCells;
944                 
945                 float dist = 0;
946                 
947                 for(int i=0;i<minLength;i++){
948                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
949                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
950                         
951                         int flowBIntI = flowDataIntI[BNumFlowCells + i];
952                         float flowBPrI = flowDataPrI[BNumFlowCells + i];
953                         dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
954                 }
955                 
956                 dist /= (float) minLength;
957                 return dist;
958         }
959         catch(exception& e) {
960                 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
961                 exit(1);
962         }
963 }
964
965 /**************************************************************************************************/
966
967 void ShhherCommand::flowDistParentFork(string distFileName, int startSeq, int stopSeq){
968         try{            
969
970                 ostringstream outStream;
971                 outStream.setf(ios::fixed, ios::floatfield);
972                 outStream.setf(ios::dec, ios::basefield);
973                 outStream.setf(ios::showpoint);
974                 outStream.precision(6);
975                 
976                 int begTime = time(NULL);
977                 double begClock = clock();
978
979                 for(int i=startSeq;i<stopSeq;i++){
980                         for(int j=0;j<i;j++){
981                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
982
983                                 if(flowDistance < 1e-6){
984                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
985                                 }
986                                 else if(flowDistance <= cutoff){
987                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
988                                 }
989                         }
990                         if(i % 100 == 0){
991                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
992                                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
993                                 m->mothurOutEndLine();
994                         }
995                 }
996                 m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
997                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
998                 m->mothurOutEndLine();
999                 
1000                 ofstream distFile(distFileName.c_str());
1001                 distFile << outStream.str();            
1002                 distFile.close();
1003         }
1004         catch(exception& e) {
1005                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
1006                 exit(1);
1007         }
1008 }
1009
1010 /**************************************************************************************************/
1011
1012 string ShhherCommand::createDistFile(int processors){
1013         try{
1014                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.dist";
1015                                 
1016                 unsigned long int begTime = time(NULL);
1017                 double begClock = clock();
1018
1019                 vector<int> start;
1020                 vector<int> end;
1021                 
1022 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1023                 if(processors == 1)     {       flowDistParentFork(fDistFileName, 0, numUniques);               }
1024                 else{ //you have multiple processors
1025                         
1026                         if (numSeqs < processors){      processors = 1; }
1027                         
1028                         vector<int> start(processors, 0);
1029                         vector<int> end(processors, 0);
1030                         
1031                         for (int i = 0; i < processors; i++) {
1032                                 start[i] = int(sqrt(float(i)/float(processors)) * numUniques);
1033                                 end[i] = int(sqrt(float(i+1)/float(processors)) * numUniques);
1034                         }
1035                         
1036                         int process = 1;
1037                         vector<int> processIDs;
1038                         
1039                         //loop through and create all the processes you want
1040                         while (process != processors) {
1041                                 int pid = fork();
1042                                 
1043                                 if (pid > 0) {
1044                                         processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1045                                         process++;
1046                                 }else if (pid == 0){
1047                                         flowDistParentFork(fDistFileName + toString(getpid()) + ".temp", start[process], end[process]);
1048                                         exit(0);
1049                                 }else { 
1050                                         m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
1051                                         perror(" : ");
1052                                         for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
1053                                         exit(0);
1054                                 }
1055                         }
1056                         
1057                         //parent does its part
1058                         flowDistParentFork(fDistFileName, start[0], end[0]);
1059                         
1060                         //force parent to wait until all the processes are done
1061                         for (int i=0;i<processIDs.size();i++) { 
1062                                 int temp = processIDs[i];
1063                                 wait(&temp);
1064                         }
1065                         
1066                         //append and remove temp files
1067                         for (int i=0;i<processIDs.size();i++) { 
1068                                 m->appendFiles((fDistFileName + toString(processIDs[i]) + ".temp"), fDistFileName);
1069                                 remove((fDistFileName + toString(processIDs[i]) + ".temp").c_str());
1070                         }
1071                         
1072                 }
1073                 
1074 #else
1075                 flowDistParentFork(fDistFileName, 0, numUniques);
1076 #endif
1077
1078                 m->mothurOutEndLine();
1079                 
1080                 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
1081                 
1082
1083                 return fDistFileName;
1084         }
1085         catch(exception& e) {
1086                 m->errorOut(e, "ShhherCommand", "createDistFile");
1087                 exit(1);
1088         }
1089         
1090 }
1091
1092 /**************************************************************************************************/
1093
1094 string ShhherCommand::createNamesFile(){
1095         try{
1096                 
1097                 vector<string> duplicateNames(numUniques, "");
1098                 for(int i=0;i<numSeqs;i++){
1099                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
1100                 }
1101                 
1102                 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.names";
1103                 
1104                 ofstream nameFile;
1105                 m->openOutputFile(nameFileName, nameFile);
1106                 
1107                 for(int i=0;i<numUniques;i++){
1108 //                      nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
1109                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
1110                 }
1111                 
1112                 nameFile.close();
1113                 return  nameFileName;
1114         }
1115         catch(exception& e) {
1116                 m->errorOut(e, "ShhherCommand", "createNamesFile");
1117                 exit(1);
1118         }
1119 }
1120
1121 //**********************************************************************************************************************
1122
1123 string ShhherCommand::cluster(string distFileName, string namesFileName){
1124         try {
1125                 
1126                 ReadMatrix* read = new ReadColumnMatrix(distFileName);  
1127                 read->setCutoff(cutoff);
1128                 
1129                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1130                 clusterNameMap->readMap();
1131                 read->read(clusterNameMap);
1132                 
1133                 ListVector* list = read->getListVector();
1134                 SparseMatrix* matrix = read->getMatrix();
1135                 
1136                 delete read; 
1137                 delete clusterNameMap; 
1138                                 
1139                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1140                 
1141                 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
1142                 string tag = cluster->getTag();
1143                 
1144                 double clusterCutoff = cutoff;
1145                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1146                         cluster->update(clusterCutoff);
1147                 }
1148                 
1149                 list->setLabel(toString(cutoff));
1150                 
1151                 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.list";
1152                 ofstream listFile;
1153                 m->openOutputFile(listFileName, listFile);
1154                 list->print(listFile);
1155                 listFile.close();
1156                 
1157                 delete matrix;  delete cluster; delete rabund; delete list;
1158         
1159                 return listFileName;
1160         }
1161         catch(exception& e) {
1162                 m->errorOut(e, "ShhherCommand", "cluster");
1163                 exit(1);        
1164         }               
1165 }
1166
1167 /**************************************************************************************************/
1168
1169 void ShhherCommand::getOTUData(string listFileName){
1170         try {
1171
1172                 ifstream listFile;
1173                 m->openInputFile(listFileName, listFile);
1174                 string label;
1175                 
1176                 listFile >> label >> numOTUs;
1177
1178                 otuData.assign(numSeqs, 0);
1179                 cumNumSeqs.assign(numOTUs, 0);
1180                 nSeqsPerOTU.assign(numOTUs, 0);
1181                 aaP.clear();aaP.resize(numOTUs);
1182                 
1183                 seqNumber.clear();
1184                 aaI.clear();
1185                 seqIndex.clear();
1186                 
1187                 string singleOTU = "";
1188                 
1189                 for(int i=0;i<numOTUs;i++){
1190
1191                         listFile >> singleOTU;
1192                         
1193                         istringstream otuString(singleOTU);
1194
1195                         while(otuString){
1196                                 
1197                                 string seqName = "";
1198                                 
1199                                 for(int j=0;j<singleOTU.length();j++){
1200                                         char letter = otuString.get();
1201                                         
1202                                         if(letter != ','){
1203                                                 seqName += letter;
1204                                         }
1205                                         else{
1206                                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
1207                                                 int index = nmIt->second;
1208                                                 
1209                                                 nameMap.erase(nmIt);
1210                                                 
1211                                                 otuData[index] = i;
1212                                                 nSeqsPerOTU[i]++;
1213                                                 aaP[i].push_back(index);
1214                                                 seqName = "";
1215                                         }
1216                                 }
1217                                 
1218                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
1219
1220                                 int index = nmIt->second;
1221                                 nameMap.erase(nmIt);
1222
1223                                 otuData[index] = i;
1224                                 nSeqsPerOTU[i]++;
1225                                 aaP[i].push_back(index);        
1226                                 
1227                                 otuString.get();
1228                         }
1229                         
1230                         sort(aaP[i].begin(), aaP[i].end());
1231                         for(int j=0;j<nSeqsPerOTU[i];j++){
1232                                 seqNumber.push_back(aaP[i][j]);
1233                         }
1234                         for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
1235                                 aaP[i].push_back(0);
1236                         }
1237                         
1238                         
1239                 }
1240                 
1241                 for(int i=1;i<numOTUs;i++){
1242                         cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
1243                 }
1244                 aaI = aaP;
1245                 seqIndex = seqNumber;
1246                 
1247                 listFile.close();       
1248                 
1249         }
1250         catch(exception& e) {
1251                 m->errorOut(e, "ShhherCommand", "getOTUData");
1252                 exit(1);        
1253         }               
1254 }
1255
1256 /**************************************************************************************************/
1257
1258 void ShhherCommand::initPyroCluster(){                          
1259         try{
1260                 dist.assign(numSeqs * numOTUs, 0);
1261                 change.assign(numOTUs, 1);
1262                 centroids.assign(numOTUs, -1);
1263                 weight.assign(numOTUs, 0);
1264                 singleTau.assign(numSeqs, 1.0);
1265                 
1266                 nSeqsBreaks.assign(processors+1, 0);
1267                 nOTUsBreaks.assign(processors+1, 0);
1268
1269                 nSeqsBreaks[0] = 0;
1270                 for(int i=0;i<processors;i++){
1271                         nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
1272                         nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
1273                 }
1274                 nSeqsBreaks[processors] = numSeqs;
1275                 nOTUsBreaks[processors] = numOTUs;
1276         }
1277         catch(exception& e) {
1278                 m->errorOut(e, "ShhherCommand", "initPyroCluster");
1279                 exit(1);        
1280         }
1281 }
1282
1283 /**************************************************************************************************/
1284
1285 void ShhherCommand::fill(){
1286         try {
1287                 int index = 0;
1288                 for(int i=0;i<numOTUs;i++){
1289                         cumNumSeqs[i] = index;
1290                         for(int j=0;j<nSeqsPerOTU[i];j++){
1291                                 seqNumber[index] = aaP[i][j];
1292                                 seqIndex[index] = aaI[i][j];
1293                                 
1294                                 index++;
1295                         }
1296                 }
1297         }
1298         catch(exception& e) {
1299                 m->errorOut(e, "ShhherCommand", "fill");
1300                 exit(1);        
1301         }               
1302 }
1303
1304 /**************************************************************************************************/
1305
1306 void ShhherCommand::calcCentroids(){                          
1307         try{
1308                 
1309 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1310
1311                 if(processors == 1)     {
1312                         calcCentroidsDriver(0, numOTUs);                
1313                 }
1314                 else{ //you have multiple processors
1315                         if (numOTUs < processors){      processors = 1; }
1316                         
1317                         int process = 1;
1318                         vector<int> processIDs;
1319                         
1320                         //loop through and create all the processes you want
1321                         while (process != processors) {
1322                                 int pid = vfork();
1323                                 
1324                                 if (pid > 0) {
1325                                         processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1326                                         process++;
1327                                 }else if (pid == 0){
1328                                         calcCentroidsDriver(nOTUsBreaks[process], nOTUsBreaks[process+1]);
1329                                         exit(0);
1330                                 }else { 
1331                                         m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
1332                                         perror(" : ");
1333                                         for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
1334                                         exit(0);
1335                                 }
1336                         }
1337                         
1338                         //parent does its part
1339                         calcCentroidsDriver(nOTUsBreaks[0], nOTUsBreaks[1]);
1340
1341                         //force parent to wait until all the processes are done
1342                         for (int i=0;i<processIDs.size();i++) { 
1343                                 int temp = processIDs[i];
1344                                 wait(&temp);
1345                         }
1346                 }
1347                 
1348 #else
1349                 calcCentroidsDriver(0, numOTUs);
1350 #endif          
1351         }
1352         catch(exception& e) {
1353                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1354                 exit(1);        
1355         }               
1356 }
1357
1358 /**************************************************************************************************/
1359
1360 void ShhherCommand::calcCentroidsDriver(int start, int finish){                          
1361         
1362         //this function gets the most likely homopolymer length at a flow position for a group of sequences
1363         //within an otu
1364         
1365         try{
1366                 
1367         
1368                 for(int i=start;i<finish;i++){
1369                         
1370                         double count = 0;
1371                         int position = 0;
1372                         int minFlowGram = 100000000;
1373                         double minFlowValue = 1e8;
1374                         change[i] = 0; //FALSE
1375                         
1376                         for(int j=0;j<nSeqsPerOTU[i];j++){
1377                                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1378                         }
1379
1380                         if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1381                                 vector<double> adF(nSeqsPerOTU[i]);
1382                                 vector<int> anL(nSeqsPerOTU[i]);
1383                                 
1384                                 for(int j=0;j<nSeqsPerOTU[i];j++){
1385                                         int index = cumNumSeqs[i] + j;
1386                                         int nI = seqIndex[index];
1387                                         int nIU = mapSeqToUnique[nI];
1388                                         
1389                                         int k;
1390                                         for(k=0;k<position;k++){
1391                                                 if(nIU == anL[k]){
1392                                                         break;
1393                                                 }
1394                                         }
1395                                         if(k == position){
1396                                                 anL[position] = nIU;
1397                                                 adF[position] = 0.0000;
1398                                                 position++;
1399                                         }                                               
1400                                 }
1401                                 
1402                                 for(int j=0;j<nSeqsPerOTU[i];j++){
1403                                         int index = cumNumSeqs[i] + j;
1404                                         int nI = seqIndex[index];
1405                                         
1406                                         double tauValue = singleTau[seqNumber[index]];
1407                                         
1408                                         for(int k=0;k<position;k++){
1409                                                 double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1410                                                 adF[k] += dist * tauValue;
1411                                         }
1412                                 }
1413                                 
1414                                 for(int j=0;j<position;j++){
1415                                         if(adF[j] < minFlowValue){
1416                                                 minFlowGram = j;
1417                                                 minFlowValue = adF[j];
1418                                         }
1419                                 }
1420                                 
1421                                 if(centroids[i] != anL[minFlowGram]){
1422                                         change[i] = 1;
1423                                         centroids[i] = anL[minFlowGram];
1424                                 }
1425                         }
1426                         else if(centroids[i] != -1){
1427                                 change[i] = 1;
1428                                 centroids[i] = -1;                      
1429                         }
1430                 }
1431         }
1432         catch(exception& e) {
1433                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1434                 exit(1);        
1435         }               
1436 }
1437
1438 /**************************************************************************************************/
1439
1440 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1441         try{
1442                 
1443                 int flowAValue = cent * numFlowCells;
1444                 int flowBValue = flow * numFlowCells;
1445                 
1446                 double dist = 0;
1447                 
1448                 for(int i=0;i<length;i++){
1449                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1450                         flowAValue++;
1451                         flowBValue++;
1452                 }
1453                 
1454                 return dist / (double)length;
1455         }
1456         catch(exception& e) {
1457                 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1458                 exit(1);        
1459         }               
1460 }
1461
1462 /**************************************************************************************************/
1463
1464 double ShhherCommand::getNewWeights(){
1465         try{
1466                 
1467                 double maxChange = 0;
1468                 
1469                 for(int i=0;i<numOTUs;i++){
1470                         
1471                         double difference = weight[i];
1472                         weight[i] = 0;
1473                         
1474                         for(int j=0;j<nSeqsPerOTU[i];j++){
1475                                 int index = cumNumSeqs[i] + j;
1476                                 double tauValue = singleTau[seqNumber[index]];
1477                                 weight[i] += tauValue;
1478                         }
1479                         
1480                         difference = fabs(weight[i] - difference);
1481                         if(difference > maxChange){     maxChange = difference; }
1482                 }
1483                 return maxChange;
1484         }
1485         catch(exception& e) {
1486                 m->errorOut(e, "ShhherCommand", "getNewWeights");
1487                 exit(1);        
1488         }               
1489 }
1490
1491 /**************************************************************************************************/
1492
1493 double ShhherCommand::getLikelihood(){
1494         
1495         try{
1496                 
1497                 vector<long double> P(numSeqs, 0);
1498                 int effNumOTUs = 0;
1499                 
1500                 for(int i=0;i<numOTUs;i++){
1501                         if(weight[i] > MIN_WEIGHT){
1502                                 effNumOTUs++;
1503                         }
1504                 }
1505                 
1506                 string hold;
1507                 for(int i=0;i<numOTUs;i++){
1508                         for(int j=0;j<nSeqsPerOTU[i];j++){
1509                                 int index = cumNumSeqs[i] + j;
1510                                 int nI = seqIndex[index];
1511                                 double singleDist = dist[seqNumber[index]];
1512                                 
1513                                 P[nI] += weight[i] * exp(-singleDist * sigma);
1514                         }
1515                 }
1516                 double nLL = 0.00;
1517                 for(int i=0;i<numSeqs;i++){
1518                         if(P[i] == 0){  P[i] = DBL_EPSILON;     }
1519
1520                         nLL += -log(P[i]);
1521                 }
1522                 
1523                 nLL = nLL -(double)numSeqs * log(sigma);
1524
1525                 return nLL; 
1526         }
1527         catch(exception& e) {
1528                 m->errorOut(e, "ShhherCommand", "getNewWeights");
1529                 exit(1);        
1530         }               
1531 }
1532
1533 /**************************************************************************************************/
1534
1535 void ShhherCommand::checkCentroids(){
1536         try{
1537                 vector<int> unique(numOTUs, 1);
1538                 
1539                 for(int i=0;i<numOTUs;i++){
1540                         if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1541                                 unique[i] = -1;
1542                         }
1543                 }
1544                 
1545                 for(int i=0;i<numOTUs;i++){
1546                         if(unique[i] == 1){
1547                                 for(int j=i+1;j<numOTUs;j++){
1548                                         if(unique[j] == 1){
1549                                                 
1550                                                 if(centroids[j] == centroids[i]){
1551                                                         unique[j] = 0;
1552                                                         centroids[j] = -1;
1553                                                         
1554                                                         weight[i] += weight[j];
1555                                                         weight[j] = 0.0;
1556                                                 }
1557                                         }
1558                                 }
1559                         }
1560                 }
1561         }
1562         catch(exception& e) {
1563                 m->errorOut(e, "ShhherCommand", "checkCentroids");
1564                 exit(1);        
1565         }               
1566 }
1567
1568 /**************************************************************************************************/
1569
1570 void ShhherCommand::calcNewDistances(){                          
1571         try{
1572                 
1573 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1574                 
1575                 if(processors == 1)     {
1576                         calcNewDistancesParent(0, numSeqs);             
1577                 }
1578                 else{ //you have multiple processors
1579                         if (numSeqs < processors){      processors = 1; }
1580                         
1581                         vector<vector<int> > child_otuIndex(processors);
1582                         vector<vector<int> > child_seqIndex(processors);
1583                         vector<vector<double> > child_singleTau(processors);                    
1584                         vector<int> totals(processors);
1585                         
1586                         int process = 1;
1587                         vector<int> processIDs;
1588
1589                         //loop through and create all the processes you want
1590                         while (process != processors) {
1591                                 int pid = vfork();
1592                                 
1593                                 if (pid > 0) {
1594                                         processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1595                                         process++;
1596                                 }else if (pid == 0){
1597                                         calcNewDistancesChild(nSeqsBreaks[process], nSeqsBreaks[process+1], child_otuIndex[process], child_seqIndex[process], child_singleTau[process]);
1598                                         totals[process] = child_otuIndex[process].size();
1599
1600                                         exit(0);
1601                                 }else { 
1602                                         m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
1603                                         perror(" : ");
1604                                         for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
1605                                         exit(0);
1606                                 }
1607                         }
1608                         
1609                         //parent does its part
1610                         calcNewDistancesParent(nSeqsBreaks[0], nSeqsBreaks[1]);
1611                         int total = seqIndex.size();
1612
1613                         //force parent to wait until all the processes are done
1614                         for (int i=0;i<processIDs.size();i++) { 
1615                                 int temp = processIDs[i];
1616                                 wait(&temp);
1617                         }
1618
1619                         for(int i=1;i<processors;i++){
1620                                 int oldTotal = total;
1621                                 total += totals[i];
1622
1623                                 singleTau.resize(total, 0);
1624                                 seqIndex.resize(total, 0);
1625                                 seqNumber.resize(total, 0);
1626                                 
1627                                 int childIndex = 0;
1628                                 
1629                                 for(int j=oldTotal;j<total;j++){
1630                                         int otuI = child_otuIndex[i][childIndex];
1631                                         int seqI = child_seqIndex[i][childIndex];
1632
1633                                         singleTau[j] = child_singleTau[i][childIndex];
1634                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
1635                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
1636                                         nSeqsPerOTU[otuI]++;
1637
1638                                         childIndex++;
1639                                 }
1640                         }
1641                 }
1642 #else
1643                 calcNewDistancesParent(0, numSeqs);             
1644 #endif          
1645         }
1646         catch(exception& e) {
1647                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
1648                 exit(1);        
1649         }               
1650 }
1651
1652 /**************************************************************************************************/
1653 #ifdef USE_MPI
1654 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1655         
1656         try{
1657                 vector<double> newTau(numOTUs,0);
1658                 vector<double> norms(numSeqs, 0);
1659                 otuIndex.clear();
1660                 seqIndex.clear();
1661                 singleTau.clear();
1662                 
1663                 
1664                 
1665                 for(int i=startSeq;i<stopSeq;i++){
1666                         double offset = 1e8;
1667                         int indexOffset = i * numOTUs;
1668                         
1669                         for(int j=0;j<numOTUs;j++){
1670                                 
1671                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1672                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1673                                 }
1674                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1675                                         offset = dist[indexOffset + j];
1676                                 }
1677                         }
1678                         
1679                         for(int j=0;j<numOTUs;j++){
1680                                 if(weight[j] > MIN_WEIGHT){
1681                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1682                                         norms[i] += newTau[j];
1683                                 }
1684                                 else{
1685                                         newTau[j] = 0.0;
1686                                 }
1687                         }
1688                         
1689                         for(int j=0;j<numOTUs;j++){
1690
1691                                 newTau[j] /= norms[i];
1692                                 
1693                                 if(newTau[j] > MIN_TAU){
1694                                         otuIndex.push_back(j);
1695                                         seqIndex.push_back(i);
1696                                         singleTau.push_back(newTau[j]);
1697                                 }
1698                         }
1699                         
1700                 }
1701         }
1702         catch(exception& e) {
1703                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1704                 exit(1);        
1705         }               
1706 }
1707 #endif
1708 /**************************************************************************************************/
1709
1710 void ShhherCommand::calcNewDistancesChild(int startSeq, int stopSeq, vector<int>& child_otuIndex, vector<int>& child_seqIndex, vector<double>& child_singleTau){
1711         
1712         try{
1713                 vector<double> newTau(numOTUs,0);
1714                 vector<double> norms(numSeqs, 0);
1715                 child_otuIndex.resize(0);
1716                 child_seqIndex.resize(0);
1717                 child_singleTau.resize(0);
1718                 
1719                 for(int i=startSeq;i<stopSeq;i++){
1720                         double offset = 1e8;
1721                         int indexOffset = i * numOTUs;
1722                         
1723                         
1724                         for(int j=0;j<numOTUs;j++){
1725                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1726                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1727                                 }
1728                                 
1729                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1730                                         offset = dist[indexOffset + j];
1731                                 }
1732                         }
1733                         
1734                         for(int j=0;j<numOTUs;j++){
1735                                 if(weight[j] > MIN_WEIGHT){
1736                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1737                                         norms[i] += newTau[j];
1738                                 }
1739                                 else{
1740                                         newTau[j] = 0.0;
1741                                 }
1742                         }
1743                         
1744                         for(int j=0;j<numOTUs;j++){
1745                                 newTau[j] /= norms[i];
1746                                 
1747                                 if(newTau[j] > MIN_TAU){
1748                                         child_otuIndex.push_back(j);
1749                                         child_seqIndex.push_back(i);
1750                                         child_singleTau.push_back(newTau[j]);
1751                                 }
1752                         }
1753                 }
1754         }
1755         catch(exception& e) {
1756                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChild");
1757                 exit(1);        
1758         }               
1759 }
1760
1761 /**************************************************************************************************/
1762
1763 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1764         
1765         try{
1766                 
1767                 int total = 0;
1768                 vector<double> newTau(numOTUs,0);
1769                 vector<double> norms(numSeqs, 0);
1770                 nSeqsPerOTU.assign(numOTUs, 0);
1771                 
1772                 for(int i=startSeq;i<stopSeq;i++){
1773                         int indexOffset = i * numOTUs;
1774                         
1775                         double offset = 1e8;
1776                         
1777                         for(int j=0;j<numOTUs;j++){
1778                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1779                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1780                                 }
1781                                 
1782                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1783                                         offset = dist[indexOffset + j];
1784                                 }
1785                         }
1786                         
1787                         for(int j=0;j<numOTUs;j++){
1788                                 if(weight[j] > MIN_WEIGHT){
1789                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1790                                         norms[i] += newTau[j];
1791                                 }
1792                                 else{
1793                                         newTau[j] = 0.0;
1794                                 }
1795                         }
1796                         
1797                         for(int j=0;j<numOTUs;j++){
1798                                 newTau[j] /= norms[i];
1799                         }
1800                         
1801                         for(int j=0;j<numOTUs;j++){
1802                                 if(newTau[j] > MIN_TAU){
1803                                         
1804                                         int oldTotal = total;
1805                                         
1806                                         total++;
1807                                         
1808                                         singleTau.resize(total, 0);
1809                                         seqNumber.resize(total, 0);
1810                                         seqIndex.resize(total, 0);
1811                                         
1812                                         singleTau[oldTotal] = newTau[j];
1813                                         
1814                                         aaP[j][nSeqsPerOTU[j]] = oldTotal;
1815                                         aaI[j][nSeqsPerOTU[j]] = i;
1816                                         nSeqsPerOTU[j]++;
1817                                 }
1818                         }
1819                 }
1820         }
1821         catch(exception& e) {
1822                 m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1823                 exit(1);        
1824         }               
1825 }
1826
1827 /**************************************************************************************************/
1828
1829 void ShhherCommand::setOTUs(){
1830         
1831         try {
1832                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1833                 
1834                 for(int i=0;i<numOTUs;i++){
1835                         for(int j=0;j<nSeqsPerOTU[i];j++){
1836                                 int index = cumNumSeqs[i] + j;
1837                                 double tauValue = singleTau[seqNumber[index]];
1838                                 int sIndex = seqIndex[index];
1839                                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
1840                         }
1841                 }
1842                 
1843                 for(int i=0;i<numSeqs;i++){
1844                         double maxTau = -1.0000;
1845                         int maxOTU = -1;
1846                         for(int j=0;j<numOTUs;j++){
1847                                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1848                                         maxTau = bigTauMatrix[i * numOTUs + j];
1849                                         maxOTU = j;
1850                                 }
1851                         }
1852                         
1853                         otuData[i] = maxOTU;
1854                 }
1855                 
1856                 nSeqsPerOTU.assign(numOTUs, 0);         
1857                 
1858                 for(int i=0;i<numSeqs;i++){
1859                         int index = otuData[i];
1860                         
1861                         singleTau[i] = 1.0000;
1862                         dist[i] = 0.0000;
1863                         
1864                         aaP[index][nSeqsPerOTU[index]] = i;
1865                         aaI[index][nSeqsPerOTU[index]] = i;
1866                         
1867                         nSeqsPerOTU[index]++;
1868                 }
1869                 fill(); 
1870         }
1871         catch(exception& e) {
1872                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
1873                 exit(1);        
1874         }               
1875 }
1876
1877 /**************************************************************************************************/
1878
1879 void ShhherCommand::writeQualities(vector<int> otuCounts){
1880         
1881         try {
1882                 string qualityFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.qual";
1883
1884                 ofstream qualityFile;
1885                 m->openOutputFile(qualityFileName, qualityFile);
1886
1887                 qualityFile.setf(ios::fixed, ios::floatfield);
1888                 qualityFile.setf(ios::showpoint);
1889                 qualityFile << setprecision(6);
1890                 
1891                 vector<vector<int> > qualities(numOTUs);
1892                 vector<double> pr(HOMOPS, 0);
1893                 
1894                 
1895                 for(int i=0;i<numOTUs;i++){
1896                         int index = 0;
1897                         int base = 0;
1898                         
1899                         if(nSeqsPerOTU[i] > 0){
1900                                 qualities[i].assign(1024, -1);
1901                                 
1902                                 while(index < numFlowCells){
1903                                         double maxPrValue = 1e8;
1904                                         short maxPrIndex = -1;
1905                                         double count = 0.0000;
1906                                         
1907                                         pr.assign(HOMOPS, 0);
1908                                         
1909                                         for(int j=0;j<nSeqsPerOTU[i];j++){
1910                                                 int lIndex = cumNumSeqs[i] + j;
1911                                                 double tauValue = singleTau[seqNumber[lIndex]];
1912                                                 int sequenceIndex = aaI[i][j];
1913                                                 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1914                                                 
1915                                                 count += tauValue;
1916                                                 
1917                                                 for(int s=0;s<HOMOPS;s++){
1918                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1919                                                 }
1920                                         }
1921                                         
1922                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1923                                         maxPrValue = pr[maxPrIndex];
1924                                         
1925                                         if(count > MIN_COUNT){
1926                                                 double U = 0.0000;
1927                                                 double norm = 0.0000;
1928                                                 
1929                                                 for(int s=0;s<HOMOPS;s++){
1930                                                         norm += exp(-(pr[s] - maxPrValue));
1931                                                 }
1932                                                 
1933                                                 for(int s=1;s<=maxPrIndex;s++){
1934                                                         int value = 0;
1935                                                         double temp = 0.0000;
1936                                                         
1937                                                         U += exp(-(pr[s-1]-maxPrValue))/norm;
1938                                                         
1939                                                         if(U>0.00){
1940                                                                 temp = log10(U);
1941                                                         }
1942                                                         else{
1943                                                                 temp = -10.1;
1944                                                         }
1945                                                         temp = floor(-10 * temp);
1946                                                         value = (int)floor(temp);
1947                                                         if(value > 100){        value = 100;    }
1948                                                         
1949                                                         qualities[i][base] = (int)value;
1950                                                         base++;
1951                                                 }
1952                                         }
1953                                         
1954                                         index++;
1955                                 }
1956                         }
1957                         
1958                         
1959                         if(otuCounts[i] > 0){
1960                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1961                                 
1962                                 int j=4;        //need to get past the first four bases
1963                                 while(qualities[i][j] != -1){
1964                                         qualityFile << qualities[i][j] << ' ';
1965                                         j++;
1966                                 }
1967                                 qualityFile << endl;
1968                         }
1969                 }
1970                 qualityFile.close();
1971                 
1972         }
1973         catch(exception& e) {
1974                 m->errorOut(e, "ShhherCommand", "writeQualities");
1975                 exit(1);        
1976         }               
1977 }
1978
1979 /**************************************************************************************************/
1980
1981 void ShhherCommand::writeSequences(vector<int> otuCounts){
1982         try {
1983                 
1984                 string fastaFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.fasta";
1985                 ofstream fastaFile;
1986                 m->openOutputFile(fastaFileName, fastaFile);
1987                 
1988                 vector<string> names(numOTUs, "");
1989                 
1990                 for(int i=0;i<numOTUs;i++){
1991                         int index = centroids[i];
1992                         
1993                         if(otuCounts[i] > 0){
1994                                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1995                                 
1996                                 string newSeq = "";
1997                                 
1998                                 for(int j=0;j<numFlowCells;j++){
1999                                         
2000                                         char base = flowOrder[j % 4];
2001                                         for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
2002                                                 newSeq += base;
2003                                         }
2004                                 }
2005                                 
2006                                 fastaFile << newSeq.substr(4) << endl;
2007                         }
2008                 }
2009                 fastaFile.close();
2010                 
2011                 if(compositeFASTAFileName != ""){
2012                         m->appendFiles(fastaFileName, compositeFASTAFileName);
2013                 }
2014         }
2015         catch(exception& e) {
2016                 m->errorOut(e, "ShhherCommand", "writeSequences");
2017                 exit(1);        
2018         }               
2019 }
2020
2021 /**************************************************************************************************/
2022
2023 void ShhherCommand::writeNames(vector<int> otuCounts){
2024         try {
2025                 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.final.names";
2026                 ofstream nameFile;
2027                 m->openOutputFile(nameFileName, nameFile);
2028                 
2029                 for(int i=0;i<numOTUs;i++){
2030                         if(otuCounts[i] > 0){
2031                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
2032                                 
2033                                 for(int j=1;j<nSeqsPerOTU[i];j++){
2034                                         nameFile << ',' << seqNameVector[aaI[i][j]];
2035                                 }
2036                                 
2037                                 nameFile << endl;
2038                         }
2039                 }
2040                 nameFile.close();
2041         }
2042         catch(exception& e) {
2043                 m->errorOut(e, "ShhherCommand", "writeNames");
2044                 exit(1);        
2045         }               
2046 }
2047
2048 /**************************************************************************************************/
2049
2050 void ShhherCommand::writeGroups(){
2051         try {
2052                 string fileRoot = flowFileName.substr(0,flowFileName.find_last_of('.'));
2053                 string groupFileName = fileRoot + ".pn.groups";
2054                 ofstream groupFile;
2055                 m->openOutputFile(groupFileName, groupFile);
2056                 
2057                 for(int i=0;i<numSeqs;i++){
2058                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
2059                 }
2060                 groupFile.close();
2061         }
2062         catch(exception& e) {
2063                 m->errorOut(e, "ShhherCommand", "writeGroups");
2064                 exit(1);        
2065         }               
2066 }
2067
2068 /**************************************************************************************************/
2069
2070 void ShhherCommand::writeClusters(vector<int> otuCounts){
2071         try {
2072                 string otuCountsFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.counts";
2073                 ofstream otuCountsFile;
2074                 m->openOutputFile(otuCountsFileName, otuCountsFile);
2075                 
2076                 string bases = flowOrder;
2077                 
2078                 for(int i=0;i<numOTUs;i++){
2079                         //output the translated version of the centroid sequence for the otu
2080                         if(otuCounts[i] > 0){
2081                                 int index = centroids[i];
2082                                 
2083                                 otuCountsFile << "ideal\t";
2084                                 for(int j=8;j<numFlowCells;j++){
2085                                         char base = bases[j % 4];
2086                                         for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
2087                                                 otuCountsFile << base;
2088                                         }
2089                                 }
2090                                 otuCountsFile << endl;
2091                                 
2092                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2093                                         int sequence = aaI[i][j];
2094                                         otuCountsFile << seqNameVector[sequence] << '\t';
2095                                         
2096                                         string newSeq = "";
2097                                         
2098                                         for(int k=0;k<lengths[sequence];k++){
2099                                                 char base = bases[k % 4];
2100                                                 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
2101                                                         
2102                                                 for(int s=0;s<freq;s++){
2103                                                         newSeq += base;
2104                                                         //otuCountsFile << base;
2105                                                 }
2106                                         }
2107                                         otuCountsFile << newSeq.substr(4) << endl;
2108                                 }
2109                                 otuCountsFile << endl;
2110                         }
2111                 }
2112                 otuCountsFile.close();
2113         }
2114         catch(exception& e) {
2115                 m->errorOut(e, "ShhherCommand", "writeClusters");
2116                 exit(1);        
2117         }               
2118 }
2119
2120 //**********************************************************************************************************************