]> git.donarmstrong.com Git - mothur.git/blob - shhhercommand.cpp
mods to shhh.seqs and trim.flows to fix eye candy output
[mothur.git] / shhhercommand.cpp
1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "shhhercommand.h"
11
12 #include "readcolumn.h"
13 #include "readmatrix.hpp"
14 #include "rabundvector.hpp"
15 #include "sabundvector.hpp"
16 #include "listvector.hpp"
17 #include "cluster.hpp"
18 #include "sparsematrix.hpp"
19 #include <cfloat>
20
21 //**********************************************************************************************************************
22
23 #define NUMBINS 1000
24 #define HOMOPS 10
25 #define MIN_COUNT 0.1
26 #define MIN_WEIGHT 0.1
27 #define MIN_TAU 0.0001
28 #define MIN_ITER 10
29 //**********************************************************************************************************************
30 vector<string> ShhherCommand::setParameters(){  
31         try {
32                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pflow);
33                 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pfile);
34                 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plookup);
35                 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff);
36                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
37                 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter);
38                 CommandParameter psigma("sigma", "Number", "", "60", "", "", "",false,false); parameters.push_back(psigma);
39                 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "",false,false); parameters.push_back(pmindelta);
40                 CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
41                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
42                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
43                 
44                 vector<string> myArray;
45                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
46                 return myArray;
47         }
48         catch(exception& e) {
49                 m->errorOut(e, "ShhherCommand", "setParameters");
50                 exit(1);
51         }
52 }
53 //**********************************************************************************************************************
54 string ShhherCommand::getHelpString(){  
55         try {
56                 string helpString = "";
57                 helpString += "The shhh.seqs command reads a file containing flowgrams and creates a file of corrected sequences.\n";
58                 return helpString;
59         }
60         catch(exception& e) {
61                 m->errorOut(e, "ShhherCommand", "getHelpString");
62                 exit(1);
63         }
64 }
65 //**********************************************************************************************************************
66
67 ShhherCommand::ShhherCommand(){ 
68         try {
69                 abort = true; calledHelp = true;
70                 setParameters();
71                 
72                 //initialize outputTypes
73 //              vector<string> tempOutNames;
74 //              outputTypes["pn.dist"] = tempOutNames;
75
76         }
77         catch(exception& e) {
78                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
79                 exit(1);
80         }
81 }
82
83 //**********************************************************************************************************************
84
85 ShhherCommand::ShhherCommand(string option) {
86         try {
87
88 #ifdef USE_MPI
89                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
90                 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
91
92                 if(pid == 0){
93 #endif
94                 
95                 
96                 abort = false; calledHelp = false;   
97                 
98                 
99                 //allow user to run help
100                 if(option == "help") { help(); abort = true; calledHelp = true; }
101                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
102                 
103                 else {
104                         vector<string> myArray = setParameters();
105                         
106                         OptionParser parser(option);
107                         map<string,string> parameters = parser.getParameters();
108                         
109                         ValidParameters validParameter;
110                         map<string,string>::iterator it;
111                         
112                         //check to make sure all parameters are valid for command
113                         for (it = parameters.begin(); it != parameters.end(); it++) { 
114                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
115                         }
116                         
117                         //initialize outputTypes
118                         vector<string> tempOutNames;
119 //                      outputTypes["pn.dist"] = tempOutNames;
120                         //                      outputTypes["fasta"] = tempOutNames;
121                         
122                         //if the user changes the input directory command factory will send this info to us in the output parameter 
123                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
124                         if (inputDir == "not found"){   inputDir = "";          }
125                         else {
126                                 string path;
127                                 it = parameters.find("flow");
128                                 //user has given a template file
129                                 if(it != parameters.end()){ 
130                                         path = m->hasPath(it->second);
131                                         //if the user has not given a path then, add inputdir. else leave path alone.
132                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
133                                 }
134                                 
135                                 it = parameters.find("lookup");
136                                 //user has given a template file
137                                 if(it != parameters.end()){ 
138                                         path = m->hasPath(it->second);
139                                         //if the user has not given a path then, add inputdir. else leave path alone.
140                                         if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
141                                 }
142
143                                 it = parameters.find("file");
144                                 //user has given a template file
145                                 if(it != parameters.end()){ 
146                                         path = m->hasPath(it->second);
147                                         //if the user has not given a path then, add inputdir. else leave path alone.
148                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
149                                 }
150                         }
151                         
152                         
153                         //check for required parameters
154                         flowFileName = validParameter.validFile(parameters, "flow", true);
155                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
156                         if (flowFileName == "not found" && flowFilesFileName == "not found") {
157                                 m->mothurOut("values for either flow or file must be provided for the shhh.seqs command.");
158                                 m->mothurOutEndLine();
159                                 abort = true; 
160                         }
161                         else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
162                         
163                         if(flowFileName != "not found"){
164                                 compositeFASTAFileName = "";    
165                                 compositeNamesFileName = "";    
166                         }
167                         else{
168                                 ofstream temp;
169
170                                 compositeFASTAFileName = flowFilesFileName.substr(0, flowFilesFileName.length()-10) + "shhh.fasta";
171                                 m->openOutputFile(compositeFASTAFileName, temp);
172                                 temp.close();
173                                 
174                                 compositeNamesFileName = flowFilesFileName.substr(0, flowFilesFileName.length()-10) + "shhh.names";
175                                 m->openOutputFile(compositeNamesFileName, temp);
176                                 temp.close();
177                         }
178                         
179                         //if the user changes the output directory command factory will send this info to us in the output parameter 
180                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
181                                 outputDir = ""; 
182                                 outputDir += m->hasPath(flowFileName); //if user entered a file with a path then preserve it    
183                         }
184                         
185                         
186                         //check for optional parameter and set defaults
187                         // ...at some point should added some additional type checking...
188                         string temp;
189                         temp = validParameter.validFile(parameters, "lookup", true);
190                         if (temp == "not found")        {       lookupFileName = "LookUp_Titanium.pat"; }
191                         else if(temp == "not open")     {       abort = true;                   } 
192                         else                                            {       lookupFileName = temp;  }
193                         
194                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
195                         m->setProcessors(temp);
196                         convert(temp, processors);
197
198                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
199                         convert(temp, cutoff); 
200                         
201                         temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
202                         convert(temp, minDelta); 
203
204                         temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
205                         convert(temp, maxIters); 
206
207                         temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
208                         convert(temp, sigma); 
209                         
210                         flowOrder = validParameter.validFile(parameters, "order", false);
211                         if (flowOrder == "not found"){ flowOrder = "TACG";              }
212                         else if(flowOrder.length() != 4){
213                                 m->mothurOut("The value of the order option must be four bases long\n");
214                         }
215                         
216                 }
217                         
218 #ifdef USE_MPI
219                 }                               
220 #endif
221                                 
222         }
223         catch(exception& e) {
224                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
225                 exit(1);
226         }
227 }
228 //**********************************************************************************************************************
229 #ifdef USE_MPI
230 int ShhherCommand::execute(){
231         try {
232                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
233                 
234                 int tag = 1976;
235                 MPI_Status status; 
236
237                 if(pid == 0){
238
239                         for(int i=1;i<ncpus;i++){
240                                 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
241                         }
242                         if(abort == 1){ return 0;       }
243
244                         processors = ncpus;
245                         
246                         m->mothurOut("\nGetting preliminary data...\n");
247                         getSingleLookUp();
248                         getJointLookUp();
249                         
250                         vector<string> flowFileVector;
251                         if(flowFilesFileName != "not found"){
252                                 string fName;
253
254                                 ifstream flowFilesFile;
255                                 m->openInputFile(flowFilesFileName, flowFilesFile);
256                                 while(flowFilesFile){
257                                         flowFilesFile >> fName;
258                                         flowFileVector.push_back(fName);
259                                         m->gobble(flowFilesFile);
260                                 }
261                         }
262                         else{
263                                 flowFileVector.push_back(flowFileName);
264                         }
265                         int numFiles = flowFileVector.size();
266
267                         for(int i=1;i<ncpus;i++){
268                                 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
269                         }
270                         
271                         for(int i=0;i<numFiles;i++){
272                                 double begClock = clock();
273                                 unsigned long int begTime = time(NULL);
274
275                                 flowFileName = flowFileVector[i];
276                                 
277                                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
278                                 m->mothurOut("Reading flowgrams...\n");
279                                 getFlowData();
280
281                                 m->mothurOut("Identifying unique flowgrams...\n");
282                                 getUniques();
283
284                                 m->mothurOut("Calculating distances between flowgrams...\n");
285                                 char fileName[1024];
286                                 strcpy(fileName, flowFileName.c_str());
287
288                                 for(int i=1;i<ncpus;i++){
289                                         MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
290
291                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
292                                         MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
293                                         MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
294                                         MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
295                                         MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
296                                         MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
297                                         MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
298                                         MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
299                                         MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
300                                         MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
301                                 }                               
302                                                         
303                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
304                                 
305                                 int done;
306                                 for(int i=1;i<ncpus;i++){
307                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
308                                         
309                                         m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
310                                         remove((distFileName + ".temp." + toString(i)).c_str());
311                                 }
312
313                                 string namesFileName = createNamesFile();
314                                 
315                                 m->mothurOut("\nClustering flowgrams...\n");
316                                 string listFileName = cluster(distFileName, namesFileName);
317
318                                 getOTUData(listFileName);
319
320                                 remove(distFileName.c_str());
321                                 remove(namesFileName.c_str());
322                                 remove(listFileName.c_str());
323                                 
324                                 initPyroCluster();
325
326                                 for(int i=1;i<ncpus;i++){
327                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
328                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
329                                         MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
330                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
331                                 }
332                                 
333                                 
334                                 double maxDelta = 0;
335                                 int iter = 0;
336                                 
337                                 int numOTUsOnCPU = numOTUs / ncpus;
338                                 int numSeqsOnCPU = numSeqs / ncpus;
339                                 m->mothurOut("\nDenoising flowgrams...\n");
340                                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
341                                 
342                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
343
344                                         double cycClock = clock();
345                                         unsigned long int cycTime = time(NULL);
346                                         fill();
347
348                                         int total = singleTau.size();
349                                         for(int i=1;i<ncpus;i++){
350                                                 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
351                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
352                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
353                                                 
354                                                 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
355                                                 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
356                                                 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
357                                                 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
358                                                 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
359                                         }
360                                         
361                                         calcCentroidsDriver(0, numOTUsOnCPU);
362                                         
363                                         for(int i=1;i<ncpus;i++){
364                                                 int otuStart = i * numOTUs / ncpus;
365                                                 int otuStop = (i + 1) * numOTUs / ncpus;
366                                                 
367                                                 vector<int> tempCentroids(numOTUs, 0);
368                                                 vector<short> tempChange(numOTUs, 0);
369                                                 
370                                                 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
371                                                 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
372                                                 
373                                                 for(int j=otuStart;j<otuStop;j++){
374                                                         centroids[j] = tempCentroids[j];
375                                                         change[j] = tempChange[j];
376                                                 }
377                                         }
378                                                                         
379                                         maxDelta = getNewWeights();
380                                         double nLL = getLikelihood();
381                                         checkCentroids();
382                                         
383                                         for(int i=1;i<ncpus;i++){
384                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
385                                                 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
386                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
387                                         }
388                                         
389                                         calcNewDistancesParent(0, numSeqsOnCPU);
390
391                                         total = singleTau.size();
392
393                                         for(int i=1;i<ncpus;i++){
394                                                 int childTotal;
395                                                 int seqStart = i * numSeqs / ncpus;
396                                                 int seqStop = (i + 1) * numSeqs / ncpus;
397                                                 
398                                                 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
399
400                                                 vector<int> childSeqIndex(childTotal, 0);
401                                                 vector<double> childSingleTau(childTotal, 0);
402                                                 vector<double> childDist(numSeqs * numOTUs, 0);
403                                                 vector<int> otuIndex(childTotal, 0);
404                                                 
405                                                 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
406                                                 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
407                                                 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
408                                                 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
409
410                                                 int oldTotal = total;
411                                                 total += childTotal;
412                                                 singleTau.resize(total, 0);
413                                                 seqIndex.resize(total, 0);
414                                                 seqNumber.resize(total, 0);
415                                                 
416                                                 int childIndex = 0;
417                                                 
418                                                 for(int j=oldTotal;j<total;j++){
419                                                         int otuI = otuIndex[childIndex];
420                                                         int seqI = childSeqIndex[childIndex];
421                                                         
422                                                         singleTau[j] = childSingleTau[childIndex];
423                                                         
424                                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
425                                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
426                                                         nSeqsPerOTU[otuI]++;
427                                                         childIndex++;
428                                                 }
429                                                 
430                                                 int index = seqStart * numOTUs;
431                                                 for(int j=seqStart;j<seqStop;j++){
432                                                         for(int k=0;k<numOTUs;k++){
433                                                                 dist[index] = childDist[index];
434                                                                 index++;
435                                                         }
436                                                 }                                       
437                                         }
438                                         
439                                         iter++;
440                                         
441                                         m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');                  
442
443                                         if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
444                                                 int live = 1;
445                                                 for(int i=1;i<ncpus;i++){
446                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
447                                                 }
448                                         }
449                                         else{
450                                                 int live = 0;
451                                                 for(int i=1;i<ncpus;i++){
452                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
453                                                 }
454                                         }
455                                         
456                                 }       
457                                 
458                                 m->mothurOut("\nFinalizing...\n");
459                                 fill();
460                                 setOTUs();
461                                 
462                                 vector<int> otuCounts(numOTUs, 0);
463                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
464                                 calcCentroidsDriver(0, numOTUs);
465                                 
466                                 writeQualities(otuCounts);
467                                 writeSequences(otuCounts);
468                                 writeNames(otuCounts);
469                                 writeClusters(otuCounts);
470                                 writeGroups();
471                                 
472                                                                  
473                                 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');                 
474                         }
475                 }
476                 else{
477                         int abort = 1;
478
479                         MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
480                         if(abort){      return 0;       }
481
482                         int numFiles;
483                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
484
485                         for(int i=0;i<numFiles;i++){
486                                 //Now into the pyrodist part
487                                 bool live = 1;
488
489                                 char fileName[1024];
490                                 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
491                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
492                                 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
493                                 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
494                                 
495                                 flowDataIntI.resize(numSeqs * numFlowCells);
496                                 flowDataPrI.resize(numSeqs * numFlowCells);
497                                 mapUniqueToSeq.resize(numSeqs);
498                                 mapSeqToUnique.resize(numSeqs);
499                                 lengths.resize(numSeqs);
500                                 jointLookUp.resize(NUMBINS * NUMBINS);
501
502                                 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
503                                 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
504                                 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
505                                 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
506                                 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
507                                 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
508                                 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
509
510                                 flowFileName = string(fileName);
511                                 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
512                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
513                                 
514                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
515
516                                 int done = 1;
517                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
518
519                                 //Now into the pyronoise part
520                                 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
521                                 
522                                 singleLookUp.resize(HOMOPS * NUMBINS);
523                                 uniqueFlowgrams.resize(numUniques * numFlowCells);
524                                 weight.resize(numOTUs);
525                                 centroids.resize(numOTUs);
526                                 change.resize(numOTUs);
527                                 dist.assign(numOTUs * numSeqs, 0);
528                                 nSeqsPerOTU.resize(numOTUs);
529                                 cumNumSeqs.resize(numOTUs);
530
531                                 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
532                                 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
533                                 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
534                                 
535                                 int startOTU = pid * numOTUs / ncpus;
536                                 int endOTU = (pid + 1) * numOTUs / ncpus;
537
538                                 int startSeq = pid * numSeqs / ncpus;
539                                 int endSeq = (pid + 1) * numSeqs /ncpus;
540                                 
541                                 int total;
542
543                                 while(live){
544                                         
545                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
546                                         singleTau.assign(total, 0.0000);
547                                         seqNumber.assign(total, 0);
548                                         seqIndex.assign(total, 0);
549
550                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
551                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
552                                         MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
553                                         MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
554                                         MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
555                                         MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
556                                         MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
557
558                                         calcCentroidsDriver(startOTU, endOTU);
559
560                                         MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
561                                         MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
562
563                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
564                                         MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
565                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
566
567                                         vector<int> otuIndex(total, 0);
568                                         calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
569                                         total = otuIndex.size();
570                                         
571                                         MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
572                                         MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
573                                         MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
574                                         MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
575                                         MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
576                                 
577                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
578                                 }
579                         }
580                 }               
581                 MPI_Barrier(MPI_COMM_WORLD);
582
583                 
584                 if(compositeFASTAFileName != ""){
585                         outputNames.push_back(compositeFASTAFileName);
586                         outputNames.push_back(compositeNamesFileName);
587                 }
588
589                 m->mothurOutEndLine();
590                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
591                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
592                 m->mothurOutEndLine();
593                 
594                 return 0;
595
596         }
597         catch(exception& e) {
598                 m->errorOut(e, "ShhherCommand", "execute");
599                 exit(1);
600         }
601 }
602
603 /**************************************************************************************************/
604
605 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
606         try{            
607                 ostringstream outStream;
608                 outStream.setf(ios::fixed, ios::floatfield);
609                 outStream.setf(ios::dec, ios::basefield);
610                 outStream.setf(ios::showpoint);
611                 outStream.precision(6);
612                 
613                 int begTime = time(NULL);
614                 double begClock = clock();
615                 
616                 for(int i=startSeq;i<stopSeq;i++){
617                         for(int j=0;j<i;j++){
618                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
619                                 
620                                 if(flowDistance < 1e-6){
621                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
622                                 }
623                                 else if(flowDistance <= cutoff){
624                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
625                                 }
626                         }
627                         if(i % 100 == 0){
628                                 m->mothurOut(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
629                         }
630                 }
631                 
632                 m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
633                 
634                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
635                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
636
637                 ofstream distFile(fDistFileName.c_str());
638                 distFile << outStream.str();            
639                 distFile.close();
640                 
641                 return fDistFileName;
642         }
643         catch(exception& e) {
644                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
645                 exit(1);
646         }
647 }
648
649 #else
650 //**********************************************************************************************************************
651
652 int ShhherCommand::execute(){
653         try {
654                 if (abort == true) { return 0; }
655                 
656                 getSingleLookUp();
657                 getJointLookUp();
658                                 
659                 vector<string> flowFileVector;
660                 if(flowFilesFileName != "not found"){
661                         string fName;
662                         
663                         ifstream flowFilesFile;
664                         m->openInputFile(flowFilesFileName, flowFilesFile);
665                         while(flowFilesFile){
666                                 flowFilesFile >> fName;
667                                 flowFileVector.push_back(fName);
668                                 m->gobble(flowFilesFile);
669                         }
670                 }
671                 else{
672                         flowFileVector.push_back(flowFileName);
673                 }
674                 int numFiles = flowFileVector.size();
675                 
676                 
677                 for(int i=0;i<numFiles;i++){
678                         flowFileName = flowFileVector[i];
679
680                         m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
681                         m->mothurOut("Reading flowgrams...\n");
682                         getFlowData();
683                         
684                         m->mothurOut("Identifying unique flowgrams...\n");
685                         getUniques();
686                         
687                         
688                         m->mothurOut("Calculating distances between flowgrams...\n");
689                         string distFileName = createDistFile(processors);
690                         string namesFileName = createNamesFile();
691                                 
692                         m->mothurOut("\nClustering flowgrams...\n");
693                         string listFileName = cluster(distFileName, namesFileName);
694                         getOTUData(listFileName);
695                         
696                         remove(distFileName.c_str());
697                         remove(namesFileName.c_str());
698                         remove(listFileName.c_str());
699                         
700                         initPyroCluster();
701                         
702                         double maxDelta = 0;
703                         int iter = 0;
704                         
705                         double begClock = clock();
706                         unsigned long int begTime = time(NULL);
707
708                         m->mothurOut("\nDenoising flowgrams...\n");
709                         m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
710                         
711                         while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
712                                 
713                                 double cycClock = clock();
714                                 unsigned long int cycTime = time(NULL);
715                                 fill();
716                                 
717                                 calcCentroids();
718                                 
719                                 maxDelta = getNewWeights();
720                                 double nLL = getLikelihood();
721                                 checkCentroids();
722                                 
723                                 calcNewDistances();
724
725                                 iter++;
726                                 
727                                 m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
728
729                         }       
730                         
731                         m->mothurOut("\nFinalizing...\n");
732                         fill();
733                         setOTUs();
734                         
735                         vector<int> otuCounts(numOTUs, 0);
736                         for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
737                         
738                         calcCentroidsDriver(0, numOTUs);
739                         writeQualities(otuCounts);
740                         writeSequences(otuCounts);
741                         writeNames(otuCounts);
742                         writeClusters(otuCounts);
743                         writeGroups();
744                         
745                         m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
746                 }
747                 
748                 if(compositeFASTAFileName != ""){
749                         outputNames.push_back(compositeFASTAFileName);
750                         outputNames.push_back(compositeNamesFileName);
751                 }
752
753                 m->mothurOutEndLine();
754                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
755                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
756                 m->mothurOutEndLine();
757                 
758                 return 0;
759         }
760         catch(exception& e) {
761                 m->errorOut(e, "ShhherCommand", "execute");
762                 exit(1);
763         }
764 }
765 #endif
766 /**************************************************************************************************/
767
768 void ShhherCommand::getFlowData(){
769         try{
770                 ifstream flowFile;
771                 m->openInputFile(flowFileName, flowFile);
772                 
773                 string seqName;
774                 seqNameVector.clear();
775                 lengths.clear();
776                 flowDataIntI.clear();
777                 nameMap.clear();
778                 
779                 
780                 int currentNumFlowCells;
781                 
782                 float intensity;
783                 
784                 flowFile >> numFlowCells;
785                 int index = 0;//pcluster
786                 while(!flowFile.eof()){
787                         flowFile >> seqName >> currentNumFlowCells;
788                         lengths.push_back(currentNumFlowCells);
789
790                         seqNameVector.push_back(seqName);
791                         nameMap[seqName] = index++;//pcluster
792
793                         for(int i=0;i<numFlowCells;i++){
794                                 flowFile >> intensity;
795                                 if(intensity > 9.99)    {       intensity = 9.99;       }
796                                 int intI = int(100 * intensity + 0.0001);
797                                 flowDataIntI.push_back(intI);
798                         }
799                         m->gobble(flowFile);
800                 }
801                 flowFile.close();
802                 
803                 numSeqs = seqNameVector.size();         
804                 
805                 for(int i=0;i<numSeqs;i++){
806                         int iNumFlowCells = i * numFlowCells;
807                         for(int j=lengths[i];j<numFlowCells;j++){
808                                 flowDataIntI[iNumFlowCells + j] = 0;
809                         }
810                 }
811                 
812         }
813         catch(exception& e) {
814                 m->errorOut(e, "ShhherCommand", "getFlowData");
815                 exit(1);
816         }
817 }
818
819 /**************************************************************************************************/
820
821 void ShhherCommand::getSingleLookUp(){
822         try{
823                 //      these are the -log probabilities that a signal corresponds to a particular homopolymer length
824                 singleLookUp.assign(HOMOPS * NUMBINS, 0);
825                 
826                 int index = 0;
827                 ifstream lookUpFile;
828                 m->openInputFile(lookupFileName, lookUpFile);
829                 
830                 for(int i=0;i<HOMOPS;i++){
831                         float logFracFreq;
832                         lookUpFile >> logFracFreq;
833                         
834                         for(int j=0;j<NUMBINS;j++)      {
835                                 lookUpFile >> singleLookUp[index];
836                                 index++;
837                         }
838                 }       
839                 lookUpFile.close();
840         }
841         catch(exception& e) {
842                 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
843                 exit(1);
844         }
845 }
846
847 /**************************************************************************************************/
848
849 void ShhherCommand::getJointLookUp(){
850         try{
851                 
852                 //      the most likely joint probability (-log) that two intenities have the same polymer length
853                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
854                 
855                 for(int i=0;i<NUMBINS;i++){
856                         for(int j=0;j<NUMBINS;j++){             
857                                 
858                                 double minSum = 100000000;
859                                 
860                                 for(int k=0;k<HOMOPS;k++){
861                                         double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
862                                         
863                                         if(sum < minSum)        {       minSum = sum;           }
864                                 }       
865                                 jointLookUp[i * NUMBINS + j] = minSum;
866                         }
867                 }
868         }
869         catch(exception& e) {
870                 m->errorOut(e, "ShhherCommand", "getJointLookUp");
871                 exit(1);
872         }
873 }
874
875 /**************************************************************************************************/
876
877 double ShhherCommand::getProbIntensity(int intIntensity){                          
878         try{
879
880                 double minNegLogProb = 100000000; 
881
882                 
883                 for(int i=0;i<HOMOPS;i++){//loop signal strength
884                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
885                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
886                 }
887                 
888                 return minNegLogProb;
889         }
890         catch(exception& e) {
891                 m->errorOut(e, "ShhherCommand", "getProbIntensity");
892                 exit(1);
893         }
894 }
895
896 /**************************************************************************************************/
897
898 void ShhherCommand::getUniques(){
899         try{
900                 
901                 
902                 numUniques = 0;
903                 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
904                 uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
905                 uniqueLengths.assign(numSeqs, 0);
906                 mapSeqToUnique.assign(numSeqs, -1);
907                 mapUniqueToSeq.assign(numSeqs, -1);
908                 
909                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
910                 
911                 for(int i=0;i<numSeqs;i++){
912                         int index = 0;
913                         
914                         vector<short> current(numFlowCells);
915                         for(int j=0;j<numFlowCells;j++){
916                                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
917                         }
918                                                 
919                         for(int j=0;j<numUniques;j++){
920                                 int offset = j * numFlowCells;
921                                 bool toEnd = 1;
922                                 
923                                 int shorterLength;
924                                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
925                                 else                                                            {       shorterLength = uniqueLengths[j];       }
926
927                                 for(int k=0;k<shorterLength;k++){
928                                         if(current[k] != uniqueFlowgrams[offset + k]){
929                                                 toEnd = 0;
930                                                 break;
931                                         }
932                                 }
933                                 
934                                 if(toEnd){
935                                         mapSeqToUnique[i] = j;
936                                         uniqueCount[j]++;
937                                         index = j;
938                                         if(lengths[i] > uniqueLengths[j])       {       uniqueLengths[j] = lengths[i];  }
939                                         break;
940                                 }
941                                 index++;
942                         }
943                         
944                         if(index == numUniques){
945                                 uniqueLengths[numUniques] = lengths[i];
946                                 uniqueCount[numUniques] = 1;
947                                 mapSeqToUnique[i] = numUniques;//anMap
948                                 mapUniqueToSeq[numUniques] = i;//anF
949                                 
950                                 for(int k=0;k<numFlowCells;k++){
951                                         uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
952                                         uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
953                                 }
954                                 
955                                 numUniques++;
956                         }
957                 }
958                 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
959                 uniqueLengths.resize(numUniques);       
960                 
961                 flowDataPrI.resize(numSeqs * numFlowCells, 0);
962                 for(int i=0;i<flowDataPrI.size();i++)   {       flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);             }
963         }
964         catch(exception& e) {
965                 m->errorOut(e, "ShhherCommand", "getUniques");
966                 exit(1);
967         }
968 }
969
970 /**************************************************************************************************/
971
972 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
973         try{
974                 int minLength = lengths[mapSeqToUnique[seqA]];
975                 if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
976                 
977                 int ANumFlowCells = seqA * numFlowCells;
978                 int BNumFlowCells = seqB * numFlowCells;
979                 
980                 float dist = 0;
981                 
982                 for(int i=0;i<minLength;i++){
983                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
984                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
985                         
986                         int flowBIntI = flowDataIntI[BNumFlowCells + i];
987                         float flowBPrI = flowDataPrI[BNumFlowCells + i];
988                         dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
989                 }
990                 
991                 dist /= (float) minLength;
992                 return dist;
993         }
994         catch(exception& e) {
995                 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
996                 exit(1);
997         }
998 }
999
1000 /**************************************************************************************************/
1001
1002 void ShhherCommand::flowDistParentFork(string distFileName, int startSeq, int stopSeq){
1003         try{            
1004
1005                 ostringstream outStream;
1006                 outStream.setf(ios::fixed, ios::floatfield);
1007                 outStream.setf(ios::dec, ios::basefield);
1008                 outStream.setf(ios::showpoint);
1009                 outStream.precision(6);
1010                 
1011                 int begTime = time(NULL);
1012                 double begClock = clock();
1013
1014                 for(int i=startSeq;i<stopSeq;i++){
1015                         for(int j=0;j<i;j++){
1016                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
1017
1018                                 if(flowDistance < 1e-6){
1019                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
1020                                 }
1021                                 else if(flowDistance <= cutoff){
1022                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
1023                                 }
1024                         }
1025                         if(i % 100 == 0){
1026                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
1027                                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
1028                                 m->mothurOutEndLine();
1029                         }
1030                 }
1031                 m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
1032                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
1033                 m->mothurOutEndLine();
1034                 
1035                 ofstream distFile(distFileName.c_str());
1036                 distFile << outStream.str();            
1037                 distFile.close();
1038         }
1039         catch(exception& e) {
1040                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
1041                 exit(1);
1042         }
1043 }
1044
1045 /**************************************************************************************************/
1046
1047 string ShhherCommand::createDistFile(int processors){
1048         try{
1049                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
1050                                 
1051                 unsigned long int begTime = time(NULL);
1052                 double begClock = clock();
1053
1054                 vector<int> start;
1055                 vector<int> end;
1056                 
1057 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1058                 if(processors == 1)     {       flowDistParentFork(fDistFileName, 0, numUniques);               }
1059                 else{ //you have multiple processors
1060                         
1061                         if (numSeqs < processors){      processors = 1; }
1062                         
1063                         vector<int> start(processors, 0);
1064                         vector<int> end(processors, 0);
1065                         
1066                         for (int i = 0; i < processors; i++) {
1067                                 start[i] = int(sqrt(float(i)/float(processors)) * numUniques);
1068                                 end[i] = int(sqrt(float(i+1)/float(processors)) * numUniques);
1069                         }
1070                         
1071                         int process = 1;
1072                         vector<int> processIDs;
1073                         
1074                         //loop through and create all the processes you want
1075                         while (process != processors) {
1076                                 int pid = fork();
1077                                 
1078                                 if (pid > 0) {
1079                                         processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1080                                         process++;
1081                                 }else if (pid == 0){
1082                                         flowDistParentFork(fDistFileName + toString(getpid()) + ".temp", start[process], end[process]);
1083                                         exit(0);
1084                                 }else { 
1085                                         m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
1086                                         perror(" : ");
1087                                         for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
1088                                         exit(0);
1089                                 }
1090                         }
1091                         
1092                         //parent does its part
1093                         flowDistParentFork(fDistFileName, start[0], end[0]);
1094                         
1095                         //force parent to wait until all the processes are done
1096                         for (int i=0;i<processIDs.size();i++) { 
1097                                 int temp = processIDs[i];
1098                                 wait(&temp);
1099                         }
1100                         
1101                         //append and remove temp files
1102                         for (int i=0;i<processIDs.size();i++) { 
1103                                 m->appendFiles((fDistFileName + toString(processIDs[i]) + ".temp"), fDistFileName);
1104                                 remove((fDistFileName + toString(processIDs[i]) + ".temp").c_str());
1105                         }
1106                         
1107                 }
1108                 
1109 #else
1110                 flowDistParentFork(fDistFileName, 0, numUniques);
1111 #endif
1112
1113                 m->mothurOutEndLine();
1114                 
1115                 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
1116                 
1117
1118                 return fDistFileName;
1119         }
1120         catch(exception& e) {
1121                 m->errorOut(e, "ShhherCommand", "createDistFile");
1122                 exit(1);
1123         }
1124         
1125 }
1126
1127 /**************************************************************************************************/
1128
1129 string ShhherCommand::createNamesFile(){
1130         try{
1131                 
1132                 vector<string> duplicateNames(numUniques, "");
1133                 for(int i=0;i<numSeqs;i++){
1134                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
1135                 }
1136                 
1137                 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
1138                 
1139                 ofstream nameFile;
1140                 m->openOutputFile(nameFileName, nameFile);
1141                 
1142                 for(int i=0;i<numUniques;i++){
1143 //                      nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
1144                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
1145                 }
1146                 
1147                 nameFile.close();
1148                 return  nameFileName;
1149         }
1150         catch(exception& e) {
1151                 m->errorOut(e, "ShhherCommand", "createNamesFile");
1152                 exit(1);
1153         }
1154 }
1155
1156 //**********************************************************************************************************************
1157
1158 string ShhherCommand::cluster(string distFileName, string namesFileName){
1159         try {
1160                 
1161                 ReadMatrix* read = new ReadColumnMatrix(distFileName);  
1162                 read->setCutoff(cutoff);
1163                 
1164                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1165                 clusterNameMap->readMap();
1166                 read->read(clusterNameMap);
1167                 
1168                 ListVector* list = read->getListVector();
1169                 SparseMatrix* matrix = read->getMatrix();
1170                 
1171                 delete read; 
1172                 delete clusterNameMap; 
1173                                 
1174                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1175                 
1176                 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
1177                 string tag = cluster->getTag();
1178                 
1179                 double clusterCutoff = cutoff;
1180                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1181                         cluster->update(clusterCutoff);
1182                 }
1183                 
1184                 list->setLabel(toString(cutoff));
1185                 
1186                 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
1187                 ofstream listFile;
1188                 m->openOutputFile(listFileName, listFile);
1189                 list->print(listFile);
1190                 listFile.close();
1191                 
1192                 delete matrix;  delete cluster; delete rabund; delete list;
1193         
1194                 return listFileName;
1195         }
1196         catch(exception& e) {
1197                 m->errorOut(e, "ShhherCommand", "cluster");
1198                 exit(1);        
1199         }               
1200 }
1201
1202 /**************************************************************************************************/
1203
1204 void ShhherCommand::getOTUData(string listFileName){
1205         try {
1206
1207                 ifstream listFile;
1208                 m->openInputFile(listFileName, listFile);
1209                 string label;
1210                 
1211                 listFile >> label >> numOTUs;
1212
1213                 otuData.assign(numSeqs, 0);
1214                 cumNumSeqs.assign(numOTUs, 0);
1215                 nSeqsPerOTU.assign(numOTUs, 0);
1216                 aaP.clear();aaP.resize(numOTUs);
1217                 
1218                 seqNumber.clear();
1219                 aaI.clear();
1220                 seqIndex.clear();
1221                 
1222                 string singleOTU = "";
1223                 
1224                 for(int i=0;i<numOTUs;i++){
1225
1226                         listFile >> singleOTU;
1227                         
1228                         istringstream otuString(singleOTU);
1229
1230                         while(otuString){
1231                                 
1232                                 string seqName = "";
1233                                 
1234                                 for(int j=0;j<singleOTU.length();j++){
1235                                         char letter = otuString.get();
1236                                         
1237                                         if(letter != ','){
1238                                                 seqName += letter;
1239                                         }
1240                                         else{
1241                                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
1242                                                 int index = nmIt->second;
1243                                                 
1244                                                 nameMap.erase(nmIt);
1245                                                 
1246                                                 otuData[index] = i;
1247                                                 nSeqsPerOTU[i]++;
1248                                                 aaP[i].push_back(index);
1249                                                 seqName = "";
1250                                         }
1251                                 }
1252                                 
1253                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
1254
1255                                 int index = nmIt->second;
1256                                 nameMap.erase(nmIt);
1257
1258                                 otuData[index] = i;
1259                                 nSeqsPerOTU[i]++;
1260                                 aaP[i].push_back(index);        
1261                                 
1262                                 otuString.get();
1263                         }
1264                         
1265                         sort(aaP[i].begin(), aaP[i].end());
1266                         for(int j=0;j<nSeqsPerOTU[i];j++){
1267                                 seqNumber.push_back(aaP[i][j]);
1268                         }
1269                         for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
1270                                 aaP[i].push_back(0);
1271                         }
1272                         
1273                         
1274                 }
1275                 
1276                 for(int i=1;i<numOTUs;i++){
1277                         cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
1278                 }
1279                 aaI = aaP;
1280                 seqIndex = seqNumber;
1281                 
1282                 listFile.close();       
1283                 
1284         }
1285         catch(exception& e) {
1286                 m->errorOut(e, "ShhherCommand", "getOTUData");
1287                 exit(1);        
1288         }               
1289 }
1290
1291 /**************************************************************************************************/
1292
1293 void ShhherCommand::initPyroCluster(){                          
1294         try{
1295                 dist.assign(numSeqs * numOTUs, 0);
1296                 change.assign(numOTUs, 1);
1297                 centroids.assign(numOTUs, -1);
1298                 weight.assign(numOTUs, 0);
1299                 singleTau.assign(numSeqs, 1.0);
1300                 
1301                 nSeqsBreaks.assign(processors+1, 0);
1302                 nOTUsBreaks.assign(processors+1, 0);
1303
1304                 nSeqsBreaks[0] = 0;
1305                 for(int i=0;i<processors;i++){
1306                         nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
1307                         nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
1308                 }
1309                 nSeqsBreaks[processors] = numSeqs;
1310                 nOTUsBreaks[processors] = numOTUs;
1311         }
1312         catch(exception& e) {
1313                 m->errorOut(e, "ShhherCommand", "initPyroCluster");
1314                 exit(1);        
1315         }
1316 }
1317
1318 /**************************************************************************************************/
1319
1320 void ShhherCommand::fill(){
1321         try {
1322                 int index = 0;
1323                 for(int i=0;i<numOTUs;i++){
1324                         cumNumSeqs[i] = index;
1325                         for(int j=0;j<nSeqsPerOTU[i];j++){
1326                                 seqNumber[index] = aaP[i][j];
1327                                 seqIndex[index] = aaI[i][j];
1328                                 
1329                                 index++;
1330                         }
1331                 }
1332         }
1333         catch(exception& e) {
1334                 m->errorOut(e, "ShhherCommand", "fill");
1335                 exit(1);        
1336         }               
1337 }
1338
1339 /**************************************************************************************************/
1340
1341 void ShhherCommand::calcCentroids(){                          
1342         try{
1343                 
1344 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1345
1346                 if(processors == 1)     {
1347                         calcCentroidsDriver(0, numOTUs);                
1348                 }
1349                 else{ //you have multiple processors
1350                         if (numOTUs < processors){      processors = 1; }
1351                         
1352                         int process = 1;
1353                         vector<int> processIDs;
1354                         
1355                         //loop through and create all the processes you want
1356                         while (process != processors) {
1357                                 int pid = vfork();
1358                                 
1359                                 if (pid > 0) {
1360                                         processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1361                                         process++;
1362                                 }else if (pid == 0){
1363                                         calcCentroidsDriver(nOTUsBreaks[process], nOTUsBreaks[process+1]);
1364                                         exit(0);
1365                                 }else { 
1366                                         m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
1367                                         perror(" : ");
1368                                         for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
1369                                         exit(0);
1370                                 }
1371                         }
1372                         
1373                         //parent does its part
1374                         calcCentroidsDriver(nOTUsBreaks[0], nOTUsBreaks[1]);
1375
1376                         //force parent to wait until all the processes are done
1377                         for (int i=0;i<processIDs.size();i++) { 
1378                                 int temp = processIDs[i];
1379                                 wait(&temp);
1380                         }
1381                 }
1382                 
1383 #else
1384                 calcCentroidsDriver(0, numOTUs);
1385 #endif          
1386         }
1387         catch(exception& e) {
1388                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1389                 exit(1);        
1390         }               
1391 }
1392
1393 /**************************************************************************************************/
1394
1395 void ShhherCommand::calcCentroidsDriver(int start, int finish){                          
1396         
1397         //this function gets the most likely homopolymer length at a flow position for a group of sequences
1398         //within an otu
1399         
1400         try{
1401                 
1402         
1403                 for(int i=start;i<finish;i++){
1404                         
1405                         double count = 0;
1406                         int position = 0;
1407                         int minFlowGram = 100000000;
1408                         double minFlowValue = 1e8;
1409                         change[i] = 0; //FALSE
1410                         
1411                         for(int j=0;j<nSeqsPerOTU[i];j++){
1412                                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1413                         }
1414
1415                         if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1416                                 vector<double> adF(nSeqsPerOTU[i]);
1417                                 vector<int> anL(nSeqsPerOTU[i]);
1418                                 
1419                                 for(int j=0;j<nSeqsPerOTU[i];j++){
1420                                         int index = cumNumSeqs[i] + j;
1421                                         int nI = seqIndex[index];
1422                                         int nIU = mapSeqToUnique[nI];
1423                                         
1424                                         int k;
1425                                         for(k=0;k<position;k++){
1426                                                 if(nIU == anL[k]){
1427                                                         break;
1428                                                 }
1429                                         }
1430                                         if(k == position){
1431                                                 anL[position] = nIU;
1432                                                 adF[position] = 0.0000;
1433                                                 position++;
1434                                         }                                               
1435                                 }
1436                                 
1437                                 for(int j=0;j<nSeqsPerOTU[i];j++){
1438                                         int index = cumNumSeqs[i] + j;
1439                                         int nI = seqIndex[index];
1440                                         
1441                                         double tauValue = singleTau[seqNumber[index]];
1442                                         
1443                                         for(int k=0;k<position;k++){
1444                                                 double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1445                                                 adF[k] += dist * tauValue;
1446                                         }
1447                                 }
1448                                 
1449                                 for(int j=0;j<position;j++){
1450                                         if(adF[j] < minFlowValue){
1451                                                 minFlowGram = j;
1452                                                 minFlowValue = adF[j];
1453                                         }
1454                                 }
1455                                 
1456                                 if(centroids[i] != anL[minFlowGram]){
1457                                         change[i] = 1;
1458                                         centroids[i] = anL[minFlowGram];
1459                                 }
1460                         }
1461                         else if(centroids[i] != -1){
1462                                 change[i] = 1;
1463                                 centroids[i] = -1;                      
1464                         }
1465                 }
1466         }
1467         catch(exception& e) {
1468                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1469                 exit(1);        
1470         }               
1471 }
1472
1473 /**************************************************************************************************/
1474
1475 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1476         try{
1477                 
1478                 int flowAValue = cent * numFlowCells;
1479                 int flowBValue = flow * numFlowCells;
1480                 
1481                 double dist = 0;
1482                 
1483                 for(int i=0;i<length;i++){
1484                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1485                         flowAValue++;
1486                         flowBValue++;
1487                 }
1488                 
1489                 return dist / (double)length;
1490         }
1491         catch(exception& e) {
1492                 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1493                 exit(1);        
1494         }               
1495 }
1496
1497 /**************************************************************************************************/
1498
1499 double ShhherCommand::getNewWeights(){
1500         try{
1501                 
1502                 double maxChange = 0;
1503                 
1504                 for(int i=0;i<numOTUs;i++){
1505                         
1506                         double difference = weight[i];
1507                         weight[i] = 0;
1508                         
1509                         for(int j=0;j<nSeqsPerOTU[i];j++){
1510                                 int index = cumNumSeqs[i] + j;
1511                                 double tauValue = singleTau[seqNumber[index]];
1512                                 weight[i] += tauValue;
1513                         }
1514                         
1515                         difference = fabs(weight[i] - difference);
1516                         if(difference > maxChange){     maxChange = difference; }
1517                 }
1518                 return maxChange;
1519         }
1520         catch(exception& e) {
1521                 m->errorOut(e, "ShhherCommand", "getNewWeights");
1522                 exit(1);        
1523         }               
1524 }
1525
1526 /**************************************************************************************************/
1527
1528 double ShhherCommand::getLikelihood(){
1529         
1530         try{
1531                 
1532                 vector<long double> P(numSeqs, 0);
1533                 int effNumOTUs = 0;
1534                 
1535                 for(int i=0;i<numOTUs;i++){
1536                         if(weight[i] > MIN_WEIGHT){
1537                                 effNumOTUs++;
1538                         }
1539                 }
1540                 
1541                 string hold;
1542                 for(int i=0;i<numOTUs;i++){
1543                         for(int j=0;j<nSeqsPerOTU[i];j++){
1544                                 int index = cumNumSeqs[i] + j;
1545                                 int nI = seqIndex[index];
1546                                 double singleDist = dist[seqNumber[index]];
1547                                 
1548                                 P[nI] += weight[i] * exp(-singleDist * sigma);
1549                         }
1550                 }
1551                 double nLL = 0.00;
1552                 for(int i=0;i<numSeqs;i++){
1553                         if(P[i] == 0){  P[i] = DBL_EPSILON;     }
1554
1555                         nLL += -log(P[i]);
1556                 }
1557                 
1558                 nLL = nLL -(double)numSeqs * log(sigma);
1559
1560                 return nLL; 
1561         }
1562         catch(exception& e) {
1563                 m->errorOut(e, "ShhherCommand", "getNewWeights");
1564                 exit(1);        
1565         }               
1566 }
1567
1568 /**************************************************************************************************/
1569
1570 void ShhherCommand::checkCentroids(){
1571         try{
1572                 vector<int> unique(numOTUs, 1);
1573                 
1574                 for(int i=0;i<numOTUs;i++){
1575                         if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1576                                 unique[i] = -1;
1577                         }
1578                 }
1579                 
1580                 for(int i=0;i<numOTUs;i++){
1581                         if(unique[i] == 1){
1582                                 for(int j=i+1;j<numOTUs;j++){
1583                                         if(unique[j] == 1){
1584                                                 
1585                                                 if(centroids[j] == centroids[i]){
1586                                                         unique[j] = 0;
1587                                                         centroids[j] = -1;
1588                                                         
1589                                                         weight[i] += weight[j];
1590                                                         weight[j] = 0.0;
1591                                                 }
1592                                         }
1593                                 }
1594                         }
1595                 }
1596         }
1597         catch(exception& e) {
1598                 m->errorOut(e, "ShhherCommand", "checkCentroids");
1599                 exit(1);        
1600         }               
1601 }
1602
1603 /**************************************************************************************************/
1604
1605 void ShhherCommand::calcNewDistances(){                          
1606         try{
1607                 
1608 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1609                 
1610                 if(processors == 1)     {
1611                         calcNewDistancesParent(0, numSeqs);             
1612                 }
1613                 else{ //you have multiple processors
1614                         if (numSeqs < processors){      processors = 1; }
1615                         
1616                         vector<vector<int> > child_otuIndex(processors);
1617                         vector<vector<int> > child_seqIndex(processors);
1618                         vector<vector<double> > child_singleTau(processors);                    
1619                         vector<int> totals(processors);
1620                         
1621                         int process = 1;
1622                         vector<int> processIDs;
1623
1624                         //loop through and create all the processes you want
1625                         while (process != processors) {
1626                                 int pid = vfork();
1627                                 
1628                                 if (pid > 0) {
1629                                         processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1630                                         process++;
1631                                 }else if (pid == 0){
1632                                         calcNewDistancesChild(nSeqsBreaks[process], nSeqsBreaks[process+1], child_otuIndex[process], child_seqIndex[process], child_singleTau[process]);
1633                                         totals[process] = child_otuIndex[process].size();
1634
1635                                         exit(0);
1636                                 }else { 
1637                                         m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
1638                                         perror(" : ");
1639                                         for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
1640                                         exit(0);
1641                                 }
1642                         }
1643                         
1644                         //parent does its part
1645                         calcNewDistancesParent(nSeqsBreaks[0], nSeqsBreaks[1]);
1646                         int total = seqIndex.size();
1647
1648                         //force parent to wait until all the processes are done
1649                         for (int i=0;i<processIDs.size();i++) { 
1650                                 int temp = processIDs[i];
1651                                 wait(&temp);
1652                         }
1653
1654                         for(int i=1;i<processors;i++){
1655                                 int oldTotal = total;
1656                                 total += totals[i];
1657
1658                                 singleTau.resize(total, 0);
1659                                 seqIndex.resize(total, 0);
1660                                 seqNumber.resize(total, 0);
1661                                 
1662                                 int childIndex = 0;
1663                                 
1664                                 for(int j=oldTotal;j<total;j++){
1665                                         int otuI = child_otuIndex[i][childIndex];
1666                                         int seqI = child_seqIndex[i][childIndex];
1667
1668                                         singleTau[j] = child_singleTau[i][childIndex];
1669                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
1670                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
1671                                         nSeqsPerOTU[otuI]++;
1672
1673                                         childIndex++;
1674                                 }
1675                         }
1676                 }
1677 #else
1678                 calcNewDistancesParent(0, numSeqs);             
1679 #endif          
1680         }
1681         catch(exception& e) {
1682                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
1683                 exit(1);        
1684         }               
1685 }
1686
1687 /**************************************************************************************************/
1688 #ifdef USE_MPI
1689 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1690         
1691         try{
1692                 vector<double> newTau(numOTUs,0);
1693                 vector<double> norms(numSeqs, 0);
1694                 otuIndex.clear();
1695                 seqIndex.clear();
1696                 singleTau.clear();
1697                 
1698                 
1699                 
1700                 for(int i=startSeq;i<stopSeq;i++){
1701                         double offset = 1e8;
1702                         int indexOffset = i * numOTUs;
1703                         
1704                         for(int j=0;j<numOTUs;j++){
1705                                 
1706                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1707                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1708                                 }
1709                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1710                                         offset = dist[indexOffset + j];
1711                                 }
1712                         }
1713                         
1714                         for(int j=0;j<numOTUs;j++){
1715                                 if(weight[j] > MIN_WEIGHT){
1716                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1717                                         norms[i] += newTau[j];
1718                                 }
1719                                 else{
1720                                         newTau[j] = 0.0;
1721                                 }
1722                         }
1723                         
1724                         for(int j=0;j<numOTUs;j++){
1725
1726                                 newTau[j] /= norms[i];
1727                                 
1728                                 if(newTau[j] > MIN_TAU){
1729                                         otuIndex.push_back(j);
1730                                         seqIndex.push_back(i);
1731                                         singleTau.push_back(newTau[j]);
1732                                 }
1733                         }
1734                         
1735                 }
1736         }
1737         catch(exception& e) {
1738                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1739                 exit(1);        
1740         }               
1741 }
1742 #endif
1743 /**************************************************************************************************/
1744
1745 void ShhherCommand::calcNewDistancesChild(int startSeq, int stopSeq, vector<int>& child_otuIndex, vector<int>& child_seqIndex, vector<double>& child_singleTau){
1746         
1747         try{
1748                 vector<double> newTau(numOTUs,0);
1749                 vector<double> norms(numSeqs, 0);
1750                 child_otuIndex.resize(0);
1751                 child_seqIndex.resize(0);
1752                 child_singleTau.resize(0);
1753                 
1754                 for(int i=startSeq;i<stopSeq;i++){
1755                         double offset = 1e8;
1756                         int indexOffset = i * numOTUs;
1757                         
1758                         
1759                         for(int j=0;j<numOTUs;j++){
1760                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1761                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1762                                 }
1763                                 
1764                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1765                                         offset = dist[indexOffset + j];
1766                                 }
1767                         }
1768                         
1769                         for(int j=0;j<numOTUs;j++){
1770                                 if(weight[j] > MIN_WEIGHT){
1771                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1772                                         norms[i] += newTau[j];
1773                                 }
1774                                 else{
1775                                         newTau[j] = 0.0;
1776                                 }
1777                         }
1778                         
1779                         for(int j=0;j<numOTUs;j++){
1780                                 newTau[j] /= norms[i];
1781                                 
1782                                 if(newTau[j] > MIN_TAU){
1783                                         child_otuIndex.push_back(j);
1784                                         child_seqIndex.push_back(i);
1785                                         child_singleTau.push_back(newTau[j]);
1786                                 }
1787                         }
1788                 }
1789         }
1790         catch(exception& e) {
1791                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChild");
1792                 exit(1);        
1793         }               
1794 }
1795
1796 /**************************************************************************************************/
1797
1798 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1799         
1800         try{
1801                 
1802                 int total = 0;
1803                 vector<double> newTau(numOTUs,0);
1804                 vector<double> norms(numSeqs, 0);
1805                 nSeqsPerOTU.assign(numOTUs, 0);
1806                 
1807                 for(int i=startSeq;i<stopSeq;i++){
1808                         int indexOffset = i * numOTUs;
1809                         
1810                         double offset = 1e8;
1811                         
1812                         for(int j=0;j<numOTUs;j++){
1813                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1814                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1815                                 }
1816                                 
1817                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1818                                         offset = dist[indexOffset + j];
1819                                 }
1820                         }
1821                         
1822                         for(int j=0;j<numOTUs;j++){
1823                                 if(weight[j] > MIN_WEIGHT){
1824                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1825                                         norms[i] += newTau[j];
1826                                 }
1827                                 else{
1828                                         newTau[j] = 0.0;
1829                                 }
1830                         }
1831                         
1832                         for(int j=0;j<numOTUs;j++){
1833                                 newTau[j] /= norms[i];
1834                         }
1835                         
1836                         for(int j=0;j<numOTUs;j++){
1837                                 if(newTau[j] > MIN_TAU){
1838                                         
1839                                         int oldTotal = total;
1840                                         
1841                                         total++;
1842                                         
1843                                         singleTau.resize(total, 0);
1844                                         seqNumber.resize(total, 0);
1845                                         seqIndex.resize(total, 0);
1846                                         
1847                                         singleTau[oldTotal] = newTau[j];
1848                                         
1849                                         aaP[j][nSeqsPerOTU[j]] = oldTotal;
1850                                         aaI[j][nSeqsPerOTU[j]] = i;
1851                                         nSeqsPerOTU[j]++;
1852                                 }
1853                         }
1854                 }
1855         }
1856         catch(exception& e) {
1857                 m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1858                 exit(1);        
1859         }               
1860 }
1861
1862 /**************************************************************************************************/
1863
1864 void ShhherCommand::setOTUs(){
1865         
1866         try {
1867                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1868                 
1869                 for(int i=0;i<numOTUs;i++){
1870                         for(int j=0;j<nSeqsPerOTU[i];j++){
1871                                 int index = cumNumSeqs[i] + j;
1872                                 double tauValue = singleTau[seqNumber[index]];
1873                                 int sIndex = seqIndex[index];
1874                                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
1875                         }
1876                 }
1877                 
1878                 for(int i=0;i<numSeqs;i++){
1879                         double maxTau = -1.0000;
1880                         int maxOTU = -1;
1881                         for(int j=0;j<numOTUs;j++){
1882                                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1883                                         maxTau = bigTauMatrix[i * numOTUs + j];
1884                                         maxOTU = j;
1885                                 }
1886                         }
1887                         
1888                         otuData[i] = maxOTU;
1889                 }
1890                 
1891                 nSeqsPerOTU.assign(numOTUs, 0);         
1892                 
1893                 for(int i=0;i<numSeqs;i++){
1894                         int index = otuData[i];
1895                         
1896                         singleTau[i] = 1.0000;
1897                         dist[i] = 0.0000;
1898                         
1899                         aaP[index][nSeqsPerOTU[index]] = i;
1900                         aaI[index][nSeqsPerOTU[index]] = i;
1901                         
1902                         nSeqsPerOTU[index]++;
1903                 }
1904                 fill(); 
1905         }
1906         catch(exception& e) {
1907                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
1908                 exit(1);        
1909         }               
1910 }
1911
1912 /**************************************************************************************************/
1913
1914 void ShhherCommand::writeQualities(vector<int> otuCounts){
1915         
1916         try {
1917                 string qualityFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.qual";
1918
1919                 ofstream qualityFile;
1920                 m->openOutputFile(qualityFileName, qualityFile);
1921
1922                 qualityFile.setf(ios::fixed, ios::floatfield);
1923                 qualityFile.setf(ios::showpoint);
1924                 qualityFile << setprecision(6);
1925                 
1926                 vector<vector<int> > qualities(numOTUs);
1927                 vector<double> pr(HOMOPS, 0);
1928                 
1929                 
1930                 for(int i=0;i<numOTUs;i++){
1931                         int index = 0;
1932                         int base = 0;
1933                         
1934                         if(nSeqsPerOTU[i] > 0){
1935                                 qualities[i].assign(1024, -1);
1936                                 
1937                                 while(index < numFlowCells){
1938                                         double maxPrValue = 1e8;
1939                                         short maxPrIndex = -1;
1940                                         double count = 0.0000;
1941                                         
1942                                         pr.assign(HOMOPS, 0);
1943                                         
1944                                         for(int j=0;j<nSeqsPerOTU[i];j++){
1945                                                 int lIndex = cumNumSeqs[i] + j;
1946                                                 double tauValue = singleTau[seqNumber[lIndex]];
1947                                                 int sequenceIndex = aaI[i][j];
1948                                                 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1949                                                 
1950                                                 count += tauValue;
1951                                                 
1952                                                 for(int s=0;s<HOMOPS;s++){
1953                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1954                                                 }
1955                                         }
1956                                         
1957                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1958                                         maxPrValue = pr[maxPrIndex];
1959                                         
1960                                         if(count > MIN_COUNT){
1961                                                 double U = 0.0000;
1962                                                 double norm = 0.0000;
1963                                                 
1964                                                 for(int s=0;s<HOMOPS;s++){
1965                                                         norm += exp(-(pr[s] - maxPrValue));
1966                                                 }
1967                                                 
1968                                                 for(int s=1;s<=maxPrIndex;s++){
1969                                                         int value = 0;
1970                                                         double temp = 0.0000;
1971                                                         
1972                                                         U += exp(-(pr[s-1]-maxPrValue))/norm;
1973                                                         
1974                                                         if(U>0.00){
1975                                                                 temp = log10(U);
1976                                                         }
1977                                                         else{
1978                                                                 temp = -10.1;
1979                                                         }
1980                                                         temp = floor(-10 * temp);
1981                                                         value = (int)floor(temp);
1982                                                         if(value > 100){        value = 100;    }
1983                                                         
1984                                                         qualities[i][base] = (int)value;
1985                                                         base++;
1986                                                 }
1987                                         }
1988                                         
1989                                         index++;
1990                                 }
1991                         }
1992                         
1993                         
1994                         if(otuCounts[i] > 0){
1995                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1996                                 
1997                                 int j=4;        //need to get past the first four bases
1998                                 while(qualities[i][j] != -1){
1999                                         qualityFile << qualities[i][j] << ' ';
2000                                         j++;
2001                                 }
2002                                 qualityFile << endl;
2003                         }
2004                 }
2005                 qualityFile.close();
2006                 outputNames.push_back(qualityFileName);
2007
2008         }
2009         catch(exception& e) {
2010                 m->errorOut(e, "ShhherCommand", "writeQualities");
2011                 exit(1);        
2012         }               
2013 }
2014
2015 /**************************************************************************************************/
2016
2017 void ShhherCommand::writeSequences(vector<int> otuCounts){
2018         try {
2019                 
2020                 string fastaFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.fasta";
2021                 ofstream fastaFile;
2022                 m->openOutputFile(fastaFileName, fastaFile);
2023                 
2024                 vector<string> names(numOTUs, "");
2025                 
2026                 for(int i=0;i<numOTUs;i++){
2027                         int index = centroids[i];
2028                         
2029                         if(otuCounts[i] > 0){
2030                                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
2031                                 
2032                                 string newSeq = "";
2033                                 
2034                                 for(int j=0;j<numFlowCells;j++){
2035                                         
2036                                         char base = flowOrder[j % 4];
2037                                         for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
2038                                                 newSeq += base;
2039                                         }
2040                                 }
2041                                 
2042                                 fastaFile << newSeq.substr(4) << endl;
2043                         }
2044                 }
2045                 fastaFile.close();
2046
2047                 outputNames.push_back(fastaFileName);
2048
2049                 if(compositeFASTAFileName != ""){
2050                         m->appendFiles(fastaFileName, compositeFASTAFileName);
2051                 }
2052         }
2053         catch(exception& e) {
2054                 m->errorOut(e, "ShhherCommand", "writeSequences");
2055                 exit(1);        
2056         }               
2057 }
2058
2059 /**************************************************************************************************/
2060
2061 void ShhherCommand::writeNames(vector<int> otuCounts){
2062         try {
2063                 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
2064                 ofstream nameFile;
2065                 m->openOutputFile(nameFileName, nameFile);
2066                 
2067                 for(int i=0;i<numOTUs;i++){
2068                         if(otuCounts[i] > 0){
2069                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
2070                                 
2071                                 for(int j=1;j<nSeqsPerOTU[i];j++){
2072                                         nameFile << ',' << seqNameVector[aaI[i][j]];
2073                                 }
2074                                 
2075                                 nameFile << endl;
2076                         }
2077                 }
2078                 nameFile.close();
2079                 outputNames.push_back(nameFileName);
2080                 
2081                 
2082                 if(compositeNamesFileName != ""){
2083                         m->appendFiles(nameFileName, compositeNamesFileName);
2084                 }               
2085         }
2086         catch(exception& e) {
2087                 m->errorOut(e, "ShhherCommand", "writeNames");
2088                 exit(1);        
2089         }               
2090 }
2091
2092 /**************************************************************************************************/
2093
2094 void ShhherCommand::writeGroups(){
2095         try {
2096                 string fileRoot = flowFileName.substr(0,flowFileName.find_last_of('.'));
2097                 string groupFileName = fileRoot + ".shhh.groups";
2098                 ofstream groupFile;
2099                 m->openOutputFile(groupFileName, groupFile);
2100                 
2101                 for(int i=0;i<numSeqs;i++){
2102                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
2103                 }
2104                 groupFile.close();
2105                 outputNames.push_back(groupFileName);
2106
2107         }
2108         catch(exception& e) {
2109                 m->errorOut(e, "ShhherCommand", "writeGroups");
2110                 exit(1);        
2111         }               
2112 }
2113
2114 /**************************************************************************************************/
2115
2116 void ShhherCommand::writeClusters(vector<int> otuCounts){
2117         try {
2118                 string otuCountsFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.counts";
2119                 ofstream otuCountsFile;
2120                 m->openOutputFile(otuCountsFileName, otuCountsFile);
2121                 
2122                 string bases = flowOrder;
2123                 
2124                 for(int i=0;i<numOTUs;i++){
2125                         //output the translated version of the centroid sequence for the otu
2126                         if(otuCounts[i] > 0){
2127                                 int index = centroids[i];
2128                                 
2129                                 otuCountsFile << "ideal\t";
2130                                 for(int j=8;j<numFlowCells;j++){
2131                                         char base = bases[j % 4];
2132                                         for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
2133                                                 otuCountsFile << base;
2134                                         }
2135                                 }
2136                                 otuCountsFile << endl;
2137                                 
2138                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2139                                         int sequence = aaI[i][j];
2140                                         otuCountsFile << seqNameVector[sequence] << '\t';
2141                                         
2142                                         string newSeq = "";
2143                                         
2144                                         for(int k=0;k<lengths[sequence];k++){
2145                                                 char base = bases[k % 4];
2146                                                 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
2147                                                         
2148                                                 for(int s=0;s<freq;s++){
2149                                                         newSeq += base;
2150                                                         //otuCountsFile << base;
2151                                                 }
2152                                         }
2153                                         otuCountsFile << newSeq.substr(4) << endl;
2154                                 }
2155                                 otuCountsFile << endl;
2156                         }
2157                 }
2158                 otuCountsFile.close();
2159                 outputNames.push_back(otuCountsFileName);
2160
2161         }
2162         catch(exception& e) {
2163                 m->errorOut(e, "ShhherCommand", "writeClusters");
2164                 exit(1);        
2165         }               
2166 }
2167
2168 //**********************************************************************************************************************