]> git.donarmstrong.com Git - mothur.git/blob - shhhercommand.cpp
small change to shhh.seqs
[mothur.git] / shhhercommand.cpp
1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "shhhercommand.h"
11
12 #include "readcolumn.h"
13 #include "readmatrix.hpp"
14 #include "rabundvector.hpp"
15 #include "sabundvector.hpp"
16 #include "listvector.hpp"
17 #include "cluster.hpp"
18 #include "sparsematrix.hpp"
19 #include <cfloat>
20
21 //**********************************************************************************************************************
22
23 #define NUMBINS 1000
24 #define HOMOPS 10
25 #define MIN_COUNT 0.1
26 #define MIN_WEIGHT 0.1
27 #define MIN_TAU 0.0001
28 #define MIN_ITER 10
29 //**********************************************************************************************************************
30 vector<string> ShhherCommand::setParameters(){  
31         try {
32                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pflow);
33                 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pfile);
34                 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plookup);
35                 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff);
36                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
37                 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter);
38                 CommandParameter psigma("sigma", "Number", "", "60", "", "", "",false,false); parameters.push_back(psigma);
39                 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "",false,false); parameters.push_back(pmindelta);
40                 CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
41                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
42                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
43                 
44                 vector<string> myArray;
45                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
46                 return myArray;
47         }
48         catch(exception& e) {
49                 m->errorOut(e, "ShhherCommand", "setParameters");
50                 exit(1);
51         }
52 }
53 //**********************************************************************************************************************
54 string ShhherCommand::getHelpString(){  
55         try {
56                 string helpString = "";
57                 helpString += "The shhh.seqs command reads a file containing flowgrams and creates a file of corrected sequences.\n";
58                 return helpString;
59         }
60         catch(exception& e) {
61                 m->errorOut(e, "ShhherCommand", "getHelpString");
62                 exit(1);
63         }
64 }
65 //**********************************************************************************************************************
66
67 ShhherCommand::ShhherCommand(){ 
68         try {
69                 abort = true; calledHelp = true;
70                 setParameters();
71                 
72                 //initialize outputTypes
73 //              vector<string> tempOutNames;
74 //              outputTypes["pn.dist"] = tempOutNames;
75
76         }
77         catch(exception& e) {
78                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
79                 exit(1);
80         }
81 }
82
83 //**********************************************************************************************************************
84
85 ShhherCommand::ShhherCommand(string option) {
86         try {
87
88 #ifdef USE_MPI
89                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
90                 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
91
92                 if(pid == 0){
93 #endif
94                 
95                 
96                 abort = false; calledHelp = false;   
97                 
98                 
99                 //allow user to run help
100                 if(option == "help") { help(); abort = true; calledHelp = true; }
101                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
102                 
103                 else {
104                         vector<string> myArray = setParameters();
105                         
106                         OptionParser parser(option);
107                         map<string,string> parameters = parser.getParameters();
108                         
109                         ValidParameters validParameter;
110                         map<string,string>::iterator it;
111                         
112                         //check to make sure all parameters are valid for command
113                         for (it = parameters.begin(); it != parameters.end(); it++) { 
114                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
115                         }
116                         
117                         //initialize outputTypes
118                         vector<string> tempOutNames;
119 //                      outputTypes["pn.dist"] = tempOutNames;
120                         //                      outputTypes["fasta"] = tempOutNames;
121                         
122                         //if the user changes the input directory command factory will send this info to us in the output parameter 
123                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
124                         if (inputDir == "not found"){   inputDir = "";          }
125                         else {
126                                 string path;
127                                 it = parameters.find("flow");
128                                 //user has given a template file
129                                 if(it != parameters.end()){ 
130                                         path = m->hasPath(it->second);
131                                         //if the user has not given a path then, add inputdir. else leave path alone.
132                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
133                                 }
134                                 
135                                 it = parameters.find("lookup");
136                                 //user has given a template file
137                                 if(it != parameters.end()){ 
138                                         path = m->hasPath(it->second);
139                                         //if the user has not given a path then, add inputdir. else leave path alone.
140                                         if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
141                                 }
142
143                                 it = parameters.find("file");
144                                 //user has given a template file
145                                 if(it != parameters.end()){ 
146                                         path = m->hasPath(it->second);
147                                         //if the user has not given a path then, add inputdir. else leave path alone.
148                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
149                                 }
150                         }
151                         
152                         
153                         //check for required parameters
154                         flowFileName = validParameter.validFile(parameters, "flow", true);
155                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
156                         if (flowFileName == "not found" && flowFilesFileName == "not found") {
157                                 m->mothurOut("values for either flow or file must be provided for the shhh.seqs command.");
158                                 m->mothurOutEndLine();
159                                 abort = true; 
160                         }
161                         else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
162                         
163                         if(flowFileName != "not found"){
164                                 compositeFASTAFileName = "";    
165                                 compositeNamesFileName = "";    
166                         }
167                         else{
168                                 ofstream temp;
169
170                                 //flow.files = 9 character offset
171                                 compositeFASTAFileName = flowFilesFileName.substr(0, flowFilesFileName.length()-10) + "shhh.fasta";
172                                 m->openOutputFile(compositeFASTAFileName, temp);
173                                 temp.close();
174                                 
175                                 compositeNamesFileName = flowFilesFileName.substr(0, flowFilesFileName.length()-10) + "shhh.names";
176                                 m->openOutputFile(compositeNamesFileName, temp);
177                                 temp.close();
178                         }
179                         
180                         //if the user changes the output directory command factory will send this info to us in the output parameter 
181                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
182                                 outputDir = ""; 
183                                 outputDir += m->hasPath(flowFileName); //if user entered a file with a path then preserve it    
184                         }
185                         
186                         
187                         //check for optional parameter and set defaults
188                         // ...at some point should added some additional type checking...
189                         string temp;
190                         temp = validParameter.validFile(parameters, "lookup", true);
191                         if (temp == "not found")        {       
192                                 lookupFileName = "LookUp_Titanium.pat"; 
193                                 
194                                 int ableToOpen;
195                                 ifstream in;
196                                 ableToOpen = m->openInputFile(lookupFileName, in, "noerror");
197                                 in.close();     
198                                 
199                                 //if you can't open it, try input location
200                                 if (ableToOpen == 1) {
201                                         if (inputDir != "") { //default path is set
202                                                 string tryPath = inputDir + lookupFileName;
203                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
204                                                 ifstream in2;
205                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
206                                                 in2.close();
207                                                 lookupFileName = tryPath;
208                                         }
209                                 }
210                                 
211                                 //if you can't open it, try default location
212                                 if (ableToOpen == 1) {
213                                         if (m->getDefaultPath() != "") { //default path is set
214                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName);
215                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine();
216                                                 ifstream in2;
217                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
218                                                 in2.close();
219                                                 lookupFileName = tryPath;
220                                         }
221                                 }
222                                 
223                                 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
224                                 if (ableToOpen == 1) {
225                                         string exepath = m->argv;
226                                         string tempPath = exepath;
227                                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
228                                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
229                                         
230                                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
231                                         m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
232                                         ifstream in2;
233                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
234                                         in2.close();
235                                         lookupFileName = tryPath;
236                                 }
237                                 
238                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
239                         }
240                         else if(temp == "not open")     {       
241                                 
242                                 lookupFileName = validParameter.validFile(parameters, "lookup", false);
243                                 
244                                 //if you can't open it its not inputDir, try mothur excutable location
245                                 string exepath = m->argv;
246                                 string tempPath = exepath;
247                                 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
248                                 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
249                                         
250                                 string tryPath = m->getFullPathName(exepath) + lookupFileName;
251                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
252                                 ifstream in2;
253                                 int ableToOpen = m->openInputFile(tryPath, in2, "noerror");
254                                 in2.close();
255                                 lookupFileName = tryPath;
256                                 
257                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
258                         }else                                           {       lookupFileName = temp;  }
259                         
260                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
261                         m->setProcessors(temp);
262                         convert(temp, processors);
263
264                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
265                         convert(temp, cutoff); 
266                         
267                         temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
268                         convert(temp, minDelta); 
269
270                         temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
271                         convert(temp, maxIters); 
272
273                         temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
274                         convert(temp, sigma); 
275                         
276                         flowOrder = validParameter.validFile(parameters, "order", false);
277                         if (flowOrder == "not found"){ flowOrder = "TACG";              }
278                         else if(flowOrder.length() != 4){
279                                 m->mothurOut("The value of the order option must be four bases long\n");
280                         }
281                         
282                 }
283                         
284 #ifdef USE_MPI
285                 }                               
286 #endif
287                                 
288         }
289         catch(exception& e) {
290                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
291                 exit(1);
292         }
293 }
294 //**********************************************************************************************************************
295 #ifdef USE_MPI
296 int ShhherCommand::execute(){
297         try {
298                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
299                 
300                 int tag = 1976;
301                 MPI_Status status; 
302
303                 if(pid == 0){
304
305                         for(int i=1;i<ncpus;i++){
306                                 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
307                         }
308                         if(abort == 1){ return 0;       }
309
310                         processors = ncpus;
311                         
312                         m->mothurOut("\nGetting preliminary data...\n");
313                         getSingleLookUp();
314                         getJointLookUp();
315                         
316                         vector<string> flowFileVector;
317                         if(flowFilesFileName != "not found"){
318                                 string fName;
319
320                                 ifstream flowFilesFile;
321                                 m->openInputFile(flowFilesFileName, flowFilesFile);
322                                 while(flowFilesFile){
323                                         flowFilesFile >> fName;
324                                         flowFileVector.push_back(fName);
325                                         m->gobble(flowFilesFile);
326                                 }
327                         }
328                         else{
329                                 flowFileVector.push_back(flowFileName);
330                         }
331                         int numFiles = flowFileVector.size();
332
333                         for(int i=1;i<ncpus;i++){
334                                 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
335                         }
336                         
337                         for(int i=0;i<numFiles;i++){
338                                 double begClock = clock();
339                                 unsigned long int begTime = time(NULL);
340
341                                 flowFileName = flowFileVector[i];
342                                 
343                                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
344                                 m->mothurOut("Reading flowgrams...\n");
345                                 getFlowData();
346
347                                 m->mothurOut("Identifying unique flowgrams...\n");
348                                 getUniques();
349
350                                 m->mothurOut("Calculating distances between flowgrams...\n");
351                                 char fileName[1024];
352                                 strcpy(fileName, flowFileName.c_str());
353
354                                 for(int i=1;i<ncpus;i++){
355                                         MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
356
357                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
358                                         MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
359                                         MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
360                                         MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
361                                         MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
362                                         MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
363                                         MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
364                                         MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
365                                         MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
366                                         MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
367                                 }                               
368                                                         
369                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
370                                 
371                                 int done;
372                                 for(int i=1;i<ncpus;i++){
373                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
374                                         
375                                         m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
376                                         remove((distFileName + ".temp." + toString(i)).c_str());
377                                 }
378
379                                 string namesFileName = createNamesFile();
380                                 
381                                 m->mothurOut("\nClustering flowgrams...\n");
382                                 string listFileName = cluster(distFileName, namesFileName);
383
384                                 getOTUData(listFileName);
385
386                                 remove(distFileName.c_str());
387                                 remove(namesFileName.c_str());
388                                 remove(listFileName.c_str());
389                                 
390                                 initPyroCluster();
391
392                                 for(int i=1;i<ncpus;i++){
393                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
394                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
395                                         MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
396                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
397                                 }
398                                 
399                                 
400                                 double maxDelta = 0;
401                                 int iter = 0;
402                                 
403                                 int numOTUsOnCPU = numOTUs / ncpus;
404                                 int numSeqsOnCPU = numSeqs / ncpus;
405                                 m->mothurOut("\nDenoising flowgrams...\n");
406                                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
407                                 
408                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
409
410                                         double cycClock = clock();
411                                         unsigned long int cycTime = time(NULL);
412                                         fill();
413
414                                         int total = singleTau.size();
415                                         for(int i=1;i<ncpus;i++){
416                                                 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
417                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
418                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
419                                                 
420                                                 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
421                                                 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
422                                                 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
423                                                 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
424                                                 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
425                                         }
426                                         
427                                         calcCentroidsDriver(0, numOTUsOnCPU);
428                                         
429                                         for(int i=1;i<ncpus;i++){
430                                                 int otuStart = i * numOTUs / ncpus;
431                                                 int otuStop = (i + 1) * numOTUs / ncpus;
432                                                 
433                                                 vector<int> tempCentroids(numOTUs, 0);
434                                                 vector<short> tempChange(numOTUs, 0);
435                                                 
436                                                 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
437                                                 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
438                                                 
439                                                 for(int j=otuStart;j<otuStop;j++){
440                                                         centroids[j] = tempCentroids[j];
441                                                         change[j] = tempChange[j];
442                                                 }
443                                         }
444                                                                         
445                                         maxDelta = getNewWeights();
446                                         double nLL = getLikelihood();
447                                         checkCentroids();
448                                         
449                                         for(int i=1;i<ncpus;i++){
450                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
451                                                 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
452                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
453                                         }
454                                         
455                                         calcNewDistancesParent(0, numSeqsOnCPU);
456
457                                         total = singleTau.size();
458
459                                         for(int i=1;i<ncpus;i++){
460                                                 int childTotal;
461                                                 int seqStart = i * numSeqs / ncpus;
462                                                 int seqStop = (i + 1) * numSeqs / ncpus;
463                                                 
464                                                 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
465
466                                                 vector<int> childSeqIndex(childTotal, 0);
467                                                 vector<double> childSingleTau(childTotal, 0);
468                                                 vector<double> childDist(numSeqs * numOTUs, 0);
469                                                 vector<int> otuIndex(childTotal, 0);
470                                                 
471                                                 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
472                                                 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
473                                                 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
474                                                 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
475
476                                                 int oldTotal = total;
477                                                 total += childTotal;
478                                                 singleTau.resize(total, 0);
479                                                 seqIndex.resize(total, 0);
480                                                 seqNumber.resize(total, 0);
481                                                 
482                                                 int childIndex = 0;
483                                                 
484                                                 for(int j=oldTotal;j<total;j++){
485                                                         int otuI = otuIndex[childIndex];
486                                                         int seqI = childSeqIndex[childIndex];
487                                                         
488                                                         singleTau[j] = childSingleTau[childIndex];
489                                                         
490                                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
491                                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
492                                                         nSeqsPerOTU[otuI]++;
493                                                         childIndex++;
494                                                 }
495                                                 
496                                                 int index = seqStart * numOTUs;
497                                                 for(int j=seqStart;j<seqStop;j++){
498                                                         for(int k=0;k<numOTUs;k++){
499                                                                 dist[index] = childDist[index];
500                                                                 index++;
501                                                         }
502                                                 }                                       
503                                         }
504                                         
505                                         iter++;
506                                         
507                                         m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');                  
508
509                                         if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
510                                                 int live = 1;
511                                                 for(int i=1;i<ncpus;i++){
512                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
513                                                 }
514                                         }
515                                         else{
516                                                 int live = 0;
517                                                 for(int i=1;i<ncpus;i++){
518                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
519                                                 }
520                                         }
521                                         
522                                 }       
523                                 
524                                 m->mothurOut("\nFinalizing...\n");
525                                 fill();
526                                 setOTUs();
527                                 
528                                 vector<int> otuCounts(numOTUs, 0);
529                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
530                                 calcCentroidsDriver(0, numOTUs);
531                                 
532                                 writeQualities(otuCounts);
533                                 writeSequences(otuCounts);
534                                 writeNames(otuCounts);
535                                 writeClusters(otuCounts);
536                                 writeGroups();
537                                 
538                                                                  
539                                 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');                 
540                         }
541                 }
542                 else{
543                         int abort = 1;
544
545                         MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
546                         if(abort){      return 0;       }
547
548                         int numFiles;
549                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
550
551                         for(int i=0;i<numFiles;i++){
552                                 //Now into the pyrodist part
553                                 bool live = 1;
554
555                                 char fileName[1024];
556                                 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
557                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
558                                 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
559                                 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
560                                 
561                                 flowDataIntI.resize(numSeqs * numFlowCells);
562                                 flowDataPrI.resize(numSeqs * numFlowCells);
563                                 mapUniqueToSeq.resize(numSeqs);
564                                 mapSeqToUnique.resize(numSeqs);
565                                 lengths.resize(numSeqs);
566                                 jointLookUp.resize(NUMBINS * NUMBINS);
567
568                                 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
569                                 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
570                                 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
571                                 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
572                                 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
573                                 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
574                                 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
575
576                                 flowFileName = string(fileName);
577                                 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
578                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
579                                 
580                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
581
582                                 int done = 1;
583                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
584
585                                 //Now into the pyronoise part
586                                 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
587                                 
588                                 singleLookUp.resize(HOMOPS * NUMBINS);
589                                 uniqueFlowgrams.resize(numUniques * numFlowCells);
590                                 weight.resize(numOTUs);
591                                 centroids.resize(numOTUs);
592                                 change.resize(numOTUs);
593                                 dist.assign(numOTUs * numSeqs, 0);
594                                 nSeqsPerOTU.resize(numOTUs);
595                                 cumNumSeqs.resize(numOTUs);
596
597                                 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
598                                 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
599                                 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
600                                 
601                                 int startOTU = pid * numOTUs / ncpus;
602                                 int endOTU = (pid + 1) * numOTUs / ncpus;
603
604                                 int startSeq = pid * numSeqs / ncpus;
605                                 int endSeq = (pid + 1) * numSeqs /ncpus;
606                                 
607                                 int total;
608
609                                 while(live){
610                                         
611                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
612                                         singleTau.assign(total, 0.0000);
613                                         seqNumber.assign(total, 0);
614                                         seqIndex.assign(total, 0);
615
616                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
617                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
618                                         MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
619                                         MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
620                                         MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
621                                         MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
622                                         MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
623
624                                         calcCentroidsDriver(startOTU, endOTU);
625
626                                         MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
627                                         MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
628
629                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
630                                         MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
631                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
632
633                                         vector<int> otuIndex(total, 0);
634                                         calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
635                                         total = otuIndex.size();
636                                         
637                                         MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
638                                         MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
639                                         MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
640                                         MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
641                                         MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
642                                 
643                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
644                                 }
645                         }
646                 }               
647                 MPI_Barrier(MPI_COMM_WORLD);
648
649                 
650                 if(compositeFASTAFileName != ""){
651                         outputNames.push_back(compositeFASTAFileName);
652                         outputNames.push_back(compositeNamesFileName);
653                 }
654
655                 m->mothurOutEndLine();
656                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
657                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
658                 m->mothurOutEndLine();
659                 
660                 return 0;
661
662         }
663         catch(exception& e) {
664                 m->errorOut(e, "ShhherCommand", "execute");
665                 exit(1);
666         }
667 }
668
669 /**************************************************************************************************/
670
671 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
672         try{            
673                 ostringstream outStream;
674                 outStream.setf(ios::fixed, ios::floatfield);
675                 outStream.setf(ios::dec, ios::basefield);
676                 outStream.setf(ios::showpoint);
677                 outStream.precision(6);
678                 
679                 int begTime = time(NULL);
680                 double begClock = clock();
681                 
682                 for(int i=startSeq;i<stopSeq;i++){
683                         for(int j=0;j<i;j++){
684                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
685                                 
686                                 if(flowDistance < 1e-6){
687                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
688                                 }
689                                 else if(flowDistance <= cutoff){
690                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
691                                 }
692                         }
693                         if(i % 100 == 0){
694                                 m->mothurOut(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
695                         }
696                 }
697                 
698                 m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
699                 
700                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
701                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
702
703                 ofstream distFile(fDistFileName.c_str());
704                 distFile << outStream.str();            
705                 distFile.close();
706                 
707                 return fDistFileName;
708         }
709         catch(exception& e) {
710                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
711                 exit(1);
712         }
713 }
714
715 #else
716 //**********************************************************************************************************************
717
718 int ShhherCommand::execute(){
719         try {
720                 if (abort == true) { return 0; }
721                 
722                 getSingleLookUp();
723                 getJointLookUp();
724                                 
725                 vector<string> flowFileVector;
726                 if(flowFilesFileName != "not found"){
727                         string fName;
728                         
729                         ifstream flowFilesFile;
730                         m->openInputFile(flowFilesFileName, flowFilesFile);
731                         while(flowFilesFile){
732                                 flowFilesFile >> fName;
733                                 flowFileVector.push_back(fName);
734                                 m->gobble(flowFilesFile);
735                         }
736                 }
737                 else{
738                         flowFileVector.push_back(flowFileName);
739                 }
740                 int numFiles = flowFileVector.size();
741                 
742                 
743                 for(int i=0;i<numFiles;i++){
744                         flowFileName = flowFileVector[i];
745
746                         m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
747                         m->mothurOut("Reading flowgrams...\n");
748                         getFlowData();
749                         
750                         m->mothurOut("Identifying unique flowgrams...\n");
751                         getUniques();
752                         
753                         
754                         m->mothurOut("Calculating distances between flowgrams...\n");
755                         string distFileName = createDistFile(processors);
756                         string namesFileName = createNamesFile();
757                                 
758                         m->mothurOut("\nClustering flowgrams...\n");
759                         string listFileName = cluster(distFileName, namesFileName);
760                         getOTUData(listFileName);
761                         
762                         remove(distFileName.c_str());
763                         remove(namesFileName.c_str());
764                         remove(listFileName.c_str());
765                         
766                         initPyroCluster();
767                         
768                         double maxDelta = 0;
769                         int iter = 0;
770                         
771                         double begClock = clock();
772                         unsigned long int begTime = time(NULL);
773
774                         m->mothurOut("\nDenoising flowgrams...\n");
775                         m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
776                         
777                         while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
778                                 
779                                 double cycClock = clock();
780                                 unsigned long int cycTime = time(NULL);
781                                 fill();
782                                 
783                                 calcCentroids();
784                                 
785                                 maxDelta = getNewWeights();
786                                 double nLL = getLikelihood();
787                                 checkCentroids();
788                                 
789                                 calcNewDistances();
790
791                                 iter++;
792                                 
793                                 m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
794
795                         }       
796                         
797                         m->mothurOut("\nFinalizing...\n");
798                         fill();
799                         setOTUs();
800                         
801                         vector<int> otuCounts(numOTUs, 0);
802                         for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
803                         
804                         calcCentroidsDriver(0, numOTUs);
805                         writeQualities(otuCounts);
806                         writeSequences(otuCounts);
807                         writeNames(otuCounts);
808                         writeClusters(otuCounts);
809                         writeGroups();
810                         
811                         m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
812                 }
813                 
814                 if(compositeFASTAFileName != ""){
815                         outputNames.push_back(compositeFASTAFileName);
816                         outputNames.push_back(compositeNamesFileName);
817                 }
818
819                 m->mothurOutEndLine();
820                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
821                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
822                 m->mothurOutEndLine();
823                 
824                 return 0;
825         }
826         catch(exception& e) {
827                 m->errorOut(e, "ShhherCommand", "execute");
828                 exit(1);
829         }
830 }
831 #endif
832 /**************************************************************************************************/
833
834 void ShhherCommand::getFlowData(){
835         try{
836                 ifstream flowFile;
837                 m->openInputFile(flowFileName, flowFile);
838                 
839                 string seqName;
840                 seqNameVector.clear();
841                 lengths.clear();
842                 flowDataIntI.clear();
843                 nameMap.clear();
844                 
845                 
846                 int currentNumFlowCells;
847                 
848                 float intensity;
849                 
850                 flowFile >> numFlowCells;
851                 int index = 0;//pcluster
852                 while(!flowFile.eof()){
853                         flowFile >> seqName >> currentNumFlowCells;
854                         lengths.push_back(currentNumFlowCells);
855
856                         seqNameVector.push_back(seqName);
857                         nameMap[seqName] = index++;//pcluster
858
859                         for(int i=0;i<numFlowCells;i++){
860                                 flowFile >> intensity;
861                                 if(intensity > 9.99)    {       intensity = 9.99;       }
862                                 int intI = int(100 * intensity + 0.0001);
863                                 flowDataIntI.push_back(intI);
864                         }
865                         m->gobble(flowFile);
866                 }
867                 flowFile.close();
868                 
869                 numSeqs = seqNameVector.size();         
870                 
871                 for(int i=0;i<numSeqs;i++){
872                         int iNumFlowCells = i * numFlowCells;
873                         for(int j=lengths[i];j<numFlowCells;j++){
874                                 flowDataIntI[iNumFlowCells + j] = 0;
875                         }
876                 }
877                 
878         }
879         catch(exception& e) {
880                 m->errorOut(e, "ShhherCommand", "getFlowData");
881                 exit(1);
882         }
883 }
884
885 /**************************************************************************************************/
886
887 void ShhherCommand::getSingleLookUp(){
888         try{
889                 //      these are the -log probabilities that a signal corresponds to a particular homopolymer length
890                 singleLookUp.assign(HOMOPS * NUMBINS, 0);
891                 
892                 int index = 0;
893                 ifstream lookUpFile;
894                 m->openInputFile(lookupFileName, lookUpFile);
895                 
896                 for(int i=0;i<HOMOPS;i++){
897                         float logFracFreq;
898                         lookUpFile >> logFracFreq;
899                         
900                         for(int j=0;j<NUMBINS;j++)      {
901                                 lookUpFile >> singleLookUp[index];
902                                 index++;
903                         }
904                 }       
905                 lookUpFile.close();
906         }
907         catch(exception& e) {
908                 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
909                 exit(1);
910         }
911 }
912
913 /**************************************************************************************************/
914
915 void ShhherCommand::getJointLookUp(){
916         try{
917                 
918                 //      the most likely joint probability (-log) that two intenities have the same polymer length
919                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
920                 
921                 for(int i=0;i<NUMBINS;i++){
922                         for(int j=0;j<NUMBINS;j++){             
923                                 
924                                 double minSum = 100000000;
925                                 
926                                 for(int k=0;k<HOMOPS;k++){
927                                         double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
928                                         
929                                         if(sum < minSum)        {       minSum = sum;           }
930                                 }       
931                                 jointLookUp[i * NUMBINS + j] = minSum;
932                         }
933                 }
934         }
935         catch(exception& e) {
936                 m->errorOut(e, "ShhherCommand", "getJointLookUp");
937                 exit(1);
938         }
939 }
940
941 /**************************************************************************************************/
942
943 double ShhherCommand::getProbIntensity(int intIntensity){                          
944         try{
945
946                 double minNegLogProb = 100000000; 
947
948                 
949                 for(int i=0;i<HOMOPS;i++){//loop signal strength
950                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
951                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
952                 }
953                 
954                 return minNegLogProb;
955         }
956         catch(exception& e) {
957                 m->errorOut(e, "ShhherCommand", "getProbIntensity");
958                 exit(1);
959         }
960 }
961
962 /**************************************************************************************************/
963
964 void ShhherCommand::getUniques(){
965         try{
966                 
967                 
968                 numUniques = 0;
969                 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
970                 uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
971                 uniqueLengths.assign(numSeqs, 0);
972                 mapSeqToUnique.assign(numSeqs, -1);
973                 mapUniqueToSeq.assign(numSeqs, -1);
974                 
975                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
976                 
977                 for(int i=0;i<numSeqs;i++){
978                         int index = 0;
979                         
980                         vector<short> current(numFlowCells);
981                         for(int j=0;j<numFlowCells;j++){
982                                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
983                         }
984                                                 
985                         for(int j=0;j<numUniques;j++){
986                                 int offset = j * numFlowCells;
987                                 bool toEnd = 1;
988                                 
989                                 int shorterLength;
990                                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
991                                 else                                                            {       shorterLength = uniqueLengths[j];       }
992
993                                 for(int k=0;k<shorterLength;k++){
994                                         if(current[k] != uniqueFlowgrams[offset + k]){
995                                                 toEnd = 0;
996                                                 break;
997                                         }
998                                 }
999                                 
1000                                 if(toEnd){
1001                                         mapSeqToUnique[i] = j;
1002                                         uniqueCount[j]++;
1003                                         index = j;
1004                                         if(lengths[i] > uniqueLengths[j])       {       uniqueLengths[j] = lengths[i];  }
1005                                         break;
1006                                 }
1007                                 index++;
1008                         }
1009                         
1010                         if(index == numUniques){
1011                                 uniqueLengths[numUniques] = lengths[i];
1012                                 uniqueCount[numUniques] = 1;
1013                                 mapSeqToUnique[i] = numUniques;//anMap
1014                                 mapUniqueToSeq[numUniques] = i;//anF
1015                                 
1016                                 for(int k=0;k<numFlowCells;k++){
1017                                         uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
1018                                         uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
1019                                 }
1020                                 
1021                                 numUniques++;
1022                         }
1023                 }
1024                 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
1025                 uniqueLengths.resize(numUniques);       
1026                 
1027                 flowDataPrI.resize(numSeqs * numFlowCells, 0);
1028                 for(int i=0;i<flowDataPrI.size();i++)   {       flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);             }
1029         }
1030         catch(exception& e) {
1031                 m->errorOut(e, "ShhherCommand", "getUniques");
1032                 exit(1);
1033         }
1034 }
1035
1036 /**************************************************************************************************/
1037
1038 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
1039         try{
1040                 int minLength = lengths[mapSeqToUnique[seqA]];
1041                 if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
1042                 
1043                 int ANumFlowCells = seqA * numFlowCells;
1044                 int BNumFlowCells = seqB * numFlowCells;
1045                 
1046                 float dist = 0;
1047                 
1048                 for(int i=0;i<minLength;i++){
1049                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
1050                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
1051                         
1052                         int flowBIntI = flowDataIntI[BNumFlowCells + i];
1053                         float flowBPrI = flowDataPrI[BNumFlowCells + i];
1054                         dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
1055                 }
1056                 
1057                 dist /= (float) minLength;
1058                 return dist;
1059         }
1060         catch(exception& e) {
1061                 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
1062                 exit(1);
1063         }
1064 }
1065
1066 /**************************************************************************************************/
1067
1068 void ShhherCommand::flowDistParentFork(string distFileName, int startSeq, int stopSeq){
1069         try{            
1070
1071                 ostringstream outStream;
1072                 outStream.setf(ios::fixed, ios::floatfield);
1073                 outStream.setf(ios::dec, ios::basefield);
1074                 outStream.setf(ios::showpoint);
1075                 outStream.precision(6);
1076                 
1077                 int begTime = time(NULL);
1078                 double begClock = clock();
1079
1080                 for(int i=startSeq;i<stopSeq;i++){
1081                         for(int j=0;j<i;j++){
1082                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
1083
1084                                 if(flowDistance < 1e-6){
1085                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
1086                                 }
1087                                 else if(flowDistance <= cutoff){
1088                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
1089                                 }
1090                         }
1091                         if(i % 100 == 0){
1092                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
1093                                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
1094                                 m->mothurOutEndLine();
1095                         }
1096                 }
1097                 m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
1098                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
1099                 m->mothurOutEndLine();
1100                 
1101                 ofstream distFile(distFileName.c_str());
1102                 distFile << outStream.str();            
1103                 distFile.close();
1104         }
1105         catch(exception& e) {
1106                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
1107                 exit(1);
1108         }
1109 }
1110
1111 /**************************************************************************************************/
1112
1113 string ShhherCommand::createDistFile(int processors){
1114         try{
1115                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
1116                                 
1117                 unsigned long int begTime = time(NULL);
1118                 double begClock = clock();
1119
1120                 vector<int> start;
1121                 vector<int> end;
1122                 
1123 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1124                 if(processors == 1)     {       flowDistParentFork(fDistFileName, 0, numUniques);               }
1125                 else{ //you have multiple processors
1126                         
1127                         if (numSeqs < processors){      processors = 1; }
1128                         
1129                         vector<int> start(processors, 0);
1130                         vector<int> end(processors, 0);
1131                         
1132                         for (int i = 0; i < processors; i++) {
1133                                 start[i] = int(sqrt(float(i)/float(processors)) * numUniques);
1134                                 end[i] = int(sqrt(float(i+1)/float(processors)) * numUniques);
1135                         }
1136                         
1137                         int process = 1;
1138                         vector<int> processIDs;
1139                         
1140                         //loop through and create all the processes you want
1141                         while (process != processors) {
1142                                 int pid = fork();
1143                                 
1144                                 if (pid > 0) {
1145                                         processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1146                                         process++;
1147                                 }else if (pid == 0){
1148                                         flowDistParentFork(fDistFileName + toString(getpid()) + ".temp", start[process], end[process]);
1149                                         exit(0);
1150                                 }else { 
1151                                         m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
1152                                         perror(" : ");
1153                                         for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
1154                                         exit(0);
1155                                 }
1156                         }
1157                         
1158                         //parent does its part
1159                         flowDistParentFork(fDistFileName, start[0], end[0]);
1160                         
1161                         //force parent to wait until all the processes are done
1162                         for (int i=0;i<processIDs.size();i++) { 
1163                                 int temp = processIDs[i];
1164                                 wait(&temp);
1165                         }
1166                         
1167                         //append and remove temp files
1168                         for (int i=0;i<processIDs.size();i++) { 
1169                                 m->appendFiles((fDistFileName + toString(processIDs[i]) + ".temp"), fDistFileName);
1170                                 remove((fDistFileName + toString(processIDs[i]) + ".temp").c_str());
1171                         }
1172                         
1173                 }
1174                 
1175 #else
1176                 flowDistParentFork(fDistFileName, 0, numUniques);
1177 #endif
1178
1179                 m->mothurOutEndLine();
1180                 
1181                 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
1182                 
1183
1184                 return fDistFileName;
1185         }
1186         catch(exception& e) {
1187                 m->errorOut(e, "ShhherCommand", "createDistFile");
1188                 exit(1);
1189         }
1190         
1191 }
1192
1193 /**************************************************************************************************/
1194
1195 string ShhherCommand::createNamesFile(){
1196         try{
1197                 
1198                 vector<string> duplicateNames(numUniques, "");
1199                 for(int i=0;i<numSeqs;i++){
1200                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
1201                 }
1202                 
1203                 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
1204                 
1205                 ofstream nameFile;
1206                 m->openOutputFile(nameFileName, nameFile);
1207                 
1208                 for(int i=0;i<numUniques;i++){
1209 //                      nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
1210                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
1211                 }
1212                 
1213                 nameFile.close();
1214                 return  nameFileName;
1215         }
1216         catch(exception& e) {
1217                 m->errorOut(e, "ShhherCommand", "createNamesFile");
1218                 exit(1);
1219         }
1220 }
1221
1222 //**********************************************************************************************************************
1223
1224 string ShhherCommand::cluster(string distFileName, string namesFileName){
1225         try {
1226                 
1227                 ReadMatrix* read = new ReadColumnMatrix(distFileName);  
1228                 read->setCutoff(cutoff);
1229                 
1230                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1231                 clusterNameMap->readMap();
1232                 read->read(clusterNameMap);
1233                 
1234                 ListVector* list = read->getListVector();
1235                 SparseMatrix* matrix = read->getMatrix();
1236                 
1237                 delete read; 
1238                 delete clusterNameMap; 
1239                                 
1240                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1241                 
1242                 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
1243                 string tag = cluster->getTag();
1244                 
1245                 double clusterCutoff = cutoff;
1246                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1247                         cluster->update(clusterCutoff);
1248                 }
1249                 
1250                 list->setLabel(toString(cutoff));
1251                 
1252                 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
1253                 ofstream listFile;
1254                 m->openOutputFile(listFileName, listFile);
1255                 list->print(listFile);
1256                 listFile.close();
1257                 
1258                 delete matrix;  delete cluster; delete rabund; delete list;
1259         
1260                 return listFileName;
1261         }
1262         catch(exception& e) {
1263                 m->errorOut(e, "ShhherCommand", "cluster");
1264                 exit(1);        
1265         }               
1266 }
1267
1268 /**************************************************************************************************/
1269
1270 void ShhherCommand::getOTUData(string listFileName){
1271         try {
1272
1273                 ifstream listFile;
1274                 m->openInputFile(listFileName, listFile);
1275                 string label;
1276                 
1277                 listFile >> label >> numOTUs;
1278
1279                 otuData.assign(numSeqs, 0);
1280                 cumNumSeqs.assign(numOTUs, 0);
1281                 nSeqsPerOTU.assign(numOTUs, 0);
1282                 aaP.clear();aaP.resize(numOTUs);
1283                 
1284                 seqNumber.clear();
1285                 aaI.clear();
1286                 seqIndex.clear();
1287                 
1288                 string singleOTU = "";
1289                 
1290                 for(int i=0;i<numOTUs;i++){
1291
1292                         listFile >> singleOTU;
1293                         
1294                         istringstream otuString(singleOTU);
1295
1296                         while(otuString){
1297                                 
1298                                 string seqName = "";
1299                                 
1300                                 for(int j=0;j<singleOTU.length();j++){
1301                                         char letter = otuString.get();
1302                                         
1303                                         if(letter != ','){
1304                                                 seqName += letter;
1305                                         }
1306                                         else{
1307                                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
1308                                                 int index = nmIt->second;
1309                                                 
1310                                                 nameMap.erase(nmIt);
1311                                                 
1312                                                 otuData[index] = i;
1313                                                 nSeqsPerOTU[i]++;
1314                                                 aaP[i].push_back(index);
1315                                                 seqName = "";
1316                                         }
1317                                 }
1318                                 
1319                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
1320
1321                                 int index = nmIt->second;
1322                                 nameMap.erase(nmIt);
1323
1324                                 otuData[index] = i;
1325                                 nSeqsPerOTU[i]++;
1326                                 aaP[i].push_back(index);        
1327                                 
1328                                 otuString.get();
1329                         }
1330                         
1331                         sort(aaP[i].begin(), aaP[i].end());
1332                         for(int j=0;j<nSeqsPerOTU[i];j++){
1333                                 seqNumber.push_back(aaP[i][j]);
1334                         }
1335                         for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
1336                                 aaP[i].push_back(0);
1337                         }
1338                         
1339                         
1340                 }
1341                 
1342                 for(int i=1;i<numOTUs;i++){
1343                         cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
1344                 }
1345                 aaI = aaP;
1346                 seqIndex = seqNumber;
1347                 
1348                 listFile.close();       
1349                 
1350         }
1351         catch(exception& e) {
1352                 m->errorOut(e, "ShhherCommand", "getOTUData");
1353                 exit(1);        
1354         }               
1355 }
1356
1357 /**************************************************************************************************/
1358
1359 void ShhherCommand::initPyroCluster(){                          
1360         try{
1361                 dist.assign(numSeqs * numOTUs, 0);
1362                 change.assign(numOTUs, 1);
1363                 centroids.assign(numOTUs, -1);
1364                 weight.assign(numOTUs, 0);
1365                 singleTau.assign(numSeqs, 1.0);
1366                 
1367                 nSeqsBreaks.assign(processors+1, 0);
1368                 nOTUsBreaks.assign(processors+1, 0);
1369
1370                 nSeqsBreaks[0] = 0;
1371                 for(int i=0;i<processors;i++){
1372                         nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
1373                         nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
1374                 }
1375                 nSeqsBreaks[processors] = numSeqs;
1376                 nOTUsBreaks[processors] = numOTUs;
1377         }
1378         catch(exception& e) {
1379                 m->errorOut(e, "ShhherCommand", "initPyroCluster");
1380                 exit(1);        
1381         }
1382 }
1383
1384 /**************************************************************************************************/
1385
1386 void ShhherCommand::fill(){
1387         try {
1388                 int index = 0;
1389                 for(int i=0;i<numOTUs;i++){
1390                         cumNumSeqs[i] = index;
1391                         for(int j=0;j<nSeqsPerOTU[i];j++){
1392                                 seqNumber[index] = aaP[i][j];
1393                                 seqIndex[index] = aaI[i][j];
1394                                 
1395                                 index++;
1396                         }
1397                 }
1398         }
1399         catch(exception& e) {
1400                 m->errorOut(e, "ShhherCommand", "fill");
1401                 exit(1);        
1402         }               
1403 }
1404
1405 /**************************************************************************************************/
1406
1407 void ShhherCommand::calcCentroids(){                          
1408         try{
1409                 
1410 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1411
1412                 if(processors == 1)     {
1413                         calcCentroidsDriver(0, numOTUs);                
1414                 }
1415                 else{ //you have multiple processors
1416                         if (numOTUs < processors){      processors = 1; }
1417                         
1418                         int process = 1;
1419                         vector<int> processIDs;
1420                         
1421                         //loop through and create all the processes you want
1422                         while (process != processors) {
1423                                 int pid = vfork();
1424                                 
1425                                 if (pid > 0) {
1426                                         processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1427                                         process++;
1428                                 }else if (pid == 0){
1429                                         calcCentroidsDriver(nOTUsBreaks[process], nOTUsBreaks[process+1]);
1430                                         exit(0);
1431                                 }else { 
1432                                         m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
1433                                         perror(" : ");
1434                                         for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
1435                                         exit(0);
1436                                 }
1437                         }
1438                         
1439                         //parent does its part
1440                         calcCentroidsDriver(nOTUsBreaks[0], nOTUsBreaks[1]);
1441
1442                         //force parent to wait until all the processes are done
1443                         for (int i=0;i<processIDs.size();i++) { 
1444                                 int temp = processIDs[i];
1445                                 wait(&temp);
1446                         }
1447                 }
1448                 
1449 #else
1450                 calcCentroidsDriver(0, numOTUs);
1451 #endif          
1452         }
1453         catch(exception& e) {
1454                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1455                 exit(1);        
1456         }               
1457 }
1458
1459 /**************************************************************************************************/
1460
1461 void ShhherCommand::calcCentroidsDriver(int start, int finish){                          
1462         
1463         //this function gets the most likely homopolymer length at a flow position for a group of sequences
1464         //within an otu
1465         
1466         try{
1467                 
1468         
1469                 for(int i=start;i<finish;i++){
1470                         
1471                         double count = 0;
1472                         int position = 0;
1473                         int minFlowGram = 100000000;
1474                         double minFlowValue = 1e8;
1475                         change[i] = 0; //FALSE
1476                         
1477                         for(int j=0;j<nSeqsPerOTU[i];j++){
1478                                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1479                         }
1480
1481                         if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1482                                 vector<double> adF(nSeqsPerOTU[i]);
1483                                 vector<int> anL(nSeqsPerOTU[i]);
1484                                 
1485                                 for(int j=0;j<nSeqsPerOTU[i];j++){
1486                                         int index = cumNumSeqs[i] + j;
1487                                         int nI = seqIndex[index];
1488                                         int nIU = mapSeqToUnique[nI];
1489                                         
1490                                         int k;
1491                                         for(k=0;k<position;k++){
1492                                                 if(nIU == anL[k]){
1493                                                         break;
1494                                                 }
1495                                         }
1496                                         if(k == position){
1497                                                 anL[position] = nIU;
1498                                                 adF[position] = 0.0000;
1499                                                 position++;
1500                                         }                                               
1501                                 }
1502                                 
1503                                 for(int j=0;j<nSeqsPerOTU[i];j++){
1504                                         int index = cumNumSeqs[i] + j;
1505                                         int nI = seqIndex[index];
1506                                         
1507                                         double tauValue = singleTau[seqNumber[index]];
1508                                         
1509                                         for(int k=0;k<position;k++){
1510                                                 double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1511                                                 adF[k] += dist * tauValue;
1512                                         }
1513                                 }
1514                                 
1515                                 for(int j=0;j<position;j++){
1516                                         if(adF[j] < minFlowValue){
1517                                                 minFlowGram = j;
1518                                                 minFlowValue = adF[j];
1519                                         }
1520                                 }
1521                                 
1522                                 if(centroids[i] != anL[minFlowGram]){
1523                                         change[i] = 1;
1524                                         centroids[i] = anL[minFlowGram];
1525                                 }
1526                         }
1527                         else if(centroids[i] != -1){
1528                                 change[i] = 1;
1529                                 centroids[i] = -1;                      
1530                         }
1531                 }
1532         }
1533         catch(exception& e) {
1534                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1535                 exit(1);        
1536         }               
1537 }
1538
1539 /**************************************************************************************************/
1540
1541 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1542         try{
1543                 
1544                 int flowAValue = cent * numFlowCells;
1545                 int flowBValue = flow * numFlowCells;
1546                 
1547                 double dist = 0;
1548                 
1549                 for(int i=0;i<length;i++){
1550                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1551                         flowAValue++;
1552                         flowBValue++;
1553                 }
1554                 
1555                 return dist / (double)length;
1556         }
1557         catch(exception& e) {
1558                 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1559                 exit(1);        
1560         }               
1561 }
1562
1563 /**************************************************************************************************/
1564
1565 double ShhherCommand::getNewWeights(){
1566         try{
1567                 
1568                 double maxChange = 0;
1569                 
1570                 for(int i=0;i<numOTUs;i++){
1571                         
1572                         double difference = weight[i];
1573                         weight[i] = 0;
1574                         
1575                         for(int j=0;j<nSeqsPerOTU[i];j++){
1576                                 int index = cumNumSeqs[i] + j;
1577                                 double tauValue = singleTau[seqNumber[index]];
1578                                 weight[i] += tauValue;
1579                         }
1580                         
1581                         difference = fabs(weight[i] - difference);
1582                         if(difference > maxChange){     maxChange = difference; }
1583                 }
1584                 return maxChange;
1585         }
1586         catch(exception& e) {
1587                 m->errorOut(e, "ShhherCommand", "getNewWeights");
1588                 exit(1);        
1589         }               
1590 }
1591
1592 /**************************************************************************************************/
1593
1594 double ShhherCommand::getLikelihood(){
1595         
1596         try{
1597                 
1598                 vector<long double> P(numSeqs, 0);
1599                 int effNumOTUs = 0;
1600                 
1601                 for(int i=0;i<numOTUs;i++){
1602                         if(weight[i] > MIN_WEIGHT){
1603                                 effNumOTUs++;
1604                         }
1605                 }
1606                 
1607                 string hold;
1608                 for(int i=0;i<numOTUs;i++){
1609                         for(int j=0;j<nSeqsPerOTU[i];j++){
1610                                 int index = cumNumSeqs[i] + j;
1611                                 int nI = seqIndex[index];
1612                                 double singleDist = dist[seqNumber[index]];
1613                                 
1614                                 P[nI] += weight[i] * exp(-singleDist * sigma);
1615                         }
1616                 }
1617                 double nLL = 0.00;
1618                 for(int i=0;i<numSeqs;i++){
1619                         if(P[i] == 0){  P[i] = DBL_EPSILON;     }
1620
1621                         nLL += -log(P[i]);
1622                 }
1623                 
1624                 nLL = nLL -(double)numSeqs * log(sigma);
1625
1626                 return nLL; 
1627         }
1628         catch(exception& e) {
1629                 m->errorOut(e, "ShhherCommand", "getNewWeights");
1630                 exit(1);        
1631         }               
1632 }
1633
1634 /**************************************************************************************************/
1635
1636 void ShhherCommand::checkCentroids(){
1637         try{
1638                 vector<int> unique(numOTUs, 1);
1639                 
1640                 for(int i=0;i<numOTUs;i++){
1641                         if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1642                                 unique[i] = -1;
1643                         }
1644                 }
1645                 
1646                 for(int i=0;i<numOTUs;i++){
1647                         if(unique[i] == 1){
1648                                 for(int j=i+1;j<numOTUs;j++){
1649                                         if(unique[j] == 1){
1650                                                 
1651                                                 if(centroids[j] == centroids[i]){
1652                                                         unique[j] = 0;
1653                                                         centroids[j] = -1;
1654                                                         
1655                                                         weight[i] += weight[j];
1656                                                         weight[j] = 0.0;
1657                                                 }
1658                                         }
1659                                 }
1660                         }
1661                 }
1662         }
1663         catch(exception& e) {
1664                 m->errorOut(e, "ShhherCommand", "checkCentroids");
1665                 exit(1);        
1666         }               
1667 }
1668
1669 /**************************************************************************************************/
1670
1671 void ShhherCommand::calcNewDistances(){                          
1672         try{
1673                 
1674 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1675                 
1676                 if(processors == 1)     {
1677                         calcNewDistancesParent(0, numSeqs);             
1678                 }
1679                 else{ //you have multiple processors
1680                         if (numSeqs < processors){      processors = 1; }
1681                         
1682                         vector<vector<int> > child_otuIndex(processors);
1683                         vector<vector<int> > child_seqIndex(processors);
1684                         vector<vector<double> > child_singleTau(processors);                    
1685                         vector<int> totals(processors);
1686                         
1687                         int process = 1;
1688                         vector<int> processIDs;
1689
1690                         //loop through and create all the processes you want
1691                         while (process != processors) {
1692                                 int pid = vfork();
1693                                 
1694                                 if (pid > 0) {
1695                                         processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1696                                         process++;
1697                                 }else if (pid == 0){
1698                                         calcNewDistancesChild(nSeqsBreaks[process], nSeqsBreaks[process+1], child_otuIndex[process], child_seqIndex[process], child_singleTau[process]);
1699                                         totals[process] = child_otuIndex[process].size();
1700
1701                                         exit(0);
1702                                 }else { 
1703                                         m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
1704                                         perror(" : ");
1705                                         for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
1706                                         exit(0);
1707                                 }
1708                         }
1709                         
1710                         //parent does its part
1711                         calcNewDistancesParent(nSeqsBreaks[0], nSeqsBreaks[1]);
1712                         int total = seqIndex.size();
1713
1714                         //force parent to wait until all the processes are done
1715                         for (int i=0;i<processIDs.size();i++) { 
1716                                 int temp = processIDs[i];
1717                                 wait(&temp);
1718                         }
1719
1720                         for(int i=1;i<processors;i++){
1721                                 int oldTotal = total;
1722                                 total += totals[i];
1723
1724                                 singleTau.resize(total, 0);
1725                                 seqIndex.resize(total, 0);
1726                                 seqNumber.resize(total, 0);
1727                                 
1728                                 int childIndex = 0;
1729                                 
1730                                 for(int j=oldTotal;j<total;j++){
1731                                         int otuI = child_otuIndex[i][childIndex];
1732                                         int seqI = child_seqIndex[i][childIndex];
1733
1734                                         singleTau[j] = child_singleTau[i][childIndex];
1735                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
1736                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
1737                                         nSeqsPerOTU[otuI]++;
1738
1739                                         childIndex++;
1740                                 }
1741                         }
1742                 }
1743 #else
1744                 calcNewDistancesParent(0, numSeqs);             
1745 #endif          
1746         }
1747         catch(exception& e) {
1748                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
1749                 exit(1);        
1750         }               
1751 }
1752
1753 /**************************************************************************************************/
1754 #ifdef USE_MPI
1755 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1756         
1757         try{
1758                 vector<double> newTau(numOTUs,0);
1759                 vector<double> norms(numSeqs, 0);
1760                 otuIndex.clear();
1761                 seqIndex.clear();
1762                 singleTau.clear();
1763                 
1764                 
1765                 
1766                 for(int i=startSeq;i<stopSeq;i++){
1767                         double offset = 1e8;
1768                         int indexOffset = i * numOTUs;
1769                         
1770                         for(int j=0;j<numOTUs;j++){
1771                                 
1772                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1773                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1774                                 }
1775                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1776                                         offset = dist[indexOffset + j];
1777                                 }
1778                         }
1779                         
1780                         for(int j=0;j<numOTUs;j++){
1781                                 if(weight[j] > MIN_WEIGHT){
1782                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1783                                         norms[i] += newTau[j];
1784                                 }
1785                                 else{
1786                                         newTau[j] = 0.0;
1787                                 }
1788                         }
1789                         
1790                         for(int j=0;j<numOTUs;j++){
1791
1792                                 newTau[j] /= norms[i];
1793                                 
1794                                 if(newTau[j] > MIN_TAU){
1795                                         otuIndex.push_back(j);
1796                                         seqIndex.push_back(i);
1797                                         singleTau.push_back(newTau[j]);
1798                                 }
1799                         }
1800                         
1801                 }
1802         }
1803         catch(exception& e) {
1804                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1805                 exit(1);        
1806         }               
1807 }
1808 #endif
1809 /**************************************************************************************************/
1810
1811 void ShhherCommand::calcNewDistancesChild(int startSeq, int stopSeq, vector<int>& child_otuIndex, vector<int>& child_seqIndex, vector<double>& child_singleTau){
1812         
1813         try{
1814                 vector<double> newTau(numOTUs,0);
1815                 vector<double> norms(numSeqs, 0);
1816                 child_otuIndex.resize(0);
1817                 child_seqIndex.resize(0);
1818                 child_singleTau.resize(0);
1819                 
1820                 for(int i=startSeq;i<stopSeq;i++){
1821                         double offset = 1e8;
1822                         int indexOffset = i * numOTUs;
1823                         
1824                         
1825                         for(int j=0;j<numOTUs;j++){
1826                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1827                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1828                                 }
1829                                 
1830                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1831                                         offset = dist[indexOffset + j];
1832                                 }
1833                         }
1834                         
1835                         for(int j=0;j<numOTUs;j++){
1836                                 if(weight[j] > MIN_WEIGHT){
1837                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1838                                         norms[i] += newTau[j];
1839                                 }
1840                                 else{
1841                                         newTau[j] = 0.0;
1842                                 }
1843                         }
1844                         
1845                         for(int j=0;j<numOTUs;j++){
1846                                 newTau[j] /= norms[i];
1847                                 
1848                                 if(newTau[j] > MIN_TAU){
1849                                         child_otuIndex.push_back(j);
1850                                         child_seqIndex.push_back(i);
1851                                         child_singleTau.push_back(newTau[j]);
1852                                 }
1853                         }
1854                 }
1855         }
1856         catch(exception& e) {
1857                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChild");
1858                 exit(1);        
1859         }               
1860 }
1861
1862 /**************************************************************************************************/
1863
1864 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1865         
1866         try{
1867                 
1868                 int total = 0;
1869                 vector<double> newTau(numOTUs,0);
1870                 vector<double> norms(numSeqs, 0);
1871                 nSeqsPerOTU.assign(numOTUs, 0);
1872                 
1873                 for(int i=startSeq;i<stopSeq;i++){
1874                         int indexOffset = i * numOTUs;
1875                         
1876                         double offset = 1e8;
1877                         
1878                         for(int j=0;j<numOTUs;j++){
1879                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1880                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1881                                 }
1882                                 
1883                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1884                                         offset = dist[indexOffset + j];
1885                                 }
1886                         }
1887                         
1888                         for(int j=0;j<numOTUs;j++){
1889                                 if(weight[j] > MIN_WEIGHT){
1890                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1891                                         norms[i] += newTau[j];
1892                                 }
1893                                 else{
1894                                         newTau[j] = 0.0;
1895                                 }
1896                         }
1897                         
1898                         for(int j=0;j<numOTUs;j++){
1899                                 newTau[j] /= norms[i];
1900                         }
1901                         
1902                         for(int j=0;j<numOTUs;j++){
1903                                 if(newTau[j] > MIN_TAU){
1904                                         
1905                                         int oldTotal = total;
1906                                         
1907                                         total++;
1908                                         
1909                                         singleTau.resize(total, 0);
1910                                         seqNumber.resize(total, 0);
1911                                         seqIndex.resize(total, 0);
1912                                         
1913                                         singleTau[oldTotal] = newTau[j];
1914                                         
1915                                         aaP[j][nSeqsPerOTU[j]] = oldTotal;
1916                                         aaI[j][nSeqsPerOTU[j]] = i;
1917                                         nSeqsPerOTU[j]++;
1918                                 }
1919                         }
1920                 }
1921         }
1922         catch(exception& e) {
1923                 m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1924                 exit(1);        
1925         }               
1926 }
1927
1928 /**************************************************************************************************/
1929
1930 void ShhherCommand::setOTUs(){
1931         
1932         try {
1933                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1934                 
1935                 for(int i=0;i<numOTUs;i++){
1936                         for(int j=0;j<nSeqsPerOTU[i];j++){
1937                                 int index = cumNumSeqs[i] + j;
1938                                 double tauValue = singleTau[seqNumber[index]];
1939                                 int sIndex = seqIndex[index];
1940                                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
1941                         }
1942                 }
1943                 
1944                 for(int i=0;i<numSeqs;i++){
1945                         double maxTau = -1.0000;
1946                         int maxOTU = -1;
1947                         for(int j=0;j<numOTUs;j++){
1948                                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1949                                         maxTau = bigTauMatrix[i * numOTUs + j];
1950                                         maxOTU = j;
1951                                 }
1952                         }
1953                         
1954                         otuData[i] = maxOTU;
1955                 }
1956                 
1957                 nSeqsPerOTU.assign(numOTUs, 0);         
1958                 
1959                 for(int i=0;i<numSeqs;i++){
1960                         int index = otuData[i];
1961                         
1962                         singleTau[i] = 1.0000;
1963                         dist[i] = 0.0000;
1964                         
1965                         aaP[index][nSeqsPerOTU[index]] = i;
1966                         aaI[index][nSeqsPerOTU[index]] = i;
1967                         
1968                         nSeqsPerOTU[index]++;
1969                 }
1970                 fill(); 
1971         }
1972         catch(exception& e) {
1973                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
1974                 exit(1);        
1975         }               
1976 }
1977
1978 /**************************************************************************************************/
1979
1980 void ShhherCommand::writeQualities(vector<int> otuCounts){
1981         
1982         try {
1983                 string qualityFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.qual";
1984
1985                 ofstream qualityFile;
1986                 m->openOutputFile(qualityFileName, qualityFile);
1987
1988                 qualityFile.setf(ios::fixed, ios::floatfield);
1989                 qualityFile.setf(ios::showpoint);
1990                 qualityFile << setprecision(6);
1991                 
1992                 vector<vector<int> > qualities(numOTUs);
1993                 vector<double> pr(HOMOPS, 0);
1994                 
1995                 
1996                 for(int i=0;i<numOTUs;i++){
1997                         int index = 0;
1998                         int base = 0;
1999                         
2000                         if(nSeqsPerOTU[i] > 0){
2001                                 qualities[i].assign(1024, -1);
2002                                 
2003                                 while(index < numFlowCells){
2004                                         double maxPrValue = 1e8;
2005                                         short maxPrIndex = -1;
2006                                         double count = 0.0000;
2007                                         
2008                                         pr.assign(HOMOPS, 0);
2009                                         
2010                                         for(int j=0;j<nSeqsPerOTU[i];j++){
2011                                                 int lIndex = cumNumSeqs[i] + j;
2012                                                 double tauValue = singleTau[seqNumber[lIndex]];
2013                                                 int sequenceIndex = aaI[i][j];
2014                                                 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
2015                                                 
2016                                                 count += tauValue;
2017                                                 
2018                                                 for(int s=0;s<HOMOPS;s++){
2019                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
2020                                                 }
2021                                         }
2022                                         
2023                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
2024                                         maxPrValue = pr[maxPrIndex];
2025                                         
2026                                         if(count > MIN_COUNT){
2027                                                 double U = 0.0000;
2028                                                 double norm = 0.0000;
2029                                                 
2030                                                 for(int s=0;s<HOMOPS;s++){
2031                                                         norm += exp(-(pr[s] - maxPrValue));
2032                                                 }
2033                                                 
2034                                                 for(int s=1;s<=maxPrIndex;s++){
2035                                                         int value = 0;
2036                                                         double temp = 0.0000;
2037                                                         
2038                                                         U += exp(-(pr[s-1]-maxPrValue))/norm;
2039                                                         
2040                                                         if(U>0.00){
2041                                                                 temp = log10(U);
2042                                                         }
2043                                                         else{
2044                                                                 temp = -10.1;
2045                                                         }
2046                                                         temp = floor(-10 * temp);
2047                                                         value = (int)floor(temp);
2048                                                         if(value > 100){        value = 100;    }
2049                                                         
2050                                                         qualities[i][base] = (int)value;
2051                                                         base++;
2052                                                 }
2053                                         }
2054                                         
2055                                         index++;
2056                                 }
2057                         }
2058                         
2059                         
2060                         if(otuCounts[i] > 0){
2061                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
2062                                 
2063                                 int j=4;        //need to get past the first four bases
2064                                 while(qualities[i][j] != -1){
2065                                         qualityFile << qualities[i][j] << ' ';
2066                                         j++;
2067                                 }
2068                                 qualityFile << endl;
2069                         }
2070                 }
2071                 qualityFile.close();
2072                 outputNames.push_back(qualityFileName);
2073
2074         }
2075         catch(exception& e) {
2076                 m->errorOut(e, "ShhherCommand", "writeQualities");
2077                 exit(1);        
2078         }               
2079 }
2080
2081 /**************************************************************************************************/
2082
2083 void ShhherCommand::writeSequences(vector<int> otuCounts){
2084         try {
2085                 
2086                 string fastaFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.fasta";
2087                 ofstream fastaFile;
2088                 m->openOutputFile(fastaFileName, fastaFile);
2089                 
2090                 vector<string> names(numOTUs, "");
2091                 
2092                 for(int i=0;i<numOTUs;i++){
2093                         int index = centroids[i];
2094                         
2095                         if(otuCounts[i] > 0){
2096                                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
2097                                 
2098                                 string newSeq = "";
2099                                 
2100                                 for(int j=0;j<numFlowCells;j++){
2101                                         
2102                                         char base = flowOrder[j % 4];
2103                                         for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
2104                                                 newSeq += base;
2105                                         }
2106                                 }
2107                                 
2108                                 fastaFile << newSeq.substr(4) << endl;
2109                         }
2110                 }
2111                 fastaFile.close();
2112
2113                 outputNames.push_back(fastaFileName);
2114
2115                 if(compositeFASTAFileName != ""){
2116                         m->appendFiles(fastaFileName, compositeFASTAFileName);
2117                 }
2118         }
2119         catch(exception& e) {
2120                 m->errorOut(e, "ShhherCommand", "writeSequences");
2121                 exit(1);        
2122         }               
2123 }
2124
2125 /**************************************************************************************************/
2126
2127 void ShhherCommand::writeNames(vector<int> otuCounts){
2128         try {
2129                 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
2130                 ofstream nameFile;
2131                 m->openOutputFile(nameFileName, nameFile);
2132                 
2133                 for(int i=0;i<numOTUs;i++){
2134                         if(otuCounts[i] > 0){
2135                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
2136                                 
2137                                 for(int j=1;j<nSeqsPerOTU[i];j++){
2138                                         nameFile << ',' << seqNameVector[aaI[i][j]];
2139                                 }
2140                                 
2141                                 nameFile << endl;
2142                         }
2143                 }
2144                 nameFile.close();
2145                 outputNames.push_back(nameFileName);
2146                 
2147                 
2148                 if(compositeNamesFileName != ""){
2149                         m->appendFiles(nameFileName, compositeNamesFileName);
2150                 }               
2151         }
2152         catch(exception& e) {
2153                 m->errorOut(e, "ShhherCommand", "writeNames");
2154                 exit(1);        
2155         }               
2156 }
2157
2158 /**************************************************************************************************/
2159
2160 void ShhherCommand::writeGroups(){
2161         try {
2162                 string fileRoot = flowFileName.substr(0,flowFileName.find_last_of('.'));
2163                 string groupFileName = fileRoot + ".shhh.groups";
2164                 ofstream groupFile;
2165                 m->openOutputFile(groupFileName, groupFile);
2166                 
2167                 for(int i=0;i<numSeqs;i++){
2168                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
2169                 }
2170                 groupFile.close();
2171                 outputNames.push_back(groupFileName);
2172
2173         }
2174         catch(exception& e) {
2175                 m->errorOut(e, "ShhherCommand", "writeGroups");
2176                 exit(1);        
2177         }               
2178 }
2179
2180 /**************************************************************************************************/
2181
2182 void ShhherCommand::writeClusters(vector<int> otuCounts){
2183         try {
2184                 string otuCountsFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.counts";
2185                 ofstream otuCountsFile;
2186                 m->openOutputFile(otuCountsFileName, otuCountsFile);
2187                 
2188                 string bases = flowOrder;
2189                 
2190                 for(int i=0;i<numOTUs;i++){
2191                         //output the translated version of the centroid sequence for the otu
2192                         if(otuCounts[i] > 0){
2193                                 int index = centroids[i];
2194                                 
2195                                 otuCountsFile << "ideal\t";
2196                                 for(int j=8;j<numFlowCells;j++){
2197                                         char base = bases[j % 4];
2198                                         for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
2199                                                 otuCountsFile << base;
2200                                         }
2201                                 }
2202                                 otuCountsFile << endl;
2203                                 
2204                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2205                                         int sequence = aaI[i][j];
2206                                         otuCountsFile << seqNameVector[sequence] << '\t';
2207                                         
2208                                         string newSeq = "";
2209                                         
2210                                         for(int k=0;k<lengths[sequence];k++){
2211                                                 char base = bases[k % 4];
2212                                                 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
2213                                                         
2214                                                 for(int s=0;s<freq;s++){
2215                                                         newSeq += base;
2216                                                         //otuCountsFile << base;
2217                                                 }
2218                                         }
2219                                         otuCountsFile << newSeq.substr(4) << endl;
2220                                 }
2221                                 otuCountsFile << endl;
2222                         }
2223                 }
2224                 otuCountsFile.close();
2225                 outputNames.push_back(otuCountsFileName);
2226
2227         }
2228         catch(exception& e) {
2229                 m->errorOut(e, "ShhherCommand", "writeClusters");
2230                 exit(1);        
2231         }               
2232 }
2233
2234 //**********************************************************************************************************************