]> git.donarmstrong.com Git - mothur.git/blob - shhhercommand.cpp
added path search for lookup file in shhh.seqs
[mothur.git] / shhhercommand.cpp
1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "shhhercommand.h"
11
12 #include "readcolumn.h"
13 #include "readmatrix.hpp"
14 #include "rabundvector.hpp"
15 #include "sabundvector.hpp"
16 #include "listvector.hpp"
17 #include "cluster.hpp"
18 #include "sparsematrix.hpp"
19 #include <cfloat>
20
21 //**********************************************************************************************************************
22
23 #define NUMBINS 1000
24 #define HOMOPS 10
25 #define MIN_COUNT 0.1
26 #define MIN_WEIGHT 0.1
27 #define MIN_TAU 0.0001
28 #define MIN_ITER 10
29 //**********************************************************************************************************************
30 vector<string> ShhherCommand::setParameters(){  
31         try {
32                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pflow);
33                 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pfile);
34                 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plookup);
35                 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff);
36                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
37                 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter);
38                 CommandParameter psigma("sigma", "Number", "", "60", "", "", "",false,false); parameters.push_back(psigma);
39                 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "",false,false); parameters.push_back(pmindelta);
40                 CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
41                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
42                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
43                 
44                 vector<string> myArray;
45                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
46                 return myArray;
47         }
48         catch(exception& e) {
49                 m->errorOut(e, "ShhherCommand", "setParameters");
50                 exit(1);
51         }
52 }
53 //**********************************************************************************************************************
54 string ShhherCommand::getHelpString(){  
55         try {
56                 string helpString = "";
57                 helpString += "The shhh.seqs command reads a file containing flowgrams and creates a file of corrected sequences.\n";
58                 return helpString;
59         }
60         catch(exception& e) {
61                 m->errorOut(e, "ShhherCommand", "getHelpString");
62                 exit(1);
63         }
64 }
65 //**********************************************************************************************************************
66
67 ShhherCommand::ShhherCommand(){ 
68         try {
69                 abort = true; calledHelp = true;
70                 setParameters();
71                 
72                 //initialize outputTypes
73 //              vector<string> tempOutNames;
74 //              outputTypes["pn.dist"] = tempOutNames;
75
76         }
77         catch(exception& e) {
78                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
79                 exit(1);
80         }
81 }
82
83 //**********************************************************************************************************************
84
85 ShhherCommand::ShhherCommand(string option) {
86         try {
87
88 #ifdef USE_MPI
89                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
90                 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
91
92                 if(pid == 0){
93 #endif
94                 
95                 
96                 abort = false; calledHelp = false;   
97                 
98                 
99                 //allow user to run help
100                 if(option == "help") { help(); abort = true; calledHelp = true; }
101                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
102                 
103                 else {
104                         vector<string> myArray = setParameters();
105                         
106                         OptionParser parser(option);
107                         map<string,string> parameters = parser.getParameters();
108                         
109                         ValidParameters validParameter;
110                         map<string,string>::iterator it;
111                         
112                         //check to make sure all parameters are valid for command
113                         for (it = parameters.begin(); it != parameters.end(); it++) { 
114                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
115                         }
116                         
117                         //initialize outputTypes
118                         vector<string> tempOutNames;
119 //                      outputTypes["pn.dist"] = tempOutNames;
120                         //                      outputTypes["fasta"] = tempOutNames;
121                         
122                         //if the user changes the input directory command factory will send this info to us in the output parameter 
123                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
124                         if (inputDir == "not found"){   inputDir = "";          }
125                         else {
126                                 string path;
127                                 it = parameters.find("flow");
128                                 //user has given a template file
129                                 if(it != parameters.end()){ 
130                                         path = m->hasPath(it->second);
131                                         //if the user has not given a path then, add inputdir. else leave path alone.
132                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
133                                 }
134                                 
135                                 it = parameters.find("lookup");
136                                 //user has given a template file
137                                 if(it != parameters.end()){ 
138                                         path = m->hasPath(it->second);
139                                         //if the user has not given a path then, add inputdir. else leave path alone.
140                                         if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
141                                 }
142
143                                 it = parameters.find("file");
144                                 //user has given a template file
145                                 if(it != parameters.end()){ 
146                                         path = m->hasPath(it->second);
147                                         //if the user has not given a path then, add inputdir. else leave path alone.
148                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
149                                 }
150                         }
151                         
152                         
153                         //check for required parameters
154                         flowFileName = validParameter.validFile(parameters, "flow", true);
155                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
156                         if (flowFileName == "not found" && flowFilesFileName == "not found") {
157                                 m->mothurOut("values for either flow or file must be provided for the shhh.seqs command.");
158                                 m->mothurOutEndLine();
159                                 abort = true; 
160                         }
161                         else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
162                         
163                         if(flowFileName != "not found"){
164                                 compositeFASTAFileName = "";    
165                                 compositeNamesFileName = "";    
166                         }
167                         else{
168                                 ofstream temp;
169
170                                 compositeFASTAFileName = flowFilesFileName.substr(0, flowFilesFileName.length()-10) + "shhh.fasta";
171                                 m->openOutputFile(compositeFASTAFileName, temp);
172                                 temp.close();
173                                 
174                                 compositeNamesFileName = flowFilesFileName.substr(0, flowFilesFileName.length()-10) + "shhh.names";
175                                 m->openOutputFile(compositeNamesFileName, temp);
176                                 temp.close();
177                         }
178                         
179                         //if the user changes the output directory command factory will send this info to us in the output parameter 
180                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
181                                 outputDir = ""; 
182                                 outputDir += m->hasPath(flowFileName); //if user entered a file with a path then preserve it    
183                         }
184                         
185                         
186                         //check for optional parameter and set defaults
187                         // ...at some point should added some additional type checking...
188                         string temp;
189                         temp = validParameter.validFile(parameters, "lookup", true);
190                         if (temp == "not found")        {       
191                                 lookupFileName = "LookUp_Titanium.pat"; 
192                                 
193                                 int ableToOpen;
194                                 ifstream in;
195                                 ableToOpen = m->openInputFile(lookupFileName, in, "noerror");
196                                 in.close();     
197                                 
198                                 //if you can't open it, try input location
199                                 if (ableToOpen == 1) {
200                                         if (inputDir != "") { //default path is set
201                                                 string tryPath = inputDir + lookupFileName;
202                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
203                                                 ifstream in2;
204                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
205                                                 in2.close();
206                                                 lookupFileName = tryPath;
207                                         }
208                                 }
209                                 
210                                 //if you can't open it, try default location
211                                 if (ableToOpen == 1) {
212                                         if (m->getDefaultPath() != "") { //default path is set
213                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName);
214                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine();
215                                                 ifstream in2;
216                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
217                                                 in2.close();
218                                                 lookupFileName = tryPath;
219                                         }
220                                 }
221                                 
222                                 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
223                                 if (ableToOpen == 1) {
224                                         string exepath = m->argv;
225                                         string tempPath = exepath;
226                                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
227                                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
228                                         
229                                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
230                                         m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
231                                         ifstream in2;
232                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
233                                         in2.close();
234                                         lookupFileName = tryPath;
235                                 }
236                                 
237                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
238                         }
239                         else if(temp == "not open")     {       
240                                 
241                                 lookupFileName = validParameter.validFile(parameters, "lookup", false);
242                                 
243                                 //if you can't open it its not inputDir, try mothur excutable location
244                                 string exepath = m->argv;
245                                 string tempPath = exepath;
246                                 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
247                                 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
248                                         
249                                 string tryPath = m->getFullPathName(exepath) + lookupFileName;
250                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
251                                 ifstream in2;
252                                 int ableToOpen = m->openInputFile(tryPath, in2, "noerror");
253                                 in2.close();
254                                 lookupFileName = tryPath;
255                                 
256                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
257                         }else                                           {       lookupFileName = temp;  }
258                         
259                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
260                         m->setProcessors(temp);
261                         convert(temp, processors);
262
263                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
264                         convert(temp, cutoff); 
265                         
266                         temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
267                         convert(temp, minDelta); 
268
269                         temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
270                         convert(temp, maxIters); 
271
272                         temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
273                         convert(temp, sigma); 
274                         
275                         flowOrder = validParameter.validFile(parameters, "order", false);
276                         if (flowOrder == "not found"){ flowOrder = "TACG";              }
277                         else if(flowOrder.length() != 4){
278                                 m->mothurOut("The value of the order option must be four bases long\n");
279                         }
280                         
281                 }
282                         
283 #ifdef USE_MPI
284                 }                               
285 #endif
286                                 
287         }
288         catch(exception& e) {
289                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
290                 exit(1);
291         }
292 }
293 //**********************************************************************************************************************
294 #ifdef USE_MPI
295 int ShhherCommand::execute(){
296         try {
297                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
298                 
299                 int tag = 1976;
300                 MPI_Status status; 
301
302                 if(pid == 0){
303
304                         for(int i=1;i<ncpus;i++){
305                                 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
306                         }
307                         if(abort == 1){ return 0;       }
308
309                         processors = ncpus;
310                         
311                         m->mothurOut("\nGetting preliminary data...\n");
312                         getSingleLookUp();
313                         getJointLookUp();
314                         
315                         vector<string> flowFileVector;
316                         if(flowFilesFileName != "not found"){
317                                 string fName;
318
319                                 ifstream flowFilesFile;
320                                 m->openInputFile(flowFilesFileName, flowFilesFile);
321                                 while(flowFilesFile){
322                                         flowFilesFile >> fName;
323                                         flowFileVector.push_back(fName);
324                                         m->gobble(flowFilesFile);
325                                 }
326                         }
327                         else{
328                                 flowFileVector.push_back(flowFileName);
329                         }
330                         int numFiles = flowFileVector.size();
331
332                         for(int i=1;i<ncpus;i++){
333                                 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
334                         }
335                         
336                         for(int i=0;i<numFiles;i++){
337                                 double begClock = clock();
338                                 unsigned long int begTime = time(NULL);
339
340                                 flowFileName = flowFileVector[i];
341                                 
342                                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
343                                 m->mothurOut("Reading flowgrams...\n");
344                                 getFlowData();
345
346                                 m->mothurOut("Identifying unique flowgrams...\n");
347                                 getUniques();
348
349                                 m->mothurOut("Calculating distances between flowgrams...\n");
350                                 char fileName[1024];
351                                 strcpy(fileName, flowFileName.c_str());
352
353                                 for(int i=1;i<ncpus;i++){
354                                         MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
355
356                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
357                                         MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
358                                         MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
359                                         MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
360                                         MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
361                                         MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
362                                         MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
363                                         MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
364                                         MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
365                                         MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
366                                 }                               
367                                                         
368                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
369                                 
370                                 int done;
371                                 for(int i=1;i<ncpus;i++){
372                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
373                                         
374                                         m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
375                                         remove((distFileName + ".temp." + toString(i)).c_str());
376                                 }
377
378                                 string namesFileName = createNamesFile();
379                                 
380                                 m->mothurOut("\nClustering flowgrams...\n");
381                                 string listFileName = cluster(distFileName, namesFileName);
382
383                                 getOTUData(listFileName);
384
385                                 remove(distFileName.c_str());
386                                 remove(namesFileName.c_str());
387                                 remove(listFileName.c_str());
388                                 
389                                 initPyroCluster();
390
391                                 for(int i=1;i<ncpus;i++){
392                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
393                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
394                                         MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
395                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
396                                 }
397                                 
398                                 
399                                 double maxDelta = 0;
400                                 int iter = 0;
401                                 
402                                 int numOTUsOnCPU = numOTUs / ncpus;
403                                 int numSeqsOnCPU = numSeqs / ncpus;
404                                 m->mothurOut("\nDenoising flowgrams...\n");
405                                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
406                                 
407                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
408
409                                         double cycClock = clock();
410                                         unsigned long int cycTime = time(NULL);
411                                         fill();
412
413                                         int total = singleTau.size();
414                                         for(int i=1;i<ncpus;i++){
415                                                 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
416                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
417                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
418                                                 
419                                                 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
420                                                 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
421                                                 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
422                                                 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
423                                                 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
424                                         }
425                                         
426                                         calcCentroidsDriver(0, numOTUsOnCPU);
427                                         
428                                         for(int i=1;i<ncpus;i++){
429                                                 int otuStart = i * numOTUs / ncpus;
430                                                 int otuStop = (i + 1) * numOTUs / ncpus;
431                                                 
432                                                 vector<int> tempCentroids(numOTUs, 0);
433                                                 vector<short> tempChange(numOTUs, 0);
434                                                 
435                                                 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
436                                                 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
437                                                 
438                                                 for(int j=otuStart;j<otuStop;j++){
439                                                         centroids[j] = tempCentroids[j];
440                                                         change[j] = tempChange[j];
441                                                 }
442                                         }
443                                                                         
444                                         maxDelta = getNewWeights();
445                                         double nLL = getLikelihood();
446                                         checkCentroids();
447                                         
448                                         for(int i=1;i<ncpus;i++){
449                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
450                                                 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
451                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
452                                         }
453                                         
454                                         calcNewDistancesParent(0, numSeqsOnCPU);
455
456                                         total = singleTau.size();
457
458                                         for(int i=1;i<ncpus;i++){
459                                                 int childTotal;
460                                                 int seqStart = i * numSeqs / ncpus;
461                                                 int seqStop = (i + 1) * numSeqs / ncpus;
462                                                 
463                                                 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
464
465                                                 vector<int> childSeqIndex(childTotal, 0);
466                                                 vector<double> childSingleTau(childTotal, 0);
467                                                 vector<double> childDist(numSeqs * numOTUs, 0);
468                                                 vector<int> otuIndex(childTotal, 0);
469                                                 
470                                                 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
471                                                 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
472                                                 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
473                                                 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
474
475                                                 int oldTotal = total;
476                                                 total += childTotal;
477                                                 singleTau.resize(total, 0);
478                                                 seqIndex.resize(total, 0);
479                                                 seqNumber.resize(total, 0);
480                                                 
481                                                 int childIndex = 0;
482                                                 
483                                                 for(int j=oldTotal;j<total;j++){
484                                                         int otuI = otuIndex[childIndex];
485                                                         int seqI = childSeqIndex[childIndex];
486                                                         
487                                                         singleTau[j] = childSingleTau[childIndex];
488                                                         
489                                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
490                                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
491                                                         nSeqsPerOTU[otuI]++;
492                                                         childIndex++;
493                                                 }
494                                                 
495                                                 int index = seqStart * numOTUs;
496                                                 for(int j=seqStart;j<seqStop;j++){
497                                                         for(int k=0;k<numOTUs;k++){
498                                                                 dist[index] = childDist[index];
499                                                                 index++;
500                                                         }
501                                                 }                                       
502                                         }
503                                         
504                                         iter++;
505                                         
506                                         m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');                  
507
508                                         if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
509                                                 int live = 1;
510                                                 for(int i=1;i<ncpus;i++){
511                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
512                                                 }
513                                         }
514                                         else{
515                                                 int live = 0;
516                                                 for(int i=1;i<ncpus;i++){
517                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
518                                                 }
519                                         }
520                                         
521                                 }       
522                                 
523                                 m->mothurOut("\nFinalizing...\n");
524                                 fill();
525                                 setOTUs();
526                                 
527                                 vector<int> otuCounts(numOTUs, 0);
528                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
529                                 calcCentroidsDriver(0, numOTUs);
530                                 
531                                 writeQualities(otuCounts);
532                                 writeSequences(otuCounts);
533                                 writeNames(otuCounts);
534                                 writeClusters(otuCounts);
535                                 writeGroups();
536                                 
537                                                                  
538                                 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');                 
539                         }
540                 }
541                 else{
542                         int abort = 1;
543
544                         MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
545                         if(abort){      return 0;       }
546
547                         int numFiles;
548                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
549
550                         for(int i=0;i<numFiles;i++){
551                                 //Now into the pyrodist part
552                                 bool live = 1;
553
554                                 char fileName[1024];
555                                 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
556                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
557                                 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
558                                 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
559                                 
560                                 flowDataIntI.resize(numSeqs * numFlowCells);
561                                 flowDataPrI.resize(numSeqs * numFlowCells);
562                                 mapUniqueToSeq.resize(numSeqs);
563                                 mapSeqToUnique.resize(numSeqs);
564                                 lengths.resize(numSeqs);
565                                 jointLookUp.resize(NUMBINS * NUMBINS);
566
567                                 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
568                                 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
569                                 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
570                                 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
571                                 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
572                                 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
573                                 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
574
575                                 flowFileName = string(fileName);
576                                 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
577                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
578                                 
579                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
580
581                                 int done = 1;
582                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
583
584                                 //Now into the pyronoise part
585                                 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
586                                 
587                                 singleLookUp.resize(HOMOPS * NUMBINS);
588                                 uniqueFlowgrams.resize(numUniques * numFlowCells);
589                                 weight.resize(numOTUs);
590                                 centroids.resize(numOTUs);
591                                 change.resize(numOTUs);
592                                 dist.assign(numOTUs * numSeqs, 0);
593                                 nSeqsPerOTU.resize(numOTUs);
594                                 cumNumSeqs.resize(numOTUs);
595
596                                 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
597                                 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
598                                 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
599                                 
600                                 int startOTU = pid * numOTUs / ncpus;
601                                 int endOTU = (pid + 1) * numOTUs / ncpus;
602
603                                 int startSeq = pid * numSeqs / ncpus;
604                                 int endSeq = (pid + 1) * numSeqs /ncpus;
605                                 
606                                 int total;
607
608                                 while(live){
609                                         
610                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
611                                         singleTau.assign(total, 0.0000);
612                                         seqNumber.assign(total, 0);
613                                         seqIndex.assign(total, 0);
614
615                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
616                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
617                                         MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
618                                         MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
619                                         MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
620                                         MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
621                                         MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
622
623                                         calcCentroidsDriver(startOTU, endOTU);
624
625                                         MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
626                                         MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
627
628                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
629                                         MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
630                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
631
632                                         vector<int> otuIndex(total, 0);
633                                         calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
634                                         total = otuIndex.size();
635                                         
636                                         MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
637                                         MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
638                                         MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
639                                         MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
640                                         MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
641                                 
642                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
643                                 }
644                         }
645                 }               
646                 MPI_Barrier(MPI_COMM_WORLD);
647
648                 
649                 if(compositeFASTAFileName != ""){
650                         outputNames.push_back(compositeFASTAFileName);
651                         outputNames.push_back(compositeNamesFileName);
652                 }
653
654                 m->mothurOutEndLine();
655                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
656                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
657                 m->mothurOutEndLine();
658                 
659                 return 0;
660
661         }
662         catch(exception& e) {
663                 m->errorOut(e, "ShhherCommand", "execute");
664                 exit(1);
665         }
666 }
667
668 /**************************************************************************************************/
669
670 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
671         try{            
672                 ostringstream outStream;
673                 outStream.setf(ios::fixed, ios::floatfield);
674                 outStream.setf(ios::dec, ios::basefield);
675                 outStream.setf(ios::showpoint);
676                 outStream.precision(6);
677                 
678                 int begTime = time(NULL);
679                 double begClock = clock();
680                 
681                 for(int i=startSeq;i<stopSeq;i++){
682                         for(int j=0;j<i;j++){
683                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
684                                 
685                                 if(flowDistance < 1e-6){
686                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
687                                 }
688                                 else if(flowDistance <= cutoff){
689                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
690                                 }
691                         }
692                         if(i % 100 == 0){
693                                 m->mothurOut(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
694                         }
695                 }
696                 
697                 m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
698                 
699                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
700                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
701
702                 ofstream distFile(fDistFileName.c_str());
703                 distFile << outStream.str();            
704                 distFile.close();
705                 
706                 return fDistFileName;
707         }
708         catch(exception& e) {
709                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
710                 exit(1);
711         }
712 }
713
714 #else
715 //**********************************************************************************************************************
716
717 int ShhherCommand::execute(){
718         try {
719                 if (abort == true) { return 0; }
720                 
721                 getSingleLookUp();
722                 getJointLookUp();
723                                 
724                 vector<string> flowFileVector;
725                 if(flowFilesFileName != "not found"){
726                         string fName;
727                         
728                         ifstream flowFilesFile;
729                         m->openInputFile(flowFilesFileName, flowFilesFile);
730                         while(flowFilesFile){
731                                 flowFilesFile >> fName;
732                                 flowFileVector.push_back(fName);
733                                 m->gobble(flowFilesFile);
734                         }
735                 }
736                 else{
737                         flowFileVector.push_back(flowFileName);
738                 }
739                 int numFiles = flowFileVector.size();
740                 
741                 
742                 for(int i=0;i<numFiles;i++){
743                         flowFileName = flowFileVector[i];
744
745                         m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
746                         m->mothurOut("Reading flowgrams...\n");
747                         getFlowData();
748                         
749                         m->mothurOut("Identifying unique flowgrams...\n");
750                         getUniques();
751                         
752                         
753                         m->mothurOut("Calculating distances between flowgrams...\n");
754                         string distFileName = createDistFile(processors);
755                         string namesFileName = createNamesFile();
756                                 
757                         m->mothurOut("\nClustering flowgrams...\n");
758                         string listFileName = cluster(distFileName, namesFileName);
759                         getOTUData(listFileName);
760                         
761                         remove(distFileName.c_str());
762                         remove(namesFileName.c_str());
763                         remove(listFileName.c_str());
764                         
765                         initPyroCluster();
766                         
767                         double maxDelta = 0;
768                         int iter = 0;
769                         
770                         double begClock = clock();
771                         unsigned long int begTime = time(NULL);
772
773                         m->mothurOut("\nDenoising flowgrams...\n");
774                         m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
775                         
776                         while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
777                                 
778                                 double cycClock = clock();
779                                 unsigned long int cycTime = time(NULL);
780                                 fill();
781                                 
782                                 calcCentroids();
783                                 
784                                 maxDelta = getNewWeights();
785                                 double nLL = getLikelihood();
786                                 checkCentroids();
787                                 
788                                 calcNewDistances();
789
790                                 iter++;
791                                 
792                                 m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
793
794                         }       
795                         
796                         m->mothurOut("\nFinalizing...\n");
797                         fill();
798                         setOTUs();
799                         
800                         vector<int> otuCounts(numOTUs, 0);
801                         for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
802                         
803                         calcCentroidsDriver(0, numOTUs);
804                         writeQualities(otuCounts);
805                         writeSequences(otuCounts);
806                         writeNames(otuCounts);
807                         writeClusters(otuCounts);
808                         writeGroups();
809                         
810                         m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
811                 }
812                 
813                 if(compositeFASTAFileName != ""){
814                         outputNames.push_back(compositeFASTAFileName);
815                         outputNames.push_back(compositeNamesFileName);
816                 }
817
818                 m->mothurOutEndLine();
819                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
820                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
821                 m->mothurOutEndLine();
822                 
823                 return 0;
824         }
825         catch(exception& e) {
826                 m->errorOut(e, "ShhherCommand", "execute");
827                 exit(1);
828         }
829 }
830 #endif
831 /**************************************************************************************************/
832
833 void ShhherCommand::getFlowData(){
834         try{
835                 ifstream flowFile;
836                 m->openInputFile(flowFileName, flowFile);
837                 
838                 string seqName;
839                 seqNameVector.clear();
840                 lengths.clear();
841                 flowDataIntI.clear();
842                 nameMap.clear();
843                 
844                 
845                 int currentNumFlowCells;
846                 
847                 float intensity;
848                 
849                 flowFile >> numFlowCells;
850                 int index = 0;//pcluster
851                 while(!flowFile.eof()){
852                         flowFile >> seqName >> currentNumFlowCells;
853                         lengths.push_back(currentNumFlowCells);
854
855                         seqNameVector.push_back(seqName);
856                         nameMap[seqName] = index++;//pcluster
857
858                         for(int i=0;i<numFlowCells;i++){
859                                 flowFile >> intensity;
860                                 if(intensity > 9.99)    {       intensity = 9.99;       }
861                                 int intI = int(100 * intensity + 0.0001);
862                                 flowDataIntI.push_back(intI);
863                         }
864                         m->gobble(flowFile);
865                 }
866                 flowFile.close();
867                 
868                 numSeqs = seqNameVector.size();         
869                 
870                 for(int i=0;i<numSeqs;i++){
871                         int iNumFlowCells = i * numFlowCells;
872                         for(int j=lengths[i];j<numFlowCells;j++){
873                                 flowDataIntI[iNumFlowCells + j] = 0;
874                         }
875                 }
876                 
877         }
878         catch(exception& e) {
879                 m->errorOut(e, "ShhherCommand", "getFlowData");
880                 exit(1);
881         }
882 }
883
884 /**************************************************************************************************/
885
886 void ShhherCommand::getSingleLookUp(){
887         try{
888                 //      these are the -log probabilities that a signal corresponds to a particular homopolymer length
889                 singleLookUp.assign(HOMOPS * NUMBINS, 0);
890                 
891                 int index = 0;
892                 ifstream lookUpFile;
893                 m->openInputFile(lookupFileName, lookUpFile);
894                 
895                 for(int i=0;i<HOMOPS;i++){
896                         float logFracFreq;
897                         lookUpFile >> logFracFreq;
898                         
899                         for(int j=0;j<NUMBINS;j++)      {
900                                 lookUpFile >> singleLookUp[index];
901                                 index++;
902                         }
903                 }       
904                 lookUpFile.close();
905         }
906         catch(exception& e) {
907                 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
908                 exit(1);
909         }
910 }
911
912 /**************************************************************************************************/
913
914 void ShhherCommand::getJointLookUp(){
915         try{
916                 
917                 //      the most likely joint probability (-log) that two intenities have the same polymer length
918                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
919                 
920                 for(int i=0;i<NUMBINS;i++){
921                         for(int j=0;j<NUMBINS;j++){             
922                                 
923                                 double minSum = 100000000;
924                                 
925                                 for(int k=0;k<HOMOPS;k++){
926                                         double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
927                                         
928                                         if(sum < minSum)        {       minSum = sum;           }
929                                 }       
930                                 jointLookUp[i * NUMBINS + j] = minSum;
931                         }
932                 }
933         }
934         catch(exception& e) {
935                 m->errorOut(e, "ShhherCommand", "getJointLookUp");
936                 exit(1);
937         }
938 }
939
940 /**************************************************************************************************/
941
942 double ShhherCommand::getProbIntensity(int intIntensity){                          
943         try{
944
945                 double minNegLogProb = 100000000; 
946
947                 
948                 for(int i=0;i<HOMOPS;i++){//loop signal strength
949                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
950                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
951                 }
952                 
953                 return minNegLogProb;
954         }
955         catch(exception& e) {
956                 m->errorOut(e, "ShhherCommand", "getProbIntensity");
957                 exit(1);
958         }
959 }
960
961 /**************************************************************************************************/
962
963 void ShhherCommand::getUniques(){
964         try{
965                 
966                 
967                 numUniques = 0;
968                 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
969                 uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
970                 uniqueLengths.assign(numSeqs, 0);
971                 mapSeqToUnique.assign(numSeqs, -1);
972                 mapUniqueToSeq.assign(numSeqs, -1);
973                 
974                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
975                 
976                 for(int i=0;i<numSeqs;i++){
977                         int index = 0;
978                         
979                         vector<short> current(numFlowCells);
980                         for(int j=0;j<numFlowCells;j++){
981                                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
982                         }
983                                                 
984                         for(int j=0;j<numUniques;j++){
985                                 int offset = j * numFlowCells;
986                                 bool toEnd = 1;
987                                 
988                                 int shorterLength;
989                                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
990                                 else                                                            {       shorterLength = uniqueLengths[j];       }
991
992                                 for(int k=0;k<shorterLength;k++){
993                                         if(current[k] != uniqueFlowgrams[offset + k]){
994                                                 toEnd = 0;
995                                                 break;
996                                         }
997                                 }
998                                 
999                                 if(toEnd){
1000                                         mapSeqToUnique[i] = j;
1001                                         uniqueCount[j]++;
1002                                         index = j;
1003                                         if(lengths[i] > uniqueLengths[j])       {       uniqueLengths[j] = lengths[i];  }
1004                                         break;
1005                                 }
1006                                 index++;
1007                         }
1008                         
1009                         if(index == numUniques){
1010                                 uniqueLengths[numUniques] = lengths[i];
1011                                 uniqueCount[numUniques] = 1;
1012                                 mapSeqToUnique[i] = numUniques;//anMap
1013                                 mapUniqueToSeq[numUniques] = i;//anF
1014                                 
1015                                 for(int k=0;k<numFlowCells;k++){
1016                                         uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
1017                                         uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
1018                                 }
1019                                 
1020                                 numUniques++;
1021                         }
1022                 }
1023                 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
1024                 uniqueLengths.resize(numUniques);       
1025                 
1026                 flowDataPrI.resize(numSeqs * numFlowCells, 0);
1027                 for(int i=0;i<flowDataPrI.size();i++)   {       flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);             }
1028         }
1029         catch(exception& e) {
1030                 m->errorOut(e, "ShhherCommand", "getUniques");
1031                 exit(1);
1032         }
1033 }
1034
1035 /**************************************************************************************************/
1036
1037 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
1038         try{
1039                 int minLength = lengths[mapSeqToUnique[seqA]];
1040                 if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
1041                 
1042                 int ANumFlowCells = seqA * numFlowCells;
1043                 int BNumFlowCells = seqB * numFlowCells;
1044                 
1045                 float dist = 0;
1046                 
1047                 for(int i=0;i<minLength;i++){
1048                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
1049                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
1050                         
1051                         int flowBIntI = flowDataIntI[BNumFlowCells + i];
1052                         float flowBPrI = flowDataPrI[BNumFlowCells + i];
1053                         dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
1054                 }
1055                 
1056                 dist /= (float) minLength;
1057                 return dist;
1058         }
1059         catch(exception& e) {
1060                 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
1061                 exit(1);
1062         }
1063 }
1064
1065 /**************************************************************************************************/
1066
1067 void ShhherCommand::flowDistParentFork(string distFileName, int startSeq, int stopSeq){
1068         try{            
1069
1070                 ostringstream outStream;
1071                 outStream.setf(ios::fixed, ios::floatfield);
1072                 outStream.setf(ios::dec, ios::basefield);
1073                 outStream.setf(ios::showpoint);
1074                 outStream.precision(6);
1075                 
1076                 int begTime = time(NULL);
1077                 double begClock = clock();
1078
1079                 for(int i=startSeq;i<stopSeq;i++){
1080                         for(int j=0;j<i;j++){
1081                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
1082
1083                                 if(flowDistance < 1e-6){
1084                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
1085                                 }
1086                                 else if(flowDistance <= cutoff){
1087                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
1088                                 }
1089                         }
1090                         if(i % 100 == 0){
1091                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
1092                                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
1093                                 m->mothurOutEndLine();
1094                         }
1095                 }
1096                 m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
1097                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
1098                 m->mothurOutEndLine();
1099                 
1100                 ofstream distFile(distFileName.c_str());
1101                 distFile << outStream.str();            
1102                 distFile.close();
1103         }
1104         catch(exception& e) {
1105                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
1106                 exit(1);
1107         }
1108 }
1109
1110 /**************************************************************************************************/
1111
1112 string ShhherCommand::createDistFile(int processors){
1113         try{
1114                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
1115                                 
1116                 unsigned long int begTime = time(NULL);
1117                 double begClock = clock();
1118
1119                 vector<int> start;
1120                 vector<int> end;
1121                 
1122 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1123                 if(processors == 1)     {       flowDistParentFork(fDistFileName, 0, numUniques);               }
1124                 else{ //you have multiple processors
1125                         
1126                         if (numSeqs < processors){      processors = 1; }
1127                         
1128                         vector<int> start(processors, 0);
1129                         vector<int> end(processors, 0);
1130                         
1131                         for (int i = 0; i < processors; i++) {
1132                                 start[i] = int(sqrt(float(i)/float(processors)) * numUniques);
1133                                 end[i] = int(sqrt(float(i+1)/float(processors)) * numUniques);
1134                         }
1135                         
1136                         int process = 1;
1137                         vector<int> processIDs;
1138                         
1139                         //loop through and create all the processes you want
1140                         while (process != processors) {
1141                                 int pid = fork();
1142                                 
1143                                 if (pid > 0) {
1144                                         processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1145                                         process++;
1146                                 }else if (pid == 0){
1147                                         flowDistParentFork(fDistFileName + toString(getpid()) + ".temp", start[process], end[process]);
1148                                         exit(0);
1149                                 }else { 
1150                                         m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
1151                                         perror(" : ");
1152                                         for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
1153                                         exit(0);
1154                                 }
1155                         }
1156                         
1157                         //parent does its part
1158                         flowDistParentFork(fDistFileName, start[0], end[0]);
1159                         
1160                         //force parent to wait until all the processes are done
1161                         for (int i=0;i<processIDs.size();i++) { 
1162                                 int temp = processIDs[i];
1163                                 wait(&temp);
1164                         }
1165                         
1166                         //append and remove temp files
1167                         for (int i=0;i<processIDs.size();i++) { 
1168                                 m->appendFiles((fDistFileName + toString(processIDs[i]) + ".temp"), fDistFileName);
1169                                 remove((fDistFileName + toString(processIDs[i]) + ".temp").c_str());
1170                         }
1171                         
1172                 }
1173                 
1174 #else
1175                 flowDistParentFork(fDistFileName, 0, numUniques);
1176 #endif
1177
1178                 m->mothurOutEndLine();
1179                 
1180                 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
1181                 
1182
1183                 return fDistFileName;
1184         }
1185         catch(exception& e) {
1186                 m->errorOut(e, "ShhherCommand", "createDistFile");
1187                 exit(1);
1188         }
1189         
1190 }
1191
1192 /**************************************************************************************************/
1193
1194 string ShhherCommand::createNamesFile(){
1195         try{
1196                 
1197                 vector<string> duplicateNames(numUniques, "");
1198                 for(int i=0;i<numSeqs;i++){
1199                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
1200                 }
1201                 
1202                 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
1203                 
1204                 ofstream nameFile;
1205                 m->openOutputFile(nameFileName, nameFile);
1206                 
1207                 for(int i=0;i<numUniques;i++){
1208 //                      nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
1209                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
1210                 }
1211                 
1212                 nameFile.close();
1213                 return  nameFileName;
1214         }
1215         catch(exception& e) {
1216                 m->errorOut(e, "ShhherCommand", "createNamesFile");
1217                 exit(1);
1218         }
1219 }
1220
1221 //**********************************************************************************************************************
1222
1223 string ShhherCommand::cluster(string distFileName, string namesFileName){
1224         try {
1225                 
1226                 ReadMatrix* read = new ReadColumnMatrix(distFileName);  
1227                 read->setCutoff(cutoff);
1228                 
1229                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1230                 clusterNameMap->readMap();
1231                 read->read(clusterNameMap);
1232                 
1233                 ListVector* list = read->getListVector();
1234                 SparseMatrix* matrix = read->getMatrix();
1235                 
1236                 delete read; 
1237                 delete clusterNameMap; 
1238                                 
1239                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1240                 
1241                 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
1242                 string tag = cluster->getTag();
1243                 
1244                 double clusterCutoff = cutoff;
1245                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1246                         cluster->update(clusterCutoff);
1247                 }
1248                 
1249                 list->setLabel(toString(cutoff));
1250                 
1251                 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
1252                 ofstream listFile;
1253                 m->openOutputFile(listFileName, listFile);
1254                 list->print(listFile);
1255                 listFile.close();
1256                 
1257                 delete matrix;  delete cluster; delete rabund; delete list;
1258         
1259                 return listFileName;
1260         }
1261         catch(exception& e) {
1262                 m->errorOut(e, "ShhherCommand", "cluster");
1263                 exit(1);        
1264         }               
1265 }
1266
1267 /**************************************************************************************************/
1268
1269 void ShhherCommand::getOTUData(string listFileName){
1270         try {
1271
1272                 ifstream listFile;
1273                 m->openInputFile(listFileName, listFile);
1274                 string label;
1275                 
1276                 listFile >> label >> numOTUs;
1277
1278                 otuData.assign(numSeqs, 0);
1279                 cumNumSeqs.assign(numOTUs, 0);
1280                 nSeqsPerOTU.assign(numOTUs, 0);
1281                 aaP.clear();aaP.resize(numOTUs);
1282                 
1283                 seqNumber.clear();
1284                 aaI.clear();
1285                 seqIndex.clear();
1286                 
1287                 string singleOTU = "";
1288                 
1289                 for(int i=0;i<numOTUs;i++){
1290
1291                         listFile >> singleOTU;
1292                         
1293                         istringstream otuString(singleOTU);
1294
1295                         while(otuString){
1296                                 
1297                                 string seqName = "";
1298                                 
1299                                 for(int j=0;j<singleOTU.length();j++){
1300                                         char letter = otuString.get();
1301                                         
1302                                         if(letter != ','){
1303                                                 seqName += letter;
1304                                         }
1305                                         else{
1306                                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
1307                                                 int index = nmIt->second;
1308                                                 
1309                                                 nameMap.erase(nmIt);
1310                                                 
1311                                                 otuData[index] = i;
1312                                                 nSeqsPerOTU[i]++;
1313                                                 aaP[i].push_back(index);
1314                                                 seqName = "";
1315                                         }
1316                                 }
1317                                 
1318                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
1319
1320                                 int index = nmIt->second;
1321                                 nameMap.erase(nmIt);
1322
1323                                 otuData[index] = i;
1324                                 nSeqsPerOTU[i]++;
1325                                 aaP[i].push_back(index);        
1326                                 
1327                                 otuString.get();
1328                         }
1329                         
1330                         sort(aaP[i].begin(), aaP[i].end());
1331                         for(int j=0;j<nSeqsPerOTU[i];j++){
1332                                 seqNumber.push_back(aaP[i][j]);
1333                         }
1334                         for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
1335                                 aaP[i].push_back(0);
1336                         }
1337                         
1338                         
1339                 }
1340                 
1341                 for(int i=1;i<numOTUs;i++){
1342                         cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
1343                 }
1344                 aaI = aaP;
1345                 seqIndex = seqNumber;
1346                 
1347                 listFile.close();       
1348                 
1349         }
1350         catch(exception& e) {
1351                 m->errorOut(e, "ShhherCommand", "getOTUData");
1352                 exit(1);        
1353         }               
1354 }
1355
1356 /**************************************************************************************************/
1357
1358 void ShhherCommand::initPyroCluster(){                          
1359         try{
1360                 dist.assign(numSeqs * numOTUs, 0);
1361                 change.assign(numOTUs, 1);
1362                 centroids.assign(numOTUs, -1);
1363                 weight.assign(numOTUs, 0);
1364                 singleTau.assign(numSeqs, 1.0);
1365                 
1366                 nSeqsBreaks.assign(processors+1, 0);
1367                 nOTUsBreaks.assign(processors+1, 0);
1368
1369                 nSeqsBreaks[0] = 0;
1370                 for(int i=0;i<processors;i++){
1371                         nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
1372                         nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
1373                 }
1374                 nSeqsBreaks[processors] = numSeqs;
1375                 nOTUsBreaks[processors] = numOTUs;
1376         }
1377         catch(exception& e) {
1378                 m->errorOut(e, "ShhherCommand", "initPyroCluster");
1379                 exit(1);        
1380         }
1381 }
1382
1383 /**************************************************************************************************/
1384
1385 void ShhherCommand::fill(){
1386         try {
1387                 int index = 0;
1388                 for(int i=0;i<numOTUs;i++){
1389                         cumNumSeqs[i] = index;
1390                         for(int j=0;j<nSeqsPerOTU[i];j++){
1391                                 seqNumber[index] = aaP[i][j];
1392                                 seqIndex[index] = aaI[i][j];
1393                                 
1394                                 index++;
1395                         }
1396                 }
1397         }
1398         catch(exception& e) {
1399                 m->errorOut(e, "ShhherCommand", "fill");
1400                 exit(1);        
1401         }               
1402 }
1403
1404 /**************************************************************************************************/
1405
1406 void ShhherCommand::calcCentroids(){                          
1407         try{
1408                 
1409 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1410
1411                 if(processors == 1)     {
1412                         calcCentroidsDriver(0, numOTUs);                
1413                 }
1414                 else{ //you have multiple processors
1415                         if (numOTUs < processors){      processors = 1; }
1416                         
1417                         int process = 1;
1418                         vector<int> processIDs;
1419                         
1420                         //loop through and create all the processes you want
1421                         while (process != processors) {
1422                                 int pid = vfork();
1423                                 
1424                                 if (pid > 0) {
1425                                         processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1426                                         process++;
1427                                 }else if (pid == 0){
1428                                         calcCentroidsDriver(nOTUsBreaks[process], nOTUsBreaks[process+1]);
1429                                         exit(0);
1430                                 }else { 
1431                                         m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
1432                                         perror(" : ");
1433                                         for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
1434                                         exit(0);
1435                                 }
1436                         }
1437                         
1438                         //parent does its part
1439                         calcCentroidsDriver(nOTUsBreaks[0], nOTUsBreaks[1]);
1440
1441                         //force parent to wait until all the processes are done
1442                         for (int i=0;i<processIDs.size();i++) { 
1443                                 int temp = processIDs[i];
1444                                 wait(&temp);
1445                         }
1446                 }
1447                 
1448 #else
1449                 calcCentroidsDriver(0, numOTUs);
1450 #endif          
1451         }
1452         catch(exception& e) {
1453                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1454                 exit(1);        
1455         }               
1456 }
1457
1458 /**************************************************************************************************/
1459
1460 void ShhherCommand::calcCentroidsDriver(int start, int finish){                          
1461         
1462         //this function gets the most likely homopolymer length at a flow position for a group of sequences
1463         //within an otu
1464         
1465         try{
1466                 
1467         
1468                 for(int i=start;i<finish;i++){
1469                         
1470                         double count = 0;
1471                         int position = 0;
1472                         int minFlowGram = 100000000;
1473                         double minFlowValue = 1e8;
1474                         change[i] = 0; //FALSE
1475                         
1476                         for(int j=0;j<nSeqsPerOTU[i];j++){
1477                                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1478                         }
1479
1480                         if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1481                                 vector<double> adF(nSeqsPerOTU[i]);
1482                                 vector<int> anL(nSeqsPerOTU[i]);
1483                                 
1484                                 for(int j=0;j<nSeqsPerOTU[i];j++){
1485                                         int index = cumNumSeqs[i] + j;
1486                                         int nI = seqIndex[index];
1487                                         int nIU = mapSeqToUnique[nI];
1488                                         
1489                                         int k;
1490                                         for(k=0;k<position;k++){
1491                                                 if(nIU == anL[k]){
1492                                                         break;
1493                                                 }
1494                                         }
1495                                         if(k == position){
1496                                                 anL[position] = nIU;
1497                                                 adF[position] = 0.0000;
1498                                                 position++;
1499                                         }                                               
1500                                 }
1501                                 
1502                                 for(int j=0;j<nSeqsPerOTU[i];j++){
1503                                         int index = cumNumSeqs[i] + j;
1504                                         int nI = seqIndex[index];
1505                                         
1506                                         double tauValue = singleTau[seqNumber[index]];
1507                                         
1508                                         for(int k=0;k<position;k++){
1509                                                 double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1510                                                 adF[k] += dist * tauValue;
1511                                         }
1512                                 }
1513                                 
1514                                 for(int j=0;j<position;j++){
1515                                         if(adF[j] < minFlowValue){
1516                                                 minFlowGram = j;
1517                                                 minFlowValue = adF[j];
1518                                         }
1519                                 }
1520                                 
1521                                 if(centroids[i] != anL[minFlowGram]){
1522                                         change[i] = 1;
1523                                         centroids[i] = anL[minFlowGram];
1524                                 }
1525                         }
1526                         else if(centroids[i] != -1){
1527                                 change[i] = 1;
1528                                 centroids[i] = -1;                      
1529                         }
1530                 }
1531         }
1532         catch(exception& e) {
1533                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1534                 exit(1);        
1535         }               
1536 }
1537
1538 /**************************************************************************************************/
1539
1540 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1541         try{
1542                 
1543                 int flowAValue = cent * numFlowCells;
1544                 int flowBValue = flow * numFlowCells;
1545                 
1546                 double dist = 0;
1547                 
1548                 for(int i=0;i<length;i++){
1549                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1550                         flowAValue++;
1551                         flowBValue++;
1552                 }
1553                 
1554                 return dist / (double)length;
1555         }
1556         catch(exception& e) {
1557                 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1558                 exit(1);        
1559         }               
1560 }
1561
1562 /**************************************************************************************************/
1563
1564 double ShhherCommand::getNewWeights(){
1565         try{
1566                 
1567                 double maxChange = 0;
1568                 
1569                 for(int i=0;i<numOTUs;i++){
1570                         
1571                         double difference = weight[i];
1572                         weight[i] = 0;
1573                         
1574                         for(int j=0;j<nSeqsPerOTU[i];j++){
1575                                 int index = cumNumSeqs[i] + j;
1576                                 double tauValue = singleTau[seqNumber[index]];
1577                                 weight[i] += tauValue;
1578                         }
1579                         
1580                         difference = fabs(weight[i] - difference);
1581                         if(difference > maxChange){     maxChange = difference; }
1582                 }
1583                 return maxChange;
1584         }
1585         catch(exception& e) {
1586                 m->errorOut(e, "ShhherCommand", "getNewWeights");
1587                 exit(1);        
1588         }               
1589 }
1590
1591 /**************************************************************************************************/
1592
1593 double ShhherCommand::getLikelihood(){
1594         
1595         try{
1596                 
1597                 vector<long double> P(numSeqs, 0);
1598                 int effNumOTUs = 0;
1599                 
1600                 for(int i=0;i<numOTUs;i++){
1601                         if(weight[i] > MIN_WEIGHT){
1602                                 effNumOTUs++;
1603                         }
1604                 }
1605                 
1606                 string hold;
1607                 for(int i=0;i<numOTUs;i++){
1608                         for(int j=0;j<nSeqsPerOTU[i];j++){
1609                                 int index = cumNumSeqs[i] + j;
1610                                 int nI = seqIndex[index];
1611                                 double singleDist = dist[seqNumber[index]];
1612                                 
1613                                 P[nI] += weight[i] * exp(-singleDist * sigma);
1614                         }
1615                 }
1616                 double nLL = 0.00;
1617                 for(int i=0;i<numSeqs;i++){
1618                         if(P[i] == 0){  P[i] = DBL_EPSILON;     }
1619
1620                         nLL += -log(P[i]);
1621                 }
1622                 
1623                 nLL = nLL -(double)numSeqs * log(sigma);
1624
1625                 return nLL; 
1626         }
1627         catch(exception& e) {
1628                 m->errorOut(e, "ShhherCommand", "getNewWeights");
1629                 exit(1);        
1630         }               
1631 }
1632
1633 /**************************************************************************************************/
1634
1635 void ShhherCommand::checkCentroids(){
1636         try{
1637                 vector<int> unique(numOTUs, 1);
1638                 
1639                 for(int i=0;i<numOTUs;i++){
1640                         if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1641                                 unique[i] = -1;
1642                         }
1643                 }
1644                 
1645                 for(int i=0;i<numOTUs;i++){
1646                         if(unique[i] == 1){
1647                                 for(int j=i+1;j<numOTUs;j++){
1648                                         if(unique[j] == 1){
1649                                                 
1650                                                 if(centroids[j] == centroids[i]){
1651                                                         unique[j] = 0;
1652                                                         centroids[j] = -1;
1653                                                         
1654                                                         weight[i] += weight[j];
1655                                                         weight[j] = 0.0;
1656                                                 }
1657                                         }
1658                                 }
1659                         }
1660                 }
1661         }
1662         catch(exception& e) {
1663                 m->errorOut(e, "ShhherCommand", "checkCentroids");
1664                 exit(1);        
1665         }               
1666 }
1667
1668 /**************************************************************************************************/
1669
1670 void ShhherCommand::calcNewDistances(){                          
1671         try{
1672                 
1673 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1674                 
1675                 if(processors == 1)     {
1676                         calcNewDistancesParent(0, numSeqs);             
1677                 }
1678                 else{ //you have multiple processors
1679                         if (numSeqs < processors){      processors = 1; }
1680                         
1681                         vector<vector<int> > child_otuIndex(processors);
1682                         vector<vector<int> > child_seqIndex(processors);
1683                         vector<vector<double> > child_singleTau(processors);                    
1684                         vector<int> totals(processors);
1685                         
1686                         int process = 1;
1687                         vector<int> processIDs;
1688
1689                         //loop through and create all the processes you want
1690                         while (process != processors) {
1691                                 int pid = vfork();
1692                                 
1693                                 if (pid > 0) {
1694                                         processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1695                                         process++;
1696                                 }else if (pid == 0){
1697                                         calcNewDistancesChild(nSeqsBreaks[process], nSeqsBreaks[process+1], child_otuIndex[process], child_seqIndex[process], child_singleTau[process]);
1698                                         totals[process] = child_otuIndex[process].size();
1699
1700                                         exit(0);
1701                                 }else { 
1702                                         m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
1703                                         perror(" : ");
1704                                         for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
1705                                         exit(0);
1706                                 }
1707                         }
1708                         
1709                         //parent does its part
1710                         calcNewDistancesParent(nSeqsBreaks[0], nSeqsBreaks[1]);
1711                         int total = seqIndex.size();
1712
1713                         //force parent to wait until all the processes are done
1714                         for (int i=0;i<processIDs.size();i++) { 
1715                                 int temp = processIDs[i];
1716                                 wait(&temp);
1717                         }
1718
1719                         for(int i=1;i<processors;i++){
1720                                 int oldTotal = total;
1721                                 total += totals[i];
1722
1723                                 singleTau.resize(total, 0);
1724                                 seqIndex.resize(total, 0);
1725                                 seqNumber.resize(total, 0);
1726                                 
1727                                 int childIndex = 0;
1728                                 
1729                                 for(int j=oldTotal;j<total;j++){
1730                                         int otuI = child_otuIndex[i][childIndex];
1731                                         int seqI = child_seqIndex[i][childIndex];
1732
1733                                         singleTau[j] = child_singleTau[i][childIndex];
1734                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
1735                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
1736                                         nSeqsPerOTU[otuI]++;
1737
1738                                         childIndex++;
1739                                 }
1740                         }
1741                 }
1742 #else
1743                 calcNewDistancesParent(0, numSeqs);             
1744 #endif          
1745         }
1746         catch(exception& e) {
1747                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
1748                 exit(1);        
1749         }               
1750 }
1751
1752 /**************************************************************************************************/
1753 #ifdef USE_MPI
1754 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1755         
1756         try{
1757                 vector<double> newTau(numOTUs,0);
1758                 vector<double> norms(numSeqs, 0);
1759                 otuIndex.clear();
1760                 seqIndex.clear();
1761                 singleTau.clear();
1762                 
1763                 
1764                 
1765                 for(int i=startSeq;i<stopSeq;i++){
1766                         double offset = 1e8;
1767                         int indexOffset = i * numOTUs;
1768                         
1769                         for(int j=0;j<numOTUs;j++){
1770                                 
1771                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1772                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1773                                 }
1774                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1775                                         offset = dist[indexOffset + j];
1776                                 }
1777                         }
1778                         
1779                         for(int j=0;j<numOTUs;j++){
1780                                 if(weight[j] > MIN_WEIGHT){
1781                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1782                                         norms[i] += newTau[j];
1783                                 }
1784                                 else{
1785                                         newTau[j] = 0.0;
1786                                 }
1787                         }
1788                         
1789                         for(int j=0;j<numOTUs;j++){
1790
1791                                 newTau[j] /= norms[i];
1792                                 
1793                                 if(newTau[j] > MIN_TAU){
1794                                         otuIndex.push_back(j);
1795                                         seqIndex.push_back(i);
1796                                         singleTau.push_back(newTau[j]);
1797                                 }
1798                         }
1799                         
1800                 }
1801         }
1802         catch(exception& e) {
1803                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1804                 exit(1);        
1805         }               
1806 }
1807 #endif
1808 /**************************************************************************************************/
1809
1810 void ShhherCommand::calcNewDistancesChild(int startSeq, int stopSeq, vector<int>& child_otuIndex, vector<int>& child_seqIndex, vector<double>& child_singleTau){
1811         
1812         try{
1813                 vector<double> newTau(numOTUs,0);
1814                 vector<double> norms(numSeqs, 0);
1815                 child_otuIndex.resize(0);
1816                 child_seqIndex.resize(0);
1817                 child_singleTau.resize(0);
1818                 
1819                 for(int i=startSeq;i<stopSeq;i++){
1820                         double offset = 1e8;
1821                         int indexOffset = i * numOTUs;
1822                         
1823                         
1824                         for(int j=0;j<numOTUs;j++){
1825                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1826                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1827                                 }
1828                                 
1829                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1830                                         offset = dist[indexOffset + j];
1831                                 }
1832                         }
1833                         
1834                         for(int j=0;j<numOTUs;j++){
1835                                 if(weight[j] > MIN_WEIGHT){
1836                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1837                                         norms[i] += newTau[j];
1838                                 }
1839                                 else{
1840                                         newTau[j] = 0.0;
1841                                 }
1842                         }
1843                         
1844                         for(int j=0;j<numOTUs;j++){
1845                                 newTau[j] /= norms[i];
1846                                 
1847                                 if(newTau[j] > MIN_TAU){
1848                                         child_otuIndex.push_back(j);
1849                                         child_seqIndex.push_back(i);
1850                                         child_singleTau.push_back(newTau[j]);
1851                                 }
1852                         }
1853                 }
1854         }
1855         catch(exception& e) {
1856                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChild");
1857                 exit(1);        
1858         }               
1859 }
1860
1861 /**************************************************************************************************/
1862
1863 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1864         
1865         try{
1866                 
1867                 int total = 0;
1868                 vector<double> newTau(numOTUs,0);
1869                 vector<double> norms(numSeqs, 0);
1870                 nSeqsPerOTU.assign(numOTUs, 0);
1871                 
1872                 for(int i=startSeq;i<stopSeq;i++){
1873                         int indexOffset = i * numOTUs;
1874                         
1875                         double offset = 1e8;
1876                         
1877                         for(int j=0;j<numOTUs;j++){
1878                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1879                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1880                                 }
1881                                 
1882                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1883                                         offset = dist[indexOffset + j];
1884                                 }
1885                         }
1886                         
1887                         for(int j=0;j<numOTUs;j++){
1888                                 if(weight[j] > MIN_WEIGHT){
1889                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1890                                         norms[i] += newTau[j];
1891                                 }
1892                                 else{
1893                                         newTau[j] = 0.0;
1894                                 }
1895                         }
1896                         
1897                         for(int j=0;j<numOTUs;j++){
1898                                 newTau[j] /= norms[i];
1899                         }
1900                         
1901                         for(int j=0;j<numOTUs;j++){
1902                                 if(newTau[j] > MIN_TAU){
1903                                         
1904                                         int oldTotal = total;
1905                                         
1906                                         total++;
1907                                         
1908                                         singleTau.resize(total, 0);
1909                                         seqNumber.resize(total, 0);
1910                                         seqIndex.resize(total, 0);
1911                                         
1912                                         singleTau[oldTotal] = newTau[j];
1913                                         
1914                                         aaP[j][nSeqsPerOTU[j]] = oldTotal;
1915                                         aaI[j][nSeqsPerOTU[j]] = i;
1916                                         nSeqsPerOTU[j]++;
1917                                 }
1918                         }
1919                 }
1920         }
1921         catch(exception& e) {
1922                 m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1923                 exit(1);        
1924         }               
1925 }
1926
1927 /**************************************************************************************************/
1928
1929 void ShhherCommand::setOTUs(){
1930         
1931         try {
1932                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1933                 
1934                 for(int i=0;i<numOTUs;i++){
1935                         for(int j=0;j<nSeqsPerOTU[i];j++){
1936                                 int index = cumNumSeqs[i] + j;
1937                                 double tauValue = singleTau[seqNumber[index]];
1938                                 int sIndex = seqIndex[index];
1939                                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
1940                         }
1941                 }
1942                 
1943                 for(int i=0;i<numSeqs;i++){
1944                         double maxTau = -1.0000;
1945                         int maxOTU = -1;
1946                         for(int j=0;j<numOTUs;j++){
1947                                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1948                                         maxTau = bigTauMatrix[i * numOTUs + j];
1949                                         maxOTU = j;
1950                                 }
1951                         }
1952                         
1953                         otuData[i] = maxOTU;
1954                 }
1955                 
1956                 nSeqsPerOTU.assign(numOTUs, 0);         
1957                 
1958                 for(int i=0;i<numSeqs;i++){
1959                         int index = otuData[i];
1960                         
1961                         singleTau[i] = 1.0000;
1962                         dist[i] = 0.0000;
1963                         
1964                         aaP[index][nSeqsPerOTU[index]] = i;
1965                         aaI[index][nSeqsPerOTU[index]] = i;
1966                         
1967                         nSeqsPerOTU[index]++;
1968                 }
1969                 fill(); 
1970         }
1971         catch(exception& e) {
1972                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
1973                 exit(1);        
1974         }               
1975 }
1976
1977 /**************************************************************************************************/
1978
1979 void ShhherCommand::writeQualities(vector<int> otuCounts){
1980         
1981         try {
1982                 string qualityFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.qual";
1983
1984                 ofstream qualityFile;
1985                 m->openOutputFile(qualityFileName, qualityFile);
1986
1987                 qualityFile.setf(ios::fixed, ios::floatfield);
1988                 qualityFile.setf(ios::showpoint);
1989                 qualityFile << setprecision(6);
1990                 
1991                 vector<vector<int> > qualities(numOTUs);
1992                 vector<double> pr(HOMOPS, 0);
1993                 
1994                 
1995                 for(int i=0;i<numOTUs;i++){
1996                         int index = 0;
1997                         int base = 0;
1998                         
1999                         if(nSeqsPerOTU[i] > 0){
2000                                 qualities[i].assign(1024, -1);
2001                                 
2002                                 while(index < numFlowCells){
2003                                         double maxPrValue = 1e8;
2004                                         short maxPrIndex = -1;
2005                                         double count = 0.0000;
2006                                         
2007                                         pr.assign(HOMOPS, 0);
2008                                         
2009                                         for(int j=0;j<nSeqsPerOTU[i];j++){
2010                                                 int lIndex = cumNumSeqs[i] + j;
2011                                                 double tauValue = singleTau[seqNumber[lIndex]];
2012                                                 int sequenceIndex = aaI[i][j];
2013                                                 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
2014                                                 
2015                                                 count += tauValue;
2016                                                 
2017                                                 for(int s=0;s<HOMOPS;s++){
2018                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
2019                                                 }
2020                                         }
2021                                         
2022                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
2023                                         maxPrValue = pr[maxPrIndex];
2024                                         
2025                                         if(count > MIN_COUNT){
2026                                                 double U = 0.0000;
2027                                                 double norm = 0.0000;
2028                                                 
2029                                                 for(int s=0;s<HOMOPS;s++){
2030                                                         norm += exp(-(pr[s] - maxPrValue));
2031                                                 }
2032                                                 
2033                                                 for(int s=1;s<=maxPrIndex;s++){
2034                                                         int value = 0;
2035                                                         double temp = 0.0000;
2036                                                         
2037                                                         U += exp(-(pr[s-1]-maxPrValue))/norm;
2038                                                         
2039                                                         if(U>0.00){
2040                                                                 temp = log10(U);
2041                                                         }
2042                                                         else{
2043                                                                 temp = -10.1;
2044                                                         }
2045                                                         temp = floor(-10 * temp);
2046                                                         value = (int)floor(temp);
2047                                                         if(value > 100){        value = 100;    }
2048                                                         
2049                                                         qualities[i][base] = (int)value;
2050                                                         base++;
2051                                                 }
2052                                         }
2053                                         
2054                                         index++;
2055                                 }
2056                         }
2057                         
2058                         
2059                         if(otuCounts[i] > 0){
2060                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
2061                                 
2062                                 int j=4;        //need to get past the first four bases
2063                                 while(qualities[i][j] != -1){
2064                                         qualityFile << qualities[i][j] << ' ';
2065                                         j++;
2066                                 }
2067                                 qualityFile << endl;
2068                         }
2069                 }
2070                 qualityFile.close();
2071                 outputNames.push_back(qualityFileName);
2072
2073         }
2074         catch(exception& e) {
2075                 m->errorOut(e, "ShhherCommand", "writeQualities");
2076                 exit(1);        
2077         }               
2078 }
2079
2080 /**************************************************************************************************/
2081
2082 void ShhherCommand::writeSequences(vector<int> otuCounts){
2083         try {
2084                 
2085                 string fastaFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.fasta";
2086                 ofstream fastaFile;
2087                 m->openOutputFile(fastaFileName, fastaFile);
2088                 
2089                 vector<string> names(numOTUs, "");
2090                 
2091                 for(int i=0;i<numOTUs;i++){
2092                         int index = centroids[i];
2093                         
2094                         if(otuCounts[i] > 0){
2095                                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
2096                                 
2097                                 string newSeq = "";
2098                                 
2099                                 for(int j=0;j<numFlowCells;j++){
2100                                         
2101                                         char base = flowOrder[j % 4];
2102                                         for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
2103                                                 newSeq += base;
2104                                         }
2105                                 }
2106                                 
2107                                 fastaFile << newSeq.substr(4) << endl;
2108                         }
2109                 }
2110                 fastaFile.close();
2111
2112                 outputNames.push_back(fastaFileName);
2113
2114                 if(compositeFASTAFileName != ""){
2115                         m->appendFiles(fastaFileName, compositeFASTAFileName);
2116                 }
2117         }
2118         catch(exception& e) {
2119                 m->errorOut(e, "ShhherCommand", "writeSequences");
2120                 exit(1);        
2121         }               
2122 }
2123
2124 /**************************************************************************************************/
2125
2126 void ShhherCommand::writeNames(vector<int> otuCounts){
2127         try {
2128                 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
2129                 ofstream nameFile;
2130                 m->openOutputFile(nameFileName, nameFile);
2131                 
2132                 for(int i=0;i<numOTUs;i++){
2133                         if(otuCounts[i] > 0){
2134                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
2135                                 
2136                                 for(int j=1;j<nSeqsPerOTU[i];j++){
2137                                         nameFile << ',' << seqNameVector[aaI[i][j]];
2138                                 }
2139                                 
2140                                 nameFile << endl;
2141                         }
2142                 }
2143                 nameFile.close();
2144                 outputNames.push_back(nameFileName);
2145                 
2146                 
2147                 if(compositeNamesFileName != ""){
2148                         m->appendFiles(nameFileName, compositeNamesFileName);
2149                 }               
2150         }
2151         catch(exception& e) {
2152                 m->errorOut(e, "ShhherCommand", "writeNames");
2153                 exit(1);        
2154         }               
2155 }
2156
2157 /**************************************************************************************************/
2158
2159 void ShhherCommand::writeGroups(){
2160         try {
2161                 string fileRoot = flowFileName.substr(0,flowFileName.find_last_of('.'));
2162                 string groupFileName = fileRoot + ".shhh.groups";
2163                 ofstream groupFile;
2164                 m->openOutputFile(groupFileName, groupFile);
2165                 
2166                 for(int i=0;i<numSeqs;i++){
2167                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
2168                 }
2169                 groupFile.close();
2170                 outputNames.push_back(groupFileName);
2171
2172         }
2173         catch(exception& e) {
2174                 m->errorOut(e, "ShhherCommand", "writeGroups");
2175                 exit(1);        
2176         }               
2177 }
2178
2179 /**************************************************************************************************/
2180
2181 void ShhherCommand::writeClusters(vector<int> otuCounts){
2182         try {
2183                 string otuCountsFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.counts";
2184                 ofstream otuCountsFile;
2185                 m->openOutputFile(otuCountsFileName, otuCountsFile);
2186                 
2187                 string bases = flowOrder;
2188                 
2189                 for(int i=0;i<numOTUs;i++){
2190                         //output the translated version of the centroid sequence for the otu
2191                         if(otuCounts[i] > 0){
2192                                 int index = centroids[i];
2193                                 
2194                                 otuCountsFile << "ideal\t";
2195                                 for(int j=8;j<numFlowCells;j++){
2196                                         char base = bases[j % 4];
2197                                         for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
2198                                                 otuCountsFile << base;
2199                                         }
2200                                 }
2201                                 otuCountsFile << endl;
2202                                 
2203                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2204                                         int sequence = aaI[i][j];
2205                                         otuCountsFile << seqNameVector[sequence] << '\t';
2206                                         
2207                                         string newSeq = "";
2208                                         
2209                                         for(int k=0;k<lengths[sequence];k++){
2210                                                 char base = bases[k % 4];
2211                                                 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
2212                                                         
2213                                                 for(int s=0;s<freq;s++){
2214                                                         newSeq += base;
2215                                                         //otuCountsFile << base;
2216                                                 }
2217                                         }
2218                                         otuCountsFile << newSeq.substr(4) << endl;
2219                                 }
2220                                 otuCountsFile << endl;
2221                         }
2222                 }
2223                 otuCountsFile.close();
2224                 outputNames.push_back(otuCountsFileName);
2225
2226         }
2227         catch(exception& e) {
2228                 m->errorOut(e, "ShhherCommand", "writeClusters");
2229                 exit(1);        
2230         }               
2231 }
2232
2233 //**********************************************************************************************************************