]> git.donarmstrong.com Git - mothur.git/blob - shhhercommand.cpp
added check to make sure shhh.flows child processes finish properly. added subsamplin...
[mothur.git] / shhhercommand.cpp
1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "shhhercommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> ShhherCommand::setParameters(){  
14         try {
15                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pflow);
16                 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pfile);
17                 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plookup);
18                 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff);
19                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
20                 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter);
21         CommandParameter plarge("large", "Number", "", "-1", "", "", "",false,false); parameters.push_back(plarge);
22                 CommandParameter psigma("sigma", "Number", "", "60", "", "", "",false,false); parameters.push_back(psigma);
23                 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "",false,false); parameters.push_back(pmindelta);
24                 CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
25                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
26                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
27                 
28                 vector<string> myArray;
29                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
30                 return myArray;
31         }
32         catch(exception& e) {
33                 m->errorOut(e, "ShhherCommand", "setParameters");
34                 exit(1);
35         }
36 }
37 //**********************************************************************************************************************
38 string ShhherCommand::getHelpString(){  
39         try {
40                 string helpString = "";
41                 helpString += "The shhh.flows command reads a file containing flowgrams and creates a file of corrected sequences.\n";
42                 return helpString;
43         }
44         catch(exception& e) {
45                 m->errorOut(e, "ShhherCommand", "getHelpString");
46                 exit(1);
47         }
48 }
49 //**********************************************************************************************************************
50
51 ShhherCommand::ShhherCommand(){ 
52         try {
53                 abort = true; calledHelp = true;
54                 setParameters();
55                 
56                 //initialize outputTypes
57 //              vector<string> tempOutNames;
58 //              outputTypes["pn.dist"] = tempOutNames;
59
60         }
61         catch(exception& e) {
62                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
63                 exit(1);
64         }
65 }
66
67 //**********************************************************************************************************************
68
69 ShhherCommand::ShhherCommand(string option) {
70         try {
71         
72 #ifdef USE_MPI
73                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
74                 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
75         
76                 if(pid == 0){
77 #endif
78                 abort = false; calledHelp = false;   
79                 
80                 //allow user to run help
81                 if(option == "help") { help(); abort = true; calledHelp = true; }
82                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
83                 
84                 else {
85                         vector<string> myArray = setParameters();
86                         
87                         OptionParser parser(option);
88                         map<string,string> parameters = parser.getParameters();
89                         
90                         ValidParameters validParameter;
91                         map<string,string>::iterator it;
92                         
93                         //check to make sure all parameters are valid for command
94                         for (it = parameters.begin(); it != parameters.end(); it++) { 
95                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
96                         }
97                         
98                         //initialize outputTypes
99                         vector<string> tempOutNames;
100 //                      outputTypes["pn.dist"] = tempOutNames;
101                         //                      outputTypes["fasta"] = tempOutNames;
102                         
103                         //if the user changes the input directory command factory will send this info to us in the output parameter 
104                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
105                         if (inputDir == "not found"){   inputDir = "";          }
106                         else {
107                                 string path;
108                                 it = parameters.find("flow");
109                                 //user has given a template file
110                                 if(it != parameters.end()){ 
111                                         path = m->hasPath(it->second);
112                                         //if the user has not given a path then, add inputdir. else leave path alone.
113                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
114                                 }
115                                 
116                                 it = parameters.find("lookup");
117                                 //user has given a template file
118                                 if(it != parameters.end()){ 
119                                         path = m->hasPath(it->second);
120                                         //if the user has not given a path then, add inputdir. else leave path alone.
121                                         if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
122                                 }
123
124                                 it = parameters.find("file");
125                                 //user has given a template file
126                                 if(it != parameters.end()){ 
127                                         path = m->hasPath(it->second);
128                                         //if the user has not given a path then, add inputdir. else leave path alone.
129                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
130                                 }
131                         }
132             
133             //if the user changes the output directory command factory will send this info to us in the output parameter 
134                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
135             
136                         //check for required parameters
137                         flowFileName = validParameter.validFile(parameters, "flow", true);
138                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
139                         if (flowFileName == "not found" && flowFilesFileName == "not found") {
140                                 m->mothurOut("values for either flow or file must be provided for the shhh.flows command.");
141                                 m->mothurOutEndLine();
142                                 abort = true; 
143                         }
144                         else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
145                         
146                         if(flowFileName != "not found"){
147                                 compositeFASTAFileName = "";    
148                                 compositeNamesFileName = "";    
149                         }
150                         else{
151                                 ofstream temp;
152                 
153                 string thisoutputDir = outputDir;
154                 if (outputDir == "") {  thisoutputDir =  m->hasPath(flowFilesFileName); } //if user entered a file with a path then preserve it
155                 
156                                 //we want to rip off .files, and also .flow if its there
157                 string fileroot = m->getRootName(m->getSimpleName(flowFilesFileName));
158                 if (fileroot[fileroot.length()-1] == '.') {  fileroot = fileroot.substr(0, fileroot.length()-1); } //rip off dot
159                 string extension = m->getExtension(fileroot);
160                 if (extension == ".flow") { fileroot = m->getRootName(fileroot);  }
161                 else { fileroot += "."; } //add back if needed
162                 
163                                 compositeFASTAFileName = thisoutputDir + fileroot + "shhh.fasta";
164                                 m->openOutputFile(compositeFASTAFileName, temp);
165                                 temp.close();
166                                 
167                                 compositeNamesFileName = thisoutputDir + fileroot + "shhh.names";
168                                 m->openOutputFile(compositeNamesFileName, temp);
169                                 temp.close();
170                         }
171             
172             if(flowFilesFileName != "not found"){
173                 string fName;
174                 
175                 ifstream flowFilesFile;
176                 m->openInputFile(flowFilesFileName, flowFilesFile);
177                 while(flowFilesFile){
178                     fName = m->getline(flowFilesFile);
179                     
180                     //test if file is valid
181                     ifstream in;
182                     int ableToOpen = m->openInputFile(fName, in, "noerror");
183                     in.close(); 
184                     if (ableToOpen == 1) {
185                         if (inputDir != "") { //default path is set
186                             string tryPath = inputDir + fName;
187                             m->mothurOut("Unable to open " + fName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
188                             ifstream in2;
189                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
190                             in2.close();
191                             fName = tryPath;
192                         }
193                     }
194                     
195                     if (ableToOpen == 1) {
196                         if (m->getDefaultPath() != "") { //default path is set
197                             string tryPath = m->getDefaultPath() + m->getSimpleName(fName);
198                             m->mothurOut("Unable to open " + fName + ". Trying default " + tryPath); m->mothurOutEndLine();
199                             ifstream in2;
200                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
201                             in2.close();
202                             fName = tryPath;
203                         }
204                     }
205                     
206                     //if you can't open it its not in current working directory or inputDir, try mothur excutable location
207                     if (ableToOpen == 1) {
208                         string exepath = m->argv;
209                         string tempPath = exepath;
210                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
211                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
212                         
213                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(fName);
214                         m->mothurOut("Unable to open " + fName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
215                         ifstream in2;
216                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
217                         in2.close();
218                         fName = tryPath;
219                     }
220                     
221                     if (ableToOpen == 1) {  m->mothurOut("Unable to open " + fName + ". Disregarding. "); m->mothurOutEndLine();  }
222                     else { flowFileVector.push_back(fName); }
223                     m->gobble(flowFilesFile);
224                 }
225                 flowFilesFile.close();
226                 if (flowFileVector.size() == 0) {  m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
227             }
228             else{
229                 if (outputDir == "") { outputDir = m->hasPath(flowFileName); }
230                 flowFileVector.push_back(flowFileName);
231             }
232                 
233                         //check for optional parameter and set defaults
234                         // ...at some point should added some additional type checking...
235                         string temp;
236                         temp = validParameter.validFile(parameters, "lookup", true);
237                         if (temp == "not found")        {       
238                                 lookupFileName = "LookUp_Titanium.pat"; 
239                                 
240                                 int ableToOpen;
241                                 ifstream in;
242                                 ableToOpen = m->openInputFile(lookupFileName, in, "noerror");
243                                 in.close();     
244                                 
245                                 //if you can't open it, try input location
246                                 if (ableToOpen == 1) {
247                                         if (inputDir != "") { //default path is set
248                                                 string tryPath = inputDir + lookupFileName;
249                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
250                                                 ifstream in2;
251                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
252                                                 in2.close();
253                                                 lookupFileName = tryPath;
254                                         }
255                                 }
256                                 
257                                 //if you can't open it, try default location
258                                 if (ableToOpen == 1) {
259                                         if (m->getDefaultPath() != "") { //default path is set
260                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName);
261                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine();
262                                                 ifstream in2;
263                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
264                                                 in2.close();
265                                                 lookupFileName = tryPath;
266                                         }
267                                 }
268                                 
269                                 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
270                                 if (ableToOpen == 1) {
271                                         string exepath = m->argv;
272                                         string tempPath = exepath;
273                                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
274                                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
275                                         
276                                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
277                                         m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
278                                         ifstream in2;
279                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
280                                         in2.close();
281                                         lookupFileName = tryPath;
282                                 }
283                                 
284                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
285                         }
286                         else if(temp == "not open")     {       
287                                 
288                                 lookupFileName = validParameter.validFile(parameters, "lookup", false);
289                                 
290                                 //if you can't open it its not inputDir, try mothur excutable location
291                                 string exepath = m->argv;
292                                 string tempPath = exepath;
293                                 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
294                                 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
295                                         
296                                 string tryPath = m->getFullPathName(exepath) + lookupFileName;
297                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
298                                 ifstream in2;
299                                 int ableToOpen = m->openInputFile(tryPath, in2, "noerror");
300                                 in2.close();
301                                 lookupFileName = tryPath;
302                                 
303                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
304                         }else                                           {       lookupFileName = temp;  }
305                         
306                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
307                         m->setProcessors(temp);
308                         m->mothurConvert(temp, processors);
309
310                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
311                         m->mothurConvert(temp, cutoff); 
312                         
313                         temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
314                         m->mothurConvert(temp, minDelta); 
315
316                         temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
317                         m->mothurConvert(temp, maxIters); 
318             
319             temp = validParameter.validFile(parameters, "large", false);        if (temp == "not found"){       temp = "0";             }
320                         m->mothurConvert(temp, largeSize); 
321             if (largeSize != 0) { large = true; }
322             else { large = false;  }
323             if (largeSize < 0) {  m->mothurOut("The value of the large cannot be negative.\n"); }
324             
325 #ifdef USE_MPI
326             if (large) { m->mothurOut("The large parameter is not available with the MPI-Enabled version.\n"); large=false; }                           
327 #endif
328             
329
330                         temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
331                         m->mothurConvert(temp, sigma); 
332                         
333                         flowOrder = validParameter.validFile(parameters, "order", false);
334                         if (flowOrder == "not found"){ flowOrder = "TACG";              }
335                         else if(flowOrder.length() != 4){
336                                 m->mothurOut("The value of the order option must be four bases long\n");
337                         }
338                         
339                 }
340 #ifdef USE_MPI
341                 }                               
342 #endif
343         }
344         catch(exception& e) {
345                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
346                 exit(1);
347         }
348 }
349 //**********************************************************************************************************************
350 #ifdef USE_MPI
351 int ShhherCommand::execute(){
352         try {
353                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
354                 
355                 int tag = 1976;
356                 MPI_Status status; 
357                         
358                 if(pid == 0){
359
360                         for(int i=1;i<ncpus;i++){
361                                 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
362                         }
363                         if(abort == 1){ return 0;       }
364
365                         processors = ncpus;
366                         
367                         m->mothurOut("\nGetting preliminary data...\n");
368                         getSingleLookUp();      if (m->control_pressed) { return 0; }
369                         getJointLookUp();       if (m->control_pressed) { return 0; }
370                         
371             vector<string> flowFileVector;
372                         if(flowFilesFileName != "not found"){
373                                 string fName;
374                 
375                                 ifstream flowFilesFile;
376                                 m->openInputFile(flowFilesFileName, flowFilesFile);
377                                 while(flowFilesFile){
378                                         fName = m->getline(flowFilesFile);
379                                         flowFileVector.push_back(fName);
380                                         m->gobble(flowFilesFile);
381                                 }
382                         }
383                         else{
384                                 flowFileVector.push_back(flowFileName);
385                         }
386             
387                         int numFiles = flowFileVector.size();
388
389                         for(int i=1;i<ncpus;i++){
390                                 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
391                         }
392                         
393                         for(int i=0;i<numFiles;i++){
394                                 
395                                 if (m->control_pressed) { break; }
396                                 
397                                 double begClock = clock();
398                                 unsigned long long begTime = time(NULL);
399
400                                 flowFileName = flowFileVector[i];
401                                 
402                                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
403                                 m->mothurOut("Reading flowgrams...\n");
404                                 getFlowData();
405                                 
406                                 if (m->control_pressed) { break; }
407
408                                 m->mothurOut("Identifying unique flowgrams...\n");
409                                 getUniques();
410                                 
411                                 if (m->control_pressed) { break; }
412
413                                 m->mothurOut("Calculating distances between flowgrams...\n");
414                                 char fileName[1024];
415                                 strcpy(fileName, flowFileName.c_str());
416
417                                 for(int i=1;i<ncpus;i++){
418                                         MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
419
420                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
421                                         MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
422                                         MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
423                                         MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
424                                         MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
425                                         MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
426                                         MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
427                                         MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
428                                         MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
429                                         MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
430                                 }                               
431                                                         
432                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
433                                 
434                                 if (m->control_pressed) { break; }
435                                 
436                                 int done;
437                                 for(int i=1;i<ncpus;i++){
438                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
439                                         
440                                         m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
441                                         m->mothurRemove((distFileName + ".temp." + toString(i)));
442                                 }
443
444                                 string namesFileName = createNamesFile();
445                                 
446                                 if (m->control_pressed) { break; }
447                                 
448                                 m->mothurOut("\nClustering flowgrams...\n");
449                                 string listFileName = cluster(distFileName, namesFileName);
450                                 
451                                 if (m->control_pressed) { break; }
452                                 
453                 
454                 
455                                 getOTUData(listFileName);
456
457                                 m->mothurRemove(distFileName);
458                                 m->mothurRemove(namesFileName);
459                                 m->mothurRemove(listFileName);
460                                 
461                                 if (m->control_pressed) { break; }
462                                 
463                                 initPyroCluster();
464
465                                 if (m->control_pressed) { break; }
466                                 
467             
468                                 for(int i=1;i<ncpus;i++){
469                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
470                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
471                                         MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
472                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
473                                 }
474                                 
475                                 if (m->control_pressed) { break; }
476                                 
477                                 double maxDelta = 0;
478                                 int iter = 0;
479                                 
480                                 int numOTUsOnCPU = numOTUs / ncpus;
481                                 int numSeqsOnCPU = numSeqs / ncpus;
482                                 m->mothurOut("\nDenoising flowgrams...\n");
483                                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
484                                 
485                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
486                                         
487                                         double cycClock = clock();
488                                         unsigned long long cycTime = time(NULL);
489                                         fill();
490                                         
491                                         if (m->control_pressed) { break; }
492
493                                         int total = singleTau.size();
494                                         for(int i=1;i<ncpus;i++){
495                                                 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
496                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
497                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
498                                                 
499                                                 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
500                                                 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
501                                                 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
502                                                 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
503                                                 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
504                                         }
505                                         
506                                         calcCentroidsDriver(0, numOTUsOnCPU);
507                                         
508                                         for(int i=1;i<ncpus;i++){
509                                                 int otuStart = i * numOTUs / ncpus;
510                                                 int otuStop = (i + 1) * numOTUs / ncpus;
511                                                 
512                                                 vector<int> tempCentroids(numOTUs, 0);
513                                                 vector<short> tempChange(numOTUs, 0);
514                                                 
515                                                 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
516                                                 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
517                                                 
518                                                 for(int j=otuStart;j<otuStop;j++){
519                                                         centroids[j] = tempCentroids[j];
520                                                         change[j] = tempChange[j];
521                                                 }
522                                         }
523                                                                         
524                                         maxDelta = getNewWeights(); if (m->control_pressed) { break; }
525                                         double nLL = getLikelihood(); if (m->control_pressed) { break; }
526                                         checkCentroids(); if (m->control_pressed) { break; }
527                                         
528                                         for(int i=1;i<ncpus;i++){
529                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
530                                                 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
531                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
532                                         }
533                                         
534                                         calcNewDistancesParent(0, numSeqsOnCPU);
535
536                                         total = singleTau.size();
537
538                                         for(int i=1;i<ncpus;i++){
539                                                 int childTotal;
540                                                 int seqStart = i * numSeqs / ncpus;
541                                                 int seqStop = (i + 1) * numSeqs / ncpus;
542                                                 
543                                                 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
544
545                                                 vector<int> childSeqIndex(childTotal, 0);
546                                                 vector<double> childSingleTau(childTotal, 0);
547                                                 vector<double> childDist(numSeqs * numOTUs, 0);
548                                                 vector<int> otuIndex(childTotal, 0);
549                                                 
550                                                 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
551                                                 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
552                                                 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
553                                                 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
554
555                                                 int oldTotal = total;
556                                                 total += childTotal;
557                                                 singleTau.resize(total, 0);
558                                                 seqIndex.resize(total, 0);
559                                                 seqNumber.resize(total, 0);
560                                                 
561                                                 int childIndex = 0;
562                                                 
563                                                 for(int j=oldTotal;j<total;j++){
564                                                         int otuI = otuIndex[childIndex];
565                                                         int seqI = childSeqIndex[childIndex];
566                                                         
567                                                         singleTau[j] = childSingleTau[childIndex];
568                                                         
569                                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
570                                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
571                                                         nSeqsPerOTU[otuI]++;
572                                                         childIndex++;
573                                                 }
574                                                 
575                                                 int index = seqStart * numOTUs;
576                                                 for(int j=seqStart;j<seqStop;j++){
577                                                         for(int k=0;k<numOTUs;k++){
578                                                                 dist[index] = childDist[index];
579                                                                 index++;
580                                                         }
581                                                 }                                       
582                                         }
583                                         
584                                         iter++;
585                                         
586                                         m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');                  
587
588                                         if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
589                                                 int live = 1;
590                                                 for(int i=1;i<ncpus;i++){
591                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
592                                                 }
593                                         }
594                                         else{
595                                                 int live = 0;
596                                                 for(int i=1;i<ncpus;i++){
597                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
598                                                 }
599                                         }
600                                         
601                                 }       
602                                 
603                                 if (m->control_pressed) { break; }
604                                 
605                                 m->mothurOut("\nFinalizing...\n");
606                                 fill();
607                                 
608                                 if (m->control_pressed) { break; }
609                                 
610                                 setOTUs();
611                                 
612                                 vector<int> otuCounts(numOTUs, 0);
613                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
614                                 calcCentroidsDriver(0, numOTUs);
615                                 
616                                 if (m->control_pressed) { break; }
617                                 
618                                 writeQualities(otuCounts);      if (m->control_pressed) { break; }
619                                 writeSequences(otuCounts);      if (m->control_pressed) { break; }
620                                 writeNames(otuCounts);          if (m->control_pressed) { break; }
621                                 writeClusters(otuCounts);       if (m->control_pressed) { break; }
622                                 writeGroups();                          if (m->control_pressed) { break; }
623                                 
624                                                                  
625                                 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');                 
626                         }
627                 }
628                 else{
629                         int abort = 1;
630
631                         MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
632                         if(abort){      return 0;       }
633
634                         int numFiles;
635                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
636
637                         for(int i=0;i<numFiles;i++){
638                                 
639                                 if (m->control_pressed) { break; }
640                                 
641                                 //Now into the pyrodist part
642                                 bool live = 1;
643
644                                 char fileName[1024];
645                                 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
646                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
647                                 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
648                                 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
649                                 
650                                 flowDataIntI.resize(numSeqs * numFlowCells);
651                                 flowDataPrI.resize(numSeqs * numFlowCells);
652                                 mapUniqueToSeq.resize(numSeqs);
653                                 mapSeqToUnique.resize(numSeqs);
654                                 lengths.resize(numSeqs);
655                                 jointLookUp.resize(NUMBINS * NUMBINS);
656
657                                 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
658                                 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
659                                 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
660                                 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
661                                 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
662                                 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
663                                 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
664
665                                 flowFileName = string(fileName);
666                                 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
667                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
668                                 
669                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
670                                 
671                                 if (m->control_pressed) { break; }
672                                 
673                                 int done = 1;
674                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
675
676                                 //Now into the pyronoise part
677                                 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
678                                 
679                                 singleLookUp.resize(HOMOPS * NUMBINS);
680                                 uniqueFlowgrams.resize(numUniques * numFlowCells);
681                                 weight.resize(numOTUs);
682                                 centroids.resize(numOTUs);
683                                 change.resize(numOTUs);
684                                 dist.assign(numOTUs * numSeqs, 0);
685                                 nSeqsPerOTU.resize(numOTUs);
686                                 cumNumSeqs.resize(numOTUs);
687
688                                 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
689                                 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
690                                 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
691                                 
692                                 int startOTU = pid * numOTUs / ncpus;
693                                 int endOTU = (pid + 1) * numOTUs / ncpus;
694
695                                 int startSeq = pid * numSeqs / ncpus;
696                                 int endSeq = (pid + 1) * numSeqs /ncpus;
697                                 
698                                 int total;
699
700                                 while(live){
701                                         
702                                         if (m->control_pressed) { break; }
703                                         
704                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
705                                         singleTau.assign(total, 0.0000);
706                                         seqNumber.assign(total, 0);
707                                         seqIndex.assign(total, 0);
708
709                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
710                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
711                                         MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
712                                         MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
713                                         MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
714                                         MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
715                                         MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
716
717                                         calcCentroidsDriver(startOTU, endOTU);
718
719                                         MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
720                                         MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
721
722                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
723                                         MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
724                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
725
726                                         vector<int> otuIndex(total, 0);
727                                         calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
728                                         total = otuIndex.size();
729                                         
730                                         MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
731                                         MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
732                                         MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
733                                         MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
734                                         MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
735                                 
736                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
737                                 }
738                         }
739                 }
740                 
741                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
742                 
743                 MPI_Barrier(MPI_COMM_WORLD);
744
745                 
746                 if(compositeFASTAFileName != ""){
747                         outputNames.push_back(compositeFASTAFileName);
748                         outputNames.push_back(compositeNamesFileName);
749                 }
750
751                 m->mothurOutEndLine();
752                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
753                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
754                 m->mothurOutEndLine();
755                 
756                 return 0;
757
758         }
759         catch(exception& e) {
760                 m->errorOut(e, "ShhherCommand", "execute");
761                 exit(1);
762         }
763 }
764 /**************************************************************************************************/
765 string ShhherCommand::createNamesFile(){
766         try{
767                 
768                 vector<string> duplicateNames(numUniques, "");
769                 for(int i=0;i<numSeqs;i++){
770                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
771                 }
772                 
773                 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
774                 
775                 ofstream nameFile;
776                 m->openOutputFile(nameFileName, nameFile);
777                 
778                 for(int i=0;i<numUniques;i++){
779                         
780                         if (m->control_pressed) { break; }
781                         
782             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
783                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
784                 }
785                 
786                 nameFile.close();
787                 return  nameFileName;
788         }
789         catch(exception& e) {
790                 m->errorOut(e, "ShhherCommand", "createNamesFile");
791                 exit(1);
792         }
793 }
794 /**************************************************************************************************/
795
796 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
797         try{            
798                 ostringstream outStream;
799                 outStream.setf(ios::fixed, ios::floatfield);
800                 outStream.setf(ios::dec, ios::basefield);
801                 outStream.setf(ios::showpoint);
802                 outStream.precision(6);
803                 
804                 int begTime = time(NULL);
805                 double begClock = clock();
806                 
807                 for(int i=startSeq;i<stopSeq;i++){
808                         
809                         if (m->control_pressed) { break; }
810                         
811                         for(int j=0;j<i;j++){
812                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
813                                 
814                                 if(flowDistance < 1e-6){
815                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
816                                 }
817                                 else if(flowDistance <= cutoff){
818                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
819                                 }
820                         }
821                         if(i % 100 == 0){
822                                 m->mothurOut(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
823                         }
824                 }
825                 
826                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
827                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
828                 
829                 if (m->control_pressed) { return fDistFileName; }
830                 
831                 m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
832
833                 ofstream distFile(fDistFileName.c_str());
834                 distFile << outStream.str();            
835                 distFile.close();
836                 
837                 return fDistFileName;
838         }
839         catch(exception& e) {
840                 m->errorOut(e, "ShhherCommand", "flowDistMPI");
841                 exit(1);
842         }
843 }
844 /**************************************************************************************************/
845  
846 void ShhherCommand::getOTUData(string listFileName){
847     try {
848         
849         ifstream listFile;
850         m->openInputFile(listFileName, listFile);
851         string label;
852         
853         listFile >> label >> numOTUs;
854         
855         otuData.assign(numSeqs, 0);
856         cumNumSeqs.assign(numOTUs, 0);
857         nSeqsPerOTU.assign(numOTUs, 0);
858         aaP.clear();aaP.resize(numOTUs);
859         
860         seqNumber.clear();
861         aaI.clear();
862         seqIndex.clear();
863         
864         string singleOTU = "";
865         
866         for(int i=0;i<numOTUs;i++){
867             
868             if (m->control_pressed) { break; }
869             
870             listFile >> singleOTU;
871             
872             istringstream otuString(singleOTU);
873             
874             while(otuString){
875                 
876                 string seqName = "";
877                 
878                 for(int j=0;j<singleOTU.length();j++){
879                     char letter = otuString.get();
880                     
881                     if(letter != ','){
882                         seqName += letter;
883                     }
884                     else{
885                         map<string,int>::iterator nmIt = nameMap.find(seqName);
886                         int index = nmIt->second;
887                         
888                         nameMap.erase(nmIt);
889                         
890                         otuData[index] = i;
891                         nSeqsPerOTU[i]++;
892                         aaP[i].push_back(index);
893                         seqName = "";
894                     }
895                 }
896                 
897                 map<string,int>::iterator nmIt = nameMap.find(seqName);
898                 
899                 int index = nmIt->second;
900                 nameMap.erase(nmIt);
901                 
902                 otuData[index] = i;
903                 nSeqsPerOTU[i]++;
904                 aaP[i].push_back(index);        
905                 
906                 otuString.get();
907             }
908             
909             sort(aaP[i].begin(), aaP[i].end());
910             for(int j=0;j<nSeqsPerOTU[i];j++){
911                 seqNumber.push_back(aaP[i][j]);
912             }
913             for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
914                 aaP[i].push_back(0);
915             }
916             
917             
918         }
919         
920         for(int i=1;i<numOTUs;i++){
921             cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
922         }
923         aaI = aaP;
924         seqIndex = seqNumber;
925         
926         listFile.close();       
927         
928     }
929     catch(exception& e) {
930         m->errorOut(e, "ShhherCommand", "getOTUData");
931         exit(1);        
932     }           
933 }
934
935 /**************************************************************************************************/
936
937 void ShhherCommand::initPyroCluster(){                          
938     try{
939         if (numOTUs < processors) { processors = 1; }
940         
941         dist.assign(numSeqs * numOTUs, 0);
942         change.assign(numOTUs, 1);
943         centroids.assign(numOTUs, -1);
944         weight.assign(numOTUs, 0);
945         singleTau.assign(numSeqs, 1.0);
946         
947         nSeqsBreaks.assign(processors+1, 0);
948         nOTUsBreaks.assign(processors+1, 0);
949         
950         nSeqsBreaks[0] = 0;
951         for(int i=0;i<processors;i++){
952             nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
953             nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
954         }
955         nSeqsBreaks[processors] = numSeqs;
956         nOTUsBreaks[processors] = numOTUs;
957     }
958     catch(exception& e) {
959         m->errorOut(e, "ShhherCommand", "initPyroCluster");
960         exit(1);        
961     }
962 }
963
964 /**************************************************************************************************/
965
966 void ShhherCommand::fill(){
967     try {
968         int index = 0;
969         for(int i=0;i<numOTUs;i++){
970             
971             if (m->control_pressed) { break; }
972             
973             cumNumSeqs[i] = index;
974             for(int j=0;j<nSeqsPerOTU[i];j++){
975                 seqNumber[index] = aaP[i][j];
976                 seqIndex[index] = aaI[i][j];
977                 
978                 index++;
979             }
980         }
981     }
982     catch(exception& e) {
983         m->errorOut(e, "ShhherCommand", "fill");
984         exit(1);        
985     }           
986 }
987
988 /**************************************************************************************************/
989  
990 void ShhherCommand::getFlowData(){
991     try{
992         ifstream flowFile;
993         m->openInputFile(flowFileName, flowFile);
994         
995         string seqName;
996         seqNameVector.clear();
997         lengths.clear();
998         flowDataIntI.clear();
999         nameMap.clear();
1000         
1001         
1002         int currentNumFlowCells;
1003         
1004         float intensity;
1005         
1006         flowFile >> numFlowCells;
1007         int index = 0;//pcluster
1008         while(!flowFile.eof()){
1009             
1010             if (m->control_pressed) { break; }
1011             
1012             flowFile >> seqName >> currentNumFlowCells;
1013             lengths.push_back(currentNumFlowCells);
1014             
1015             seqNameVector.push_back(seqName);
1016             nameMap[seqName] = index++;//pcluster
1017             
1018             for(int i=0;i<numFlowCells;i++){
1019                 flowFile >> intensity;
1020                 if(intensity > 9.99)    {       intensity = 9.99;       }
1021                 int intI = int(100 * intensity + 0.0001);
1022                 flowDataIntI.push_back(intI);
1023             }
1024             m->gobble(flowFile);
1025         }
1026         flowFile.close();
1027         
1028         numSeqs = seqNameVector.size();         
1029         
1030         for(int i=0;i<numSeqs;i++){
1031             
1032             if (m->control_pressed) { break; }
1033             
1034             int iNumFlowCells = i * numFlowCells;
1035             for(int j=lengths[i];j<numFlowCells;j++){
1036                 flowDataIntI[iNumFlowCells + j] = 0;
1037             }
1038         }
1039         
1040     }
1041     catch(exception& e) {
1042         m->errorOut(e, "ShhherCommand", "getFlowData");
1043         exit(1);
1044     }
1045 }
1046 /**************************************************************************************************/
1047 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1048         
1049         try{
1050                 vector<double> newTau(numOTUs,0);
1051                 vector<double> norms(numSeqs, 0);
1052                 otuIndex.clear();
1053                 seqIndex.clear();
1054                 singleTau.clear();
1055                 
1056                 for(int i=startSeq;i<stopSeq;i++){
1057                         
1058                         if (m->control_pressed) { break; }
1059                         
1060                         double offset = 1e8;
1061                         int indexOffset = i * numOTUs;
1062                         
1063                         for(int j=0;j<numOTUs;j++){
1064                                 
1065                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1066                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1067                                 }
1068                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1069                                         offset = dist[indexOffset + j];
1070                                 }
1071                         }
1072                         
1073                         for(int j=0;j<numOTUs;j++){
1074                                 if(weight[j] > MIN_WEIGHT){
1075                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1076                                         norms[i] += newTau[j];
1077                                 }
1078                                 else{
1079                                         newTau[j] = 0.0;
1080                                 }
1081                         }
1082                         
1083                         for(int j=0;j<numOTUs;j++){
1084                 
1085                                 newTau[j] /= norms[i];
1086                                 
1087                                 if(newTau[j] > MIN_TAU){
1088                                         otuIndex.push_back(j);
1089                                         seqIndex.push_back(i);
1090                                         singleTau.push_back(newTau[j]);
1091                                 }
1092                         }
1093                         
1094                 }
1095         }
1096         catch(exception& e) {
1097                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1098                 exit(1);        
1099         }               
1100 }
1101
1102 /**************************************************************************************************/
1103
1104 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1105     
1106     try{
1107         
1108         int total = 0;
1109         vector<double> newTau(numOTUs,0);
1110         vector<double> norms(numSeqs, 0);
1111         nSeqsPerOTU.assign(numOTUs, 0);
1112         
1113         for(int i=startSeq;i<stopSeq;i++){
1114             
1115             if (m->control_pressed) { break; }
1116             
1117             int indexOffset = i * numOTUs;
1118             
1119             double offset = 1e8;
1120             
1121             for(int j=0;j<numOTUs;j++){
1122                 
1123                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1124                     dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1125                 }
1126                 
1127                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1128                     offset = dist[indexOffset + j];
1129                 }
1130             }
1131             
1132             for(int j=0;j<numOTUs;j++){
1133                 if(weight[j] > MIN_WEIGHT){
1134                     newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1135                     norms[i] += newTau[j];
1136                 }
1137                 else{
1138                     newTau[j] = 0.0;
1139                 }
1140             }
1141             
1142             for(int j=0;j<numOTUs;j++){
1143                 newTau[j] /= norms[i];
1144             }
1145             
1146             for(int j=0;j<numOTUs;j++){
1147                 if(newTau[j] > MIN_TAU){
1148                     
1149                     int oldTotal = total;
1150                     
1151                     total++;
1152                     
1153                     singleTau.resize(total, 0);
1154                     seqNumber.resize(total, 0);
1155                     seqIndex.resize(total, 0);
1156                     
1157                     singleTau[oldTotal] = newTau[j];
1158                     
1159                     aaP[j][nSeqsPerOTU[j]] = oldTotal;
1160                     aaI[j][nSeqsPerOTU[j]] = i;
1161                     nSeqsPerOTU[j]++;
1162                 }
1163             }
1164             
1165         }
1166         
1167     }
1168     catch(exception& e) {
1169         m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1170         exit(1);        
1171     }           
1172 }
1173
1174 /**************************************************************************************************/
1175
1176 void ShhherCommand::setOTUs(){
1177     
1178     try {
1179         vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1180         
1181         for(int i=0;i<numOTUs;i++){
1182             
1183             if (m->control_pressed) { break; }
1184             
1185             for(int j=0;j<nSeqsPerOTU[i];j++){
1186                 int index = cumNumSeqs[i] + j;
1187                 double tauValue = singleTau[seqNumber[index]];
1188                 int sIndex = seqIndex[index];
1189                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
1190             }
1191         }
1192         
1193         for(int i=0;i<numSeqs;i++){
1194             double maxTau = -1.0000;
1195             int maxOTU = -1;
1196             for(int j=0;j<numOTUs;j++){
1197                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1198                     maxTau = bigTauMatrix[i * numOTUs + j];
1199                     maxOTU = j;
1200                 }
1201             }
1202             
1203             otuData[i] = maxOTU;
1204         }
1205         
1206         nSeqsPerOTU.assign(numOTUs, 0);         
1207         
1208         for(int i=0;i<numSeqs;i++){
1209             int index = otuData[i];
1210             
1211             singleTau[i] = 1.0000;
1212             dist[i] = 0.0000;
1213             
1214             aaP[index][nSeqsPerOTU[index]] = i;
1215             aaI[index][nSeqsPerOTU[index]] = i;
1216             
1217             nSeqsPerOTU[index]++;
1218         }
1219         fill(); 
1220     }
1221     catch(exception& e) {
1222         m->errorOut(e, "ShhherCommand", "setOTUs");
1223         exit(1);        
1224     }           
1225 }
1226
1227 /**************************************************************************************************/
1228  
1229 void ShhherCommand::getUniques(){
1230     try{
1231         
1232         
1233         numUniques = 0;
1234         uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
1235         uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
1236         uniqueLengths.assign(numSeqs, 0);
1237         mapSeqToUnique.assign(numSeqs, -1);
1238         mapUniqueToSeq.assign(numSeqs, -1);
1239         
1240         vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
1241         
1242         for(int i=0;i<numSeqs;i++){
1243             
1244             if (m->control_pressed) { break; }
1245             
1246             int index = 0;
1247             
1248             vector<short> current(numFlowCells);
1249             for(int j=0;j<numFlowCells;j++){
1250                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
1251             }
1252             
1253             for(int j=0;j<numUniques;j++){
1254                 int offset = j * numFlowCells;
1255                 bool toEnd = 1;
1256                 
1257                 int shorterLength;
1258                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
1259                 else                                                            {       shorterLength = uniqueLengths[j];       }
1260                 
1261                 for(int k=0;k<shorterLength;k++){
1262                     if(current[k] != uniqueFlowgrams[offset + k]){
1263                         toEnd = 0;
1264                         break;
1265                     }
1266                 }
1267                 
1268                 if(toEnd){
1269                     mapSeqToUnique[i] = j;
1270                     uniqueCount[j]++;
1271                     index = j;
1272                     if(lengths[i] > uniqueLengths[j])   {       uniqueLengths[j] = lengths[i];  }
1273                     break;
1274                 }
1275                 index++;
1276             }
1277             
1278             if(index == numUniques){
1279                 uniqueLengths[numUniques] = lengths[i];
1280                 uniqueCount[numUniques] = 1;
1281                 mapSeqToUnique[i] = numUniques;//anMap
1282                 mapUniqueToSeq[numUniques] = i;//anF
1283                 
1284                 for(int k=0;k<numFlowCells;k++){
1285                     uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
1286                     uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
1287                 }
1288                 
1289                 numUniques++;
1290             }
1291         }
1292         uniqueFlowDataIntI.resize(numFlowCells * numUniques);
1293         uniqueLengths.resize(numUniques);       
1294         
1295         flowDataPrI.resize(numSeqs * numFlowCells, 0);
1296         for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
1297     }
1298     catch(exception& e) {
1299         m->errorOut(e, "ShhherCommand", "getUniques");
1300         exit(1);
1301     }
1302 }
1303
1304 /**************************************************************************************************/
1305
1306 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
1307     try{
1308         int minLength = lengths[mapSeqToUnique[seqA]];
1309         if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
1310         
1311         int ANumFlowCells = seqA * numFlowCells;
1312         int BNumFlowCells = seqB * numFlowCells;
1313         
1314         float dist = 0;
1315         
1316         for(int i=0;i<minLength;i++){
1317             
1318             if (m->control_pressed) { break; }
1319             
1320             int flowAIntI = flowDataIntI[ANumFlowCells + i];
1321             float flowAPrI = flowDataPrI[ANumFlowCells + i];
1322             
1323             int flowBIntI = flowDataIntI[BNumFlowCells + i];
1324             float flowBPrI = flowDataPrI[BNumFlowCells + i];
1325             dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
1326         }
1327         
1328         dist /= (float) minLength;
1329         return dist;
1330     }
1331     catch(exception& e) {
1332         m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
1333         exit(1);
1334     }
1335 }
1336
1337 //**********************************************************************************************************************/
1338
1339 string ShhherCommand::cluster(string distFileName, string namesFileName){
1340     try {
1341         
1342         ReadMatrix* read = new ReadColumnMatrix(distFileName);  
1343         read->setCutoff(cutoff);
1344         
1345         NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1346         clusterNameMap->readMap();
1347         read->read(clusterNameMap);
1348         
1349         ListVector* list = read->getListVector();
1350         SparseMatrix* matrix = read->getMatrix();
1351         
1352         delete read; 
1353         delete clusterNameMap; 
1354         
1355         RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1356         
1357         Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
1358         string tag = cluster->getTag();
1359         
1360         double clusterCutoff = cutoff;
1361         while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1362             
1363             if (m->control_pressed) { break; }
1364             
1365             cluster->update(clusterCutoff);
1366         }
1367         
1368         list->setLabel(toString(cutoff));
1369         
1370         string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
1371         ofstream listFile;
1372         m->openOutputFile(listFileName, listFile);
1373         list->print(listFile);
1374         listFile.close();
1375         
1376         delete matrix;  delete cluster; delete rabund; delete list;
1377         
1378         return listFileName;
1379     }
1380     catch(exception& e) {
1381         m->errorOut(e, "ShhherCommand", "cluster");
1382         exit(1);        
1383     }           
1384 }
1385
1386 /**************************************************************************************************/
1387
1388 void ShhherCommand::calcCentroidsDriver(int start, int finish){                          
1389     
1390     //this function gets the most likely homopolymer length at a flow position for a group of sequences
1391     //within an otu
1392     
1393     try{
1394         
1395         for(int i=start;i<finish;i++){
1396             
1397             if (m->control_pressed) { break; }
1398             
1399             double count = 0;
1400             int position = 0;
1401             int minFlowGram = 100000000;
1402             double minFlowValue = 1e8;
1403             change[i] = 0; //FALSE
1404             
1405             for(int j=0;j<nSeqsPerOTU[i];j++){
1406                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1407             }
1408             
1409             if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1410                 vector<double> adF(nSeqsPerOTU[i]);
1411                 vector<int> anL(nSeqsPerOTU[i]);
1412                 
1413                 for(int j=0;j<nSeqsPerOTU[i];j++){
1414                     int index = cumNumSeqs[i] + j;
1415                     int nI = seqIndex[index];
1416                     int nIU = mapSeqToUnique[nI];
1417                     
1418                     int k;
1419                     for(k=0;k<position;k++){
1420                         if(nIU == anL[k]){
1421                             break;
1422                         }
1423                     }
1424                     if(k == position){
1425                         anL[position] = nIU;
1426                         adF[position] = 0.0000;
1427                         position++;
1428                     }                                           
1429                 }
1430                 
1431                 for(int j=0;j<nSeqsPerOTU[i];j++){
1432                     int index = cumNumSeqs[i] + j;
1433                     int nI = seqIndex[index];
1434                     
1435                     double tauValue = singleTau[seqNumber[index]];
1436                     
1437                     for(int k=0;k<position;k++){
1438                         double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1439                         adF[k] += dist * tauValue;
1440                     }
1441                 }
1442                 
1443                 for(int j=0;j<position;j++){
1444                     if(adF[j] < minFlowValue){
1445                         minFlowGram = j;
1446                         minFlowValue = adF[j];
1447                     }
1448                 }
1449                 
1450                 if(centroids[i] != anL[minFlowGram]){
1451                     change[i] = 1;
1452                     centroids[i] = anL[minFlowGram];
1453                 }
1454             }
1455             else if(centroids[i] != -1){
1456                 change[i] = 1;
1457                 centroids[i] = -1;                      
1458             }
1459         }
1460     }
1461     catch(exception& e) {
1462         m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1463         exit(1);        
1464     }           
1465 }
1466
1467 /**************************************************************************************************/
1468
1469 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1470     try{
1471         
1472         int flowAValue = cent * numFlowCells;
1473         int flowBValue = flow * numFlowCells;
1474         
1475         double dist = 0;
1476         
1477         for(int i=0;i<length;i++){
1478             dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1479             flowAValue++;
1480             flowBValue++;
1481         }
1482         
1483         return dist / (double)length;
1484     }
1485     catch(exception& e) {
1486         m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1487         exit(1);        
1488     }           
1489 }
1490
1491 /**************************************************************************************************/
1492
1493 double ShhherCommand::getNewWeights(){
1494     try{
1495         
1496         double maxChange = 0;
1497         
1498         for(int i=0;i<numOTUs;i++){
1499             
1500             if (m->control_pressed) { break; }
1501             
1502             double difference = weight[i];
1503             weight[i] = 0;
1504             
1505             for(int j=0;j<nSeqsPerOTU[i];j++){
1506                 int index = cumNumSeqs[i] + j;
1507                 double tauValue = singleTau[seqNumber[index]];
1508                 weight[i] += tauValue;
1509             }
1510             
1511             difference = fabs(weight[i] - difference);
1512             if(difference > maxChange){ maxChange = difference; }
1513         }
1514         return maxChange;
1515     }
1516     catch(exception& e) {
1517         m->errorOut(e, "ShhherCommand", "getNewWeights");
1518         exit(1);        
1519     }           
1520 }
1521  
1522  /**************************************************************************************************/
1523  
1524 double ShhherCommand::getLikelihood(){
1525     
1526     try{
1527         
1528         vector<long double> P(numSeqs, 0);
1529         int effNumOTUs = 0;
1530         
1531         for(int i=0;i<numOTUs;i++){
1532             if(weight[i] > MIN_WEIGHT){
1533                 effNumOTUs++;
1534             }
1535         }
1536         
1537         string hold;
1538         for(int i=0;i<numOTUs;i++){
1539             
1540             if (m->control_pressed) { break; }
1541             
1542             for(int j=0;j<nSeqsPerOTU[i];j++){
1543                 int index = cumNumSeqs[i] + j;
1544                 int nI = seqIndex[index];
1545                 double singleDist = dist[seqNumber[index]];
1546                 
1547                 P[nI] += weight[i] * exp(-singleDist * sigma);
1548             }
1549         }
1550         double nLL = 0.00;
1551         for(int i=0;i<numSeqs;i++){
1552             if(P[i] == 0){      P[i] = DBL_EPSILON;     }
1553             
1554             nLL += -log(P[i]);
1555         }
1556         
1557         nLL = nLL -(double)numSeqs * log(sigma);
1558         
1559         return nLL; 
1560     }
1561     catch(exception& e) {
1562         m->errorOut(e, "ShhherCommand", "getNewWeights");
1563         exit(1);        
1564     }           
1565 }
1566
1567 /**************************************************************************************************/
1568
1569 void ShhherCommand::checkCentroids(){
1570     try{
1571         vector<int> unique(numOTUs, 1);
1572         
1573         for(int i=0;i<numOTUs;i++){
1574             if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1575                 unique[i] = -1;
1576             }
1577         }
1578         
1579         for(int i=0;i<numOTUs;i++){
1580             
1581             if (m->control_pressed) { break; }
1582             
1583             if(unique[i] == 1){
1584                 for(int j=i+1;j<numOTUs;j++){
1585                     if(unique[j] == 1){
1586                         
1587                         if(centroids[j] == centroids[i]){
1588                             unique[j] = 0;
1589                             centroids[j] = -1;
1590                             
1591                             weight[i] += weight[j];
1592                             weight[j] = 0.0;
1593                         }
1594                     }
1595                 }
1596             }
1597         }
1598     }
1599     catch(exception& e) {
1600         m->errorOut(e, "ShhherCommand", "checkCentroids");
1601         exit(1);        
1602     }           
1603 }
1604  /**************************************************************************************************/
1605
1606
1607  
1608 void ShhherCommand::writeQualities(vector<int> otuCounts){
1609     
1610     try {
1611         string thisOutputDir = outputDir;
1612         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1613         string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.qual";
1614         
1615         ofstream qualityFile;
1616         m->openOutputFile(qualityFileName, qualityFile);
1617         
1618         qualityFile.setf(ios::fixed, ios::floatfield);
1619         qualityFile.setf(ios::showpoint);
1620         qualityFile << setprecision(6);
1621         
1622         vector<vector<int> > qualities(numOTUs);
1623         vector<double> pr(HOMOPS, 0);
1624         
1625         
1626         for(int i=0;i<numOTUs;i++){
1627             
1628             if (m->control_pressed) { break; }
1629             
1630             int index = 0;
1631             int base = 0;
1632             
1633             if(nSeqsPerOTU[i] > 0){
1634                 qualities[i].assign(1024, -1);
1635                 
1636                 while(index < numFlowCells){
1637                     double maxPrValue = 1e8;
1638                     short maxPrIndex = -1;
1639                     double count = 0.0000;
1640                     
1641                     pr.assign(HOMOPS, 0);
1642                     
1643                     for(int j=0;j<nSeqsPerOTU[i];j++){
1644                         int lIndex = cumNumSeqs[i] + j;
1645                         double tauValue = singleTau[seqNumber[lIndex]];
1646                         int sequenceIndex = aaI[i][j];
1647                         short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1648                         
1649                         count += tauValue;
1650                         
1651                         for(int s=0;s<HOMOPS;s++){
1652                             pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1653                         }
1654                     }
1655                     
1656                     maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1657                     maxPrValue = pr[maxPrIndex];
1658                     
1659                     if(count > MIN_COUNT){
1660                         double U = 0.0000;
1661                         double norm = 0.0000;
1662                         
1663                         for(int s=0;s<HOMOPS;s++){
1664                             norm += exp(-(pr[s] - maxPrValue));
1665                         }
1666                         
1667                         for(int s=1;s<=maxPrIndex;s++){
1668                             int value = 0;
1669                             double temp = 0.0000;
1670                             
1671                             U += exp(-(pr[s-1]-maxPrValue))/norm;
1672                             
1673                             if(U>0.00){
1674                                 temp = log10(U);
1675                             }
1676                             else{
1677                                 temp = -10.1;
1678                             }
1679                             temp = floor(-10 * temp);
1680                             value = (int)floor(temp);
1681                             if(value > 100){    value = 100;    }
1682                             
1683                             qualities[i][base] = (int)value;
1684                             base++;
1685                         }
1686                     }
1687                     
1688                     index++;
1689                 }
1690             }
1691             
1692             
1693             if(otuCounts[i] > 0){
1694                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1695                 
1696                 int j=4;        //need to get past the first four bases
1697                 while(qualities[i][j] != -1){
1698                     qualityFile << qualities[i][j] << ' ';
1699                     j++;
1700                 }
1701                 qualityFile << endl;
1702             }
1703         }
1704         qualityFile.close();
1705         outputNames.push_back(qualityFileName);
1706         
1707     }
1708     catch(exception& e) {
1709         m->errorOut(e, "ShhherCommand", "writeQualities");
1710         exit(1);        
1711     }           
1712 }
1713
1714 /**************************************************************************************************/
1715
1716 void ShhherCommand::writeSequences(vector<int> otuCounts){
1717     try {
1718         string thisOutputDir = outputDir;
1719         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1720         string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.fasta";
1721         ofstream fastaFile;
1722         m->openOutputFile(fastaFileName, fastaFile);
1723         
1724         vector<string> names(numOTUs, "");
1725         
1726         for(int i=0;i<numOTUs;i++){
1727             
1728             if (m->control_pressed) { break; }
1729             
1730             int index = centroids[i];
1731             
1732             if(otuCounts[i] > 0){
1733                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1734                 
1735                 string newSeq = "";
1736                 
1737                 for(int j=0;j<numFlowCells;j++){
1738                     
1739                     char base = flowOrder[j % 4];
1740                     for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1741                         newSeq += base;
1742                     }
1743                 }
1744                 
1745                 fastaFile << newSeq.substr(4) << endl;
1746             }
1747         }
1748         fastaFile.close();
1749         
1750         outputNames.push_back(fastaFileName);
1751         
1752         if(compositeFASTAFileName != ""){
1753             m->appendFiles(fastaFileName, compositeFASTAFileName);
1754         }
1755     }
1756     catch(exception& e) {
1757         m->errorOut(e, "ShhherCommand", "writeSequences");
1758         exit(1);        
1759     }           
1760 }
1761
1762 /**************************************************************************************************/
1763
1764 void ShhherCommand::writeNames(vector<int> otuCounts){
1765     try {
1766         string thisOutputDir = outputDir;
1767         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1768         string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.names";
1769         ofstream nameFile;
1770         m->openOutputFile(nameFileName, nameFile);
1771         
1772         for(int i=0;i<numOTUs;i++){
1773             
1774             if (m->control_pressed) { break; }
1775             
1776             if(otuCounts[i] > 0){
1777                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1778                 
1779                 for(int j=1;j<nSeqsPerOTU[i];j++){
1780                     nameFile << ',' << seqNameVector[aaI[i][j]];
1781                 }
1782                 
1783                 nameFile << endl;
1784             }
1785         }
1786         nameFile.close();
1787         outputNames.push_back(nameFileName);
1788         
1789         
1790         if(compositeNamesFileName != ""){
1791             m->appendFiles(nameFileName, compositeNamesFileName);
1792         }               
1793     }
1794     catch(exception& e) {
1795         m->errorOut(e, "ShhherCommand", "writeNames");
1796         exit(1);        
1797     }           
1798 }
1799
1800 /**************************************************************************************************/
1801
1802 void ShhherCommand::writeGroups(){
1803     try {
1804         string thisOutputDir = outputDir;
1805         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1806         string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1807         string groupFileName = fileRoot + "shhh.groups";
1808         ofstream groupFile;
1809         m->openOutputFile(groupFileName, groupFile);
1810         
1811         for(int i=0;i<numSeqs;i++){
1812             if (m->control_pressed) { break; }
1813             groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
1814         }
1815         groupFile.close();
1816         outputNames.push_back(groupFileName);
1817         
1818     }
1819     catch(exception& e) {
1820         m->errorOut(e, "ShhherCommand", "writeGroups");
1821         exit(1);        
1822     }           
1823 }
1824
1825 /**************************************************************************************************/
1826
1827 void ShhherCommand::writeClusters(vector<int> otuCounts){
1828     try {
1829         string thisOutputDir = outputDir;
1830         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1831         string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.counts";
1832         ofstream otuCountsFile;
1833         m->openOutputFile(otuCountsFileName, otuCountsFile);
1834         
1835         string bases = flowOrder;
1836         
1837         for(int i=0;i<numOTUs;i++){
1838             
1839             if (m->control_pressed) {
1840                 break;
1841             }
1842             //output the translated version of the centroid sequence for the otu
1843             if(otuCounts[i] > 0){
1844                 int index = centroids[i];
1845                 
1846                 otuCountsFile << "ideal\t";
1847                 for(int j=8;j<numFlowCells;j++){
1848                     char base = bases[j % 4];
1849                     for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1850                         otuCountsFile << base;
1851                     }
1852                 }
1853                 otuCountsFile << endl;
1854                 
1855                 for(int j=0;j<nSeqsPerOTU[i];j++){
1856                     int sequence = aaI[i][j];
1857                     otuCountsFile << seqNameVector[sequence] << '\t';
1858                     
1859                     string newSeq = "";
1860                     
1861                     for(int k=0;k<lengths[sequence];k++){
1862                         char base = bases[k % 4];
1863                         int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1864                         
1865                         for(int s=0;s<freq;s++){
1866                             newSeq += base;
1867                             //otuCountsFile << base;
1868                         }
1869                     }
1870                     otuCountsFile << newSeq.substr(4) << endl;
1871                 }
1872                 otuCountsFile << endl;
1873             }
1874         }
1875         otuCountsFile.close();
1876         outputNames.push_back(otuCountsFileName);
1877         
1878     }
1879     catch(exception& e) {
1880         m->errorOut(e, "ShhherCommand", "writeClusters");
1881         exit(1);        
1882     }           
1883 }
1884
1885 #else
1886 //**********************************************************************************************************************
1887
1888 int ShhherCommand::execute(){
1889         try {
1890                 if (abort == true) { return 0; }
1891                 
1892                 getSingleLookUp();      if (m->control_pressed) { return 0; }
1893                 getJointLookUp();       if (m->control_pressed) { return 0; }
1894                 
1895         int numFiles = flowFileVector.size();
1896                 
1897         if (numFiles < processors) { processors = numFiles; }
1898         
1899 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1900         if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size()); }
1901         else { createProcesses(flowFileVector); } //each processor processes one file
1902 #else
1903         driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size());
1904 #endif
1905         
1906                 if(compositeFASTAFileName != ""){
1907                         outputNames.push_back(compositeFASTAFileName);
1908                         outputNames.push_back(compositeNamesFileName);
1909                 }
1910
1911                 m->mothurOutEndLine();
1912                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
1913                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
1914                 m->mothurOutEndLine();
1915                 
1916                 return 0;
1917         }
1918         catch(exception& e) {
1919                 m->errorOut(e, "ShhherCommand", "execute");
1920                 exit(1);
1921         }
1922 }
1923 #endif
1924 /**************************************************************************************************/
1925
1926 int ShhherCommand::createProcesses(vector<string> filenames){
1927     try {
1928         vector<int> processIDS;
1929                 int process = 1;
1930                 int num = 0;
1931                 
1932                 //sanity check
1933                 if (filenames.size() < processors) { processors = filenames.size(); }
1934                 
1935                 //divide the groups between the processors
1936                 vector<linePair> lines;
1937         vector<int> numFilesToComplete;
1938                 int numFilesPerProcessor = filenames.size() / processors;
1939                 for (int i = 0; i < processors; i++) {
1940                         int startIndex =  i * numFilesPerProcessor;
1941                         int endIndex = (i+1) * numFilesPerProcessor;
1942                         if(i == (processors - 1)){      endIndex = filenames.size();    }
1943                         lines.push_back(linePair(startIndex, endIndex));
1944             numFilesToComplete.push_back((endIndex-startIndex));
1945                 }
1946                 
1947         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)          
1948                 
1949                 //loop through and create all the processes you want
1950                 while (process != processors) {
1951                         int pid = fork();
1952                         
1953                         if (pid > 0) {
1954                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1955                                 process++;
1956                         }else if (pid == 0){
1957                                 num = driver(filenames, compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName  + toString(getpid()) + ".temp", lines[process].start, lines[process].end);
1958                 
1959                 //pass numSeqs to parent
1960                                 ofstream out;
1961                                 string tempFile = compositeFASTAFileName + toString(getpid()) + ".num.temp";
1962                                 m->openOutputFile(tempFile, out);
1963                                 out << num << endl;
1964                                 out.close();
1965                 
1966                                 exit(0);
1967                         }else { 
1968                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
1969                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1970                                 exit(0);
1971                         }
1972                 }
1973                 
1974                 //do my part
1975                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[0].start, lines[0].end);
1976                 
1977                 //force parent to wait until all the processes are done
1978                 for (int i=0;i<processIDS.size();i++) { 
1979                         int temp = processIDS[i];
1980                         wait(&temp);
1981                 }
1982         
1983         #else
1984         
1985         //////////////////////////////////////////////////////////////////////////////////////////////////////
1986         
1987         /////////////////////// NOT WORKING, ACCESS VIOLATION ON READ OF FLOWGRAMS IN THREAD /////////////////
1988         
1989         //////////////////////////////////////////////////////////////////////////////////////////////////////
1990                 //Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct. 
1991                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
1992                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1993                 
1994                 vector<shhhFlowsData*> pDataArray; 
1995                 DWORD   dwThreadIdArray[processors-1];
1996                 HANDLE  hThreadArray[processors-1]; 
1997                 
1998                 //Create processor worker threads.
1999                 for( int i=0; i<processors-1; i++ ){
2000                         // Allocate memory for thread data.
2001                         string extension = "";
2002                         if (i != 0) { extension = toString(i) + ".temp"; }
2003                         
2004             shhhFlowsData* tempFlow = new shhhFlowsData(filenames, (compositeFASTAFileName + extension), (compositeNamesFileName + extension), outputDir, flowOrder, jointLookUp, singleLookUp, m, lines[i].start, lines[i].end, cutoff, sigma, minDelta, maxIters, i);
2005                         pDataArray.push_back(tempFlow);
2006                         processIDS.push_back(i);
2007             
2008                         hThreadArray[i] = CreateThread(NULL, 0, ShhhFlowsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
2009                 }
2010                 
2011                 //using the main process as a worker saves time and memory
2012                 //do my part
2013                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[processors-1].start, lines[processors-1].end);
2014                 
2015                 //Wait until all threads have terminated.
2016                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
2017                 
2018                 //Close all thread handles and free memory allocations.
2019                 for(int i=0; i < pDataArray.size(); i++){
2020                         for(int j=0; j < pDataArray[i]->outputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); }
2021                         CloseHandle(hThreadArray[i]);
2022                         delete pDataArray[i];
2023                 }
2024                 
2025         #endif
2026         
2027         for (int i=0;i<processIDS.size();i++) { 
2028             ifstream in;
2029                         string tempFile =  compositeFASTAFileName + toString(processIDS[i]) + ".num.temp";
2030                         m->openInputFile(tempFile, in);
2031                         if (!in.eof()) { 
2032                 int tempNum = 0; 
2033                 in >> tempNum; 
2034                 if (tempNum != numFilesToComplete[i+1]) {
2035                     m->mothurOut("[ERROR]: main process expected " + toString(processIDS[i]) + " to complete " + toString(numFilesToComplete[i+1]) + " files, and it only reported completing " + toString(tempNum) + ". This will cause file mismatches.  The flow files may be too large to process with multiple processors. \n");
2036                 }
2037             }
2038                         in.close(); m->mothurRemove(tempFile);
2039             
2040             if (compositeFASTAFileName != "") {
2041                 m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
2042                 m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
2043                 m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp"));
2044                 m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp"));
2045             }
2046         }
2047         
2048         return 0;
2049         
2050     }
2051         catch(exception& e) {
2052                 m->errorOut(e, "ShhherCommand", "createProcesses");
2053                 exit(1);
2054         }
2055 }
2056 /**************************************************************************************************/
2057
2058 vector<string> ShhherCommand::parseFlowFiles(string filename){
2059     try {
2060         vector<string> files;
2061         int count = 0;
2062         
2063         ifstream in;
2064         m->openInputFile(filename, in);
2065         
2066         int thisNumFLows = 0;
2067         in >> thisNumFLows; m->gobble(in);
2068         
2069         while (!in.eof()) {
2070             if (m->control_pressed) { break; }
2071             
2072             ofstream out;
2073             string outputFileName = filename + toString(count) + ".temp";
2074             m->openOutputFile(outputFileName, out);
2075             out << thisNumFLows << endl;
2076             files.push_back(outputFileName);
2077             
2078             int numLinesWrote = 0;
2079             for (int i = 0; i < largeSize; i++) {
2080                 if (in.eof()) { break; }
2081                 string line = m->getline(in); m->gobble(in);
2082                 out << line << endl;
2083                 numLinesWrote++;
2084             }
2085             out.close();
2086             
2087             if (numLinesWrote == 0) {  m->mothurRemove(outputFileName); files.pop_back();  }
2088             count++;
2089         }
2090         in.close();
2091         
2092         if (m->control_pressed) { for (int i = 0; i < files.size(); i++) { m->mothurRemove(files[i]); }  files.clear(); }
2093         
2094         m->mothurOut("\nDivided " + filename + " into " + toString(files.size()) + " files.\n\n"); 
2095         
2096         return files;
2097     }
2098         catch(exception& e) {
2099                 m->errorOut(e, "ShhherCommand", "parseFlowFiles");
2100                 exit(1);
2101         }
2102 }
2103 /**************************************************************************************************/
2104
2105 int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName, int start, int end){
2106     try {
2107         
2108         int numCompleted = 0;
2109         
2110         for(int i=start;i<end;i++){
2111                         
2112                         if (m->control_pressed) { break; }
2113                         
2114             vector<string> theseFlowFileNames; theseFlowFileNames.push_back(filenames[i]);
2115             if (large) {  theseFlowFileNames = parseFlowFiles(filenames[i]);  }
2116             
2117             if (m->control_pressed) { break; }
2118             
2119             double begClock = clock();
2120             unsigned long long begTime;
2121             
2122             for (int g = 0; g < theseFlowFileNames.size(); g++) {
2123                 
2124                 string flowFileName = theseFlowFileNames[g];
2125                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
2126                 m->mothurOut("Reading flowgrams...\n");
2127                 
2128                 vector<string> seqNameVector;
2129                 vector<int> lengths;
2130                 vector<short> flowDataIntI;
2131                 vector<double> flowDataPrI;
2132                 map<string, int> nameMap;
2133                 vector<short> uniqueFlowgrams;
2134                 vector<int> uniqueCount;
2135                 vector<int> mapSeqToUnique;
2136                 vector<int> mapUniqueToSeq;
2137                 vector<int> uniqueLengths;
2138                 int numFlowCells;
2139                 
2140                 int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
2141                 
2142                 if (m->control_pressed) { break; }
2143                 
2144                 m->mothurOut("Identifying unique flowgrams...\n");
2145                 int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
2146                 
2147                 if (m->control_pressed) { break; }
2148                 
2149                 m->mothurOut("Calculating distances between flowgrams...\n");
2150                 string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
2151                 begTime = time(NULL);
2152                
2153                 
2154                 flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI); 
2155                 
2156                 m->mothurOutEndLine();
2157                 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
2158                 
2159                 
2160                 string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
2161                 createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
2162                 
2163                 if (m->control_pressed) { break; }
2164                 
2165                 m->mothurOut("\nClustering flowgrams...\n");
2166                 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
2167                 cluster(listFileName, distFileName, namesFileName);
2168                 
2169                 if (m->control_pressed) { break; }
2170                 
2171                 vector<int> otuData;
2172                 vector<int> cumNumSeqs;
2173                 vector<int> nSeqsPerOTU;
2174                 vector<vector<int> > aaP;       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2175                 vector<vector<int> > aaI;       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2176                 vector<int> seqNumber;          //tMaster->anP:         the sequence id number sorted by OTU
2177                 vector<int> seqIndex;           //tMaster->anI;         the index that corresponds to seqNumber
2178                 
2179                 
2180                 int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
2181                 
2182                 if (m->control_pressed) { break; }
2183                 
2184                 m->mothurRemove(distFileName);
2185                 m->mothurRemove(namesFileName);
2186                 m->mothurRemove(listFileName);
2187                 
2188                 vector<double> dist;            //adDist - distance of sequences to centroids
2189                 vector<short> change;           //did the centroid sequence change? 0 = no; 1 = yes
2190                 vector<int> centroids;          //the representative flowgram for each cluster m
2191                 vector<double> weight;
2192                 vector<double> singleTau;       //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2193                 vector<int> nSeqsBreaks;
2194                 vector<int> nOTUsBreaks;
2195                 
2196                 dist.assign(numSeqs * numOTUs, 0);
2197                 change.assign(numOTUs, 1);
2198                 centroids.assign(numOTUs, -1);
2199                 weight.assign(numOTUs, 0);
2200                 singleTau.assign(numSeqs, 1.0);
2201                 
2202                 nSeqsBreaks.assign(2, 0);
2203                 nOTUsBreaks.assign(2, 0);
2204                 
2205                 nSeqsBreaks[0] = 0;
2206                 nSeqsBreaks[1] = numSeqs;
2207                 nOTUsBreaks[1] = numOTUs;
2208                 
2209                 if (m->control_pressed) { break; }
2210                 
2211                 double maxDelta = 0;
2212                 int iter = 0;
2213                 
2214                 begClock = clock();
2215                 begTime = time(NULL);
2216                 
2217                 m->mothurOut("\nDenoising flowgrams...\n");
2218                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
2219                 
2220                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
2221                     
2222                     if (m->control_pressed) { break; }
2223                     
2224                     double cycClock = clock();
2225                     unsigned long long cycTime = time(NULL);
2226                     fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2227                     
2228                     if (m->control_pressed) { break; }
2229                     
2230                     calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2231                     
2232                     if (m->control_pressed) { break; }
2233                     
2234                     maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);  
2235                     
2236                     if (m->control_pressed) { break; }
2237                     
2238                     double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight); 
2239                     
2240                     if (m->control_pressed) { break; }
2241                     
2242                     checkCentroids(numOTUs, centroids, weight);
2243                     
2244                     if (m->control_pressed) { break; }
2245                     
2246                     calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU,  dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
2247                     
2248                     if (m->control_pressed) { break; }
2249                     
2250                     iter++;
2251                     
2252                     m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
2253                     
2254                 }       
2255                 
2256                 if (m->control_pressed) { break; }
2257                 
2258                 m->mothurOut("\nFinalizing...\n");
2259                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2260                 
2261                 if (m->control_pressed) { break; }
2262                 
2263                 setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
2264                 
2265                 if (m->control_pressed) { break; }
2266                 
2267                 vector<int> otuCounts(numOTUs, 0);
2268                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
2269                 
2270                 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber); 
2271                 
2272                 if (m->control_pressed) { break; }
2273                 
2274                 if ((large) && (g == 0)) {  flowFileName = filenames[i]; theseFlowFileNames[0] = filenames[i]; }
2275                 string thisOutputDir = outputDir;
2276                 if (outputDir == "") {  thisOutputDir = m->hasPath(flowFileName);  }
2277                 string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.qual";
2278                 string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.fasta";
2279                 string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.names";
2280                 string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.counts";
2281                 string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
2282                 string groupFileName = fileRoot + "shhh.groups";
2283
2284                 
2285                 writeQualities(numOTUs, numFlowCells, qualityFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
2286                 writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, fastaFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
2287                 writeNames(thisCompositeNamesFileName, numOTUs, nameFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);                              if (m->control_pressed) { break; }
2288                 writeClusters(otuCountsFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);                  if (m->control_pressed) { break; }
2289                 writeGroups(groupFileName, fileRoot, numSeqs, seqNameVector);                                           if (m->control_pressed) { break; }
2290                 
2291                 if (large) {
2292                     if (g > 0) {
2293                         m->appendFiles(qualityFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.qual"));
2294                         m->mothurRemove(qualityFileName);
2295                         m->appendFiles(fastaFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.fasta"));
2296                         m->mothurRemove(fastaFileName);
2297                         m->appendFiles(nameFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.names"));
2298                         m->mothurRemove(nameFileName);
2299                         m->appendFiles(otuCountsFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.counts"));
2300                         m->mothurRemove(otuCountsFileName);
2301                         m->appendFiles(groupFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.groups"));
2302                         m->mothurRemove(groupFileName);
2303                     }
2304                     m->mothurRemove(theseFlowFileNames[g]);
2305                 }
2306                         }
2307             
2308             numCompleted++;
2309                         m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
2310                 }
2311                 
2312         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
2313         
2314         return numCompleted;
2315         
2316     }catch(exception& e) {
2317             m->errorOut(e, "ShhherCommand", "driver");
2318             exit(1);
2319     }
2320 }
2321
2322 /**************************************************************************************************/
2323 int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
2324         try{
2325        
2326                 ifstream flowFile;
2327        
2328                 m->openInputFile(filename, flowFile);
2329                 
2330                 string seqName;
2331                 int currentNumFlowCells;
2332                 float intensity;
2333         thisSeqNameVector.clear();
2334                 thisLengths.clear();
2335                 thisFlowDataIntI.clear();
2336                 thisNameMap.clear();
2337                 
2338                 flowFile >> numFlowCells;
2339                 int index = 0;//pcluster
2340                 while(!flowFile.eof()){
2341                         
2342                         if (m->control_pressed) { break; }
2343                         
2344                         flowFile >> seqName >> currentNumFlowCells;
2345                         thisLengths.push_back(currentNumFlowCells);
2346            
2347                         thisSeqNameVector.push_back(seqName);
2348                         thisNameMap[seqName] = index++;//pcluster
2349
2350                         for(int i=0;i<numFlowCells;i++){
2351                                 flowFile >> intensity;
2352                                 if(intensity > 9.99)    {       intensity = 9.99;       }
2353                                 int intI = int(100 * intensity + 0.0001);
2354                                 thisFlowDataIntI.push_back(intI);
2355                         }
2356                         m->gobble(flowFile);
2357                 }
2358                 flowFile.close();
2359                 
2360                 int numSeqs = thisSeqNameVector.size();         
2361                 
2362                 for(int i=0;i<numSeqs;i++){
2363                         
2364                         if (m->control_pressed) { break; }
2365                         
2366                         int iNumFlowCells = i * numFlowCells;
2367                         for(int j=thisLengths[i];j<numFlowCells;j++){
2368                                 thisFlowDataIntI[iNumFlowCells + j] = 0;
2369                         }
2370                 }
2371         
2372         return numSeqs;
2373                 
2374         }
2375         catch(exception& e) {
2376                 m->errorOut(e, "ShhherCommand", "getFlowData");
2377                 exit(1);
2378         }
2379 }
2380 /**************************************************************************************************/
2381
2382 int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2383         try{            
2384         
2385                 ostringstream outStream;
2386                 outStream.setf(ios::fixed, ios::floatfield);
2387                 outStream.setf(ios::dec, ios::basefield);
2388                 outStream.setf(ios::showpoint);
2389                 outStream.precision(6);
2390                 
2391                 int begTime = time(NULL);
2392                 double begClock = clock();
2393         
2394                 for(int i=0;i<stopSeq;i++){
2395                         
2396                         if (m->control_pressed) { break; }
2397                         
2398                         for(int j=0;j<i;j++){
2399                                 float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2400                 
2401                                 if(flowDistance < 1e-6){
2402                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
2403                                 }
2404                                 else if(flowDistance <= cutoff){
2405                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
2406                                 }
2407                         }
2408                         if(i % 100 == 0){
2409                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
2410                                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2411                                 m->mothurOutEndLine();
2412                         }
2413                 }
2414                 
2415                 ofstream distFile(distFileName.c_str());
2416                 distFile << outStream.str();            
2417                 distFile.close();
2418                 
2419                 if (m->control_pressed) {}
2420                 else {
2421                         m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
2422                         m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2423                         m->mothurOutEndLine();
2424                 }
2425         
2426         return 0;
2427         }
2428         catch(exception& e) {
2429                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
2430                 exit(1);
2431         }
2432 }
2433 /**************************************************************************************************/
2434
2435 float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2436         try{
2437                 int minLength = lengths[mapSeqToUnique[seqA]];
2438                 if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
2439                 
2440                 int ANumFlowCells = seqA * numFlowCells;
2441                 int BNumFlowCells = seqB * numFlowCells;
2442                 
2443                 float dist = 0;
2444                 
2445                 for(int i=0;i<minLength;i++){
2446                         
2447                         if (m->control_pressed) { break; }
2448                         
2449                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
2450                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
2451                         
2452                         int flowBIntI = flowDataIntI[BNumFlowCells + i];
2453                         float flowBPrI = flowDataPrI[BNumFlowCells + i];
2454                         dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
2455                 }
2456                 
2457                 dist /= (float) minLength;
2458                 return dist;
2459         }
2460         catch(exception& e) {
2461                 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
2462                 exit(1);
2463         }
2464 }
2465
2466 /**************************************************************************************************/
2467
2468 int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2469         try{
2470                 int numUniques = 0;
2471                 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
2472                 uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
2473                 uniqueLengths.assign(numSeqs, 0);
2474                 mapSeqToUnique.assign(numSeqs, -1);
2475                 mapUniqueToSeq.assign(numSeqs, -1);
2476                 
2477                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
2478                 
2479                 for(int i=0;i<numSeqs;i++){
2480                         
2481                         if (m->control_pressed) { break; }
2482                         
2483                         int index = 0;
2484                         
2485                         vector<short> current(numFlowCells);
2486                         for(int j=0;j<numFlowCells;j++){
2487                                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
2488                         }
2489             
2490                         for(int j=0;j<numUniques;j++){
2491                                 int offset = j * numFlowCells;
2492                                 bool toEnd = 1;
2493                                 
2494                                 int shorterLength;
2495                                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
2496                                 else                                                            {       shorterLength = uniqueLengths[j];       }
2497                 
2498                                 for(int k=0;k<shorterLength;k++){
2499                                         if(current[k] != uniqueFlowgrams[offset + k]){
2500                                                 toEnd = 0;
2501                                                 break;
2502                                         }
2503                                 }
2504                                 
2505                                 if(toEnd){
2506                                         mapSeqToUnique[i] = j;
2507                                         uniqueCount[j]++;
2508                                         index = j;
2509                                         if(lengths[i] > uniqueLengths[j])       {       uniqueLengths[j] = lengths[i];  }
2510                                         break;
2511                                 }
2512                                 index++;
2513                         }
2514                         
2515                         if(index == numUniques){
2516                                 uniqueLengths[numUniques] = lengths[i];
2517                                 uniqueCount[numUniques] = 1;
2518                                 mapSeqToUnique[i] = numUniques;//anMap
2519                                 mapUniqueToSeq[numUniques] = i;//anF
2520                                 
2521                                 for(int k=0;k<numFlowCells;k++){
2522                                         uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
2523                                         uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
2524                                 }
2525                                 
2526                                 numUniques++;
2527                         }
2528                 }
2529                 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
2530                 uniqueLengths.resize(numUniques);       
2531                 
2532                 flowDataPrI.resize(numSeqs * numFlowCells, 0);
2533                 for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
2534         
2535         return numUniques;
2536         }
2537         catch(exception& e) {
2538                 m->errorOut(e, "ShhherCommand", "getUniques");
2539                 exit(1);
2540         }
2541 }
2542 /**************************************************************************************************/
2543 int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
2544         try{
2545                 
2546                 vector<string> duplicateNames(numUniques, "");
2547                 for(int i=0;i<numSeqs;i++){
2548                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
2549                 }
2550                 
2551                 ofstream nameFile;
2552                 m->openOutputFile(filename, nameFile);
2553                 
2554                 for(int i=0;i<numUniques;i++){
2555                         
2556                         if (m->control_pressed) { break; }
2557                         
2558             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2559                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2560                 }
2561                 
2562                 nameFile.close();
2563         
2564                 return 0;
2565         }
2566         catch(exception& e) {
2567                 m->errorOut(e, "ShhherCommand", "createNamesFile");
2568                 exit(1);
2569         }
2570 }
2571 //**********************************************************************************************************************
2572
2573 int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
2574         try {
2575                 
2576                 ReadMatrix* read = new ReadColumnMatrix(distFileName);  
2577                 read->setCutoff(cutoff);
2578                 
2579                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
2580                 clusterNameMap->readMap();
2581                 read->read(clusterNameMap);
2582         
2583                 ListVector* list = read->getListVector();
2584                 SparseMatrix* matrix = read->getMatrix();
2585                 
2586                 delete read; 
2587                 delete clusterNameMap; 
2588         
2589                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
2590                 
2591                 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
2592                 string tag = cluster->getTag();
2593                 
2594                 double clusterCutoff = cutoff;
2595                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
2596                         
2597                         if (m->control_pressed) { break; }
2598                         
2599                         cluster->update(clusterCutoff);
2600                 }
2601                 
2602                 list->setLabel(toString(cutoff));
2603                 
2604                 ofstream listFile;
2605                 m->openOutputFile(filename, listFile);
2606                 list->print(listFile);
2607                 listFile.close();
2608                 
2609                 delete matrix;  delete cluster; delete rabund; delete list;
2610         
2611                 return 0;
2612         }
2613         catch(exception& e) {
2614                 m->errorOut(e, "ShhherCommand", "cluster");
2615                 exit(1);        
2616         }               
2617 }
2618 /**************************************************************************************************/
2619
2620 int ShhherCommand::getOTUData(int numSeqs, string fileName,  vector<int>& otuData,
2621                                vector<int>& cumNumSeqs,
2622                                vector<int>& nSeqsPerOTU,
2623                                vector<vector<int> >& aaP,       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2624                                vector<vector<int> >& aaI,       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2625                                vector<int>& seqNumber,          //tMaster->anP:         the sequence id number sorted by OTU
2626                                vector<int>& seqIndex,
2627                                map<string, int>& nameMap){
2628         try {
2629         
2630                 ifstream listFile;
2631                 m->openInputFile(fileName, listFile);
2632                 string label;
2633         int numOTUs;
2634                 
2635                 listFile >> label >> numOTUs;
2636         
2637                 otuData.assign(numSeqs, 0);
2638                 cumNumSeqs.assign(numOTUs, 0);
2639                 nSeqsPerOTU.assign(numOTUs, 0);
2640                 aaP.clear();aaP.resize(numOTUs);
2641                 
2642                 seqNumber.clear();
2643                 aaI.clear();
2644                 seqIndex.clear();
2645                 
2646                 string singleOTU = "";
2647                 
2648                 for(int i=0;i<numOTUs;i++){
2649                         
2650                         if (m->control_pressed) { break; }
2651             
2652                         listFile >> singleOTU;
2653                         
2654                         istringstream otuString(singleOTU);
2655             
2656                         while(otuString){
2657                                 
2658                                 string seqName = "";
2659                                 
2660                                 for(int j=0;j<singleOTU.length();j++){
2661                                         char letter = otuString.get();
2662                                         
2663                                         if(letter != ','){
2664                                                 seqName += letter;
2665                                         }
2666                                         else{
2667                                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2668                                                 int index = nmIt->second;
2669                                                 
2670                                                 nameMap.erase(nmIt);
2671                                                 
2672                                                 otuData[index] = i;
2673                                                 nSeqsPerOTU[i]++;
2674                                                 aaP[i].push_back(index);
2675                                                 seqName = "";
2676                                         }
2677                                 }
2678                                 
2679                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2680                 
2681                                 int index = nmIt->second;
2682                                 nameMap.erase(nmIt);
2683                 
2684                                 otuData[index] = i;
2685                                 nSeqsPerOTU[i]++;
2686                                 aaP[i].push_back(index);        
2687                                 
2688                                 otuString.get();
2689                         }
2690                         
2691                         sort(aaP[i].begin(), aaP[i].end());
2692                         for(int j=0;j<nSeqsPerOTU[i];j++){
2693                                 seqNumber.push_back(aaP[i][j]);
2694                         }
2695                         for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
2696                                 aaP[i].push_back(0);
2697                         }
2698                         
2699                         
2700                 }
2701                 
2702                 for(int i=1;i<numOTUs;i++){
2703                         cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
2704                 }
2705                 aaI = aaP;
2706                 seqIndex = seqNumber;
2707                 
2708                 listFile.close();       
2709         
2710         return numOTUs;
2711                 
2712         }
2713         catch(exception& e) {
2714                 m->errorOut(e, "ShhherCommand", "getOTUData");
2715                 exit(1);        
2716         }               
2717 }
2718 /**************************************************************************************************/
2719
2720 int ShhherCommand::calcCentroidsDriver(int numOTUs, 
2721                                           vector<int>& cumNumSeqs,
2722                                           vector<int>& nSeqsPerOTU,
2723                                           vector<int>& seqIndex,
2724                                           vector<short>& change,                //did the centroid sequence change? 0 = no; 1 = yes
2725                                           vector<int>& centroids,               //the representative flowgram for each cluster m
2726                                           vector<double>& singleTau,    //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2727                                           vector<int>& mapSeqToUnique,
2728                                           vector<short>& uniqueFlowgrams,
2729                                           vector<short>& flowDataIntI,
2730                                           vector<int>& lengths,
2731                                           int numFlowCells,
2732                                           vector<int>& seqNumber){                          
2733         
2734         //this function gets the most likely homopolymer length at a flow position for a group of sequences
2735         //within an otu
2736         
2737         try{
2738                 
2739                 for(int i=0;i<numOTUs;i++){
2740                         
2741                         if (m->control_pressed) { break; }
2742                         
2743                         double count = 0;
2744                         int position = 0;
2745                         int minFlowGram = 100000000;
2746                         double minFlowValue = 1e8;
2747                         change[i] = 0; //FALSE
2748                         
2749                         for(int j=0;j<nSeqsPerOTU[i];j++){
2750                                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
2751                         }
2752             
2753                         if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
2754                                 vector<double> adF(nSeqsPerOTU[i]);
2755                                 vector<int> anL(nSeqsPerOTU[i]);
2756                                 
2757                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2758                                         int index = cumNumSeqs[i] + j;
2759                                         int nI = seqIndex[index];
2760                                         int nIU = mapSeqToUnique[nI];
2761                                         
2762                                         int k;
2763                                         for(k=0;k<position;k++){
2764                                                 if(nIU == anL[k]){
2765                                                         break;
2766                                                 }
2767                                         }
2768                                         if(k == position){
2769                                                 anL[position] = nIU;
2770                                                 adF[position] = 0.0000;
2771                                                 position++;
2772                                         }                                               
2773                                 }
2774                                 
2775                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2776                                         int index = cumNumSeqs[i] + j;
2777                                         int nI = seqIndex[index];
2778                                         
2779                                         double tauValue = singleTau[seqNumber[index]];
2780                                         
2781                                         for(int k=0;k<position;k++){
2782                                                 double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
2783                                                 adF[k] += dist * tauValue;
2784                                         }
2785                                 }
2786                                 
2787                                 for(int j=0;j<position;j++){
2788                                         if(adF[j] < minFlowValue){
2789                                                 minFlowGram = j;
2790                                                 minFlowValue = adF[j];
2791                                         }
2792                                 }
2793                                 
2794                                 if(centroids[i] != anL[minFlowGram]){
2795                                         change[i] = 1;
2796                                         centroids[i] = anL[minFlowGram];
2797                                 }
2798                         }
2799                         else if(centroids[i] != -1){
2800                                 change[i] = 1;
2801                                 centroids[i] = -1;                      
2802                         }
2803                 }
2804         
2805         return 0;
2806         }
2807         catch(exception& e) {
2808                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
2809                 exit(1);        
2810         }               
2811 }
2812 /**************************************************************************************************/
2813
2814 double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
2815                                         vector<short>& flowDataIntI, int numFlowCells){
2816         try{
2817                 
2818                 int flowAValue = cent * numFlowCells;
2819                 int flowBValue = flow * numFlowCells;
2820                 
2821                 double dist = 0;
2822         
2823                 for(int i=0;i<length;i++){
2824                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
2825                         flowAValue++;
2826                         flowBValue++;
2827                 }
2828                 
2829                 return dist / (double)length;
2830         }
2831         catch(exception& e) {
2832                 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
2833                 exit(1);        
2834         }               
2835 }
2836 /**************************************************************************************************/
2837
2838 double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
2839         try{
2840                 
2841                 double maxChange = 0;
2842                 
2843                 for(int i=0;i<numOTUs;i++){
2844                         
2845                         if (m->control_pressed) { break; }
2846                         
2847                         double difference = weight[i];
2848                         weight[i] = 0;
2849                         
2850                         for(int j=0;j<nSeqsPerOTU[i];j++){
2851                                 int index = cumNumSeqs[i] + j;
2852                                 double tauValue = singleTau[seqNumber[index]];
2853                                 weight[i] += tauValue;
2854                         }
2855                         
2856                         difference = fabs(weight[i] - difference);
2857                         if(difference > maxChange){     maxChange = difference; }
2858                 }
2859                 return maxChange;
2860         }
2861         catch(exception& e) {
2862                 m->errorOut(e, "ShhherCommand", "getNewWeights");
2863                 exit(1);        
2864         }               
2865 }
2866
2867 /**************************************************************************************************/
2868
2869 double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
2870         
2871         try{
2872                 
2873                 vector<long double> P(numSeqs, 0);
2874                 int effNumOTUs = 0;
2875                 
2876                 for(int i=0;i<numOTUs;i++){
2877                         if(weight[i] > MIN_WEIGHT){
2878                                 effNumOTUs++;
2879                         }
2880                 }
2881                 
2882                 string hold;
2883                 for(int i=0;i<numOTUs;i++){
2884                         
2885                         if (m->control_pressed) { break; }
2886                         
2887                         for(int j=0;j<nSeqsPerOTU[i];j++){
2888                                 int index = cumNumSeqs[i] + j;
2889                                 int nI = seqIndex[index];
2890                                 double singleDist = dist[seqNumber[index]];
2891                                 
2892                                 P[nI] += weight[i] * exp(-singleDist * sigma);
2893                         }
2894                 }
2895                 double nLL = 0.00;
2896                 for(int i=0;i<numSeqs;i++){
2897                         if(P[i] == 0){  P[i] = DBL_EPSILON;     }
2898             
2899                         nLL += -log(P[i]);
2900                 }
2901                 
2902                 nLL = nLL -(double)numSeqs * log(sigma);
2903         
2904                 return nLL; 
2905         }
2906         catch(exception& e) {
2907                 m->errorOut(e, "ShhherCommand", "getNewWeights");
2908                 exit(1);        
2909         }               
2910 }
2911
2912 /**************************************************************************************************/
2913
2914 int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
2915         try{
2916                 vector<int> unique(numOTUs, 1);
2917                 
2918                 for(int i=0;i<numOTUs;i++){
2919                         if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
2920                                 unique[i] = -1;
2921                         }
2922                 }
2923                 
2924                 for(int i=0;i<numOTUs;i++){
2925                         
2926                         if (m->control_pressed) { break; }
2927                         
2928                         if(unique[i] == 1){
2929                                 for(int j=i+1;j<numOTUs;j++){
2930                                         if(unique[j] == 1){
2931                                                 
2932                                                 if(centroids[j] == centroids[i]){
2933                                                         unique[j] = 0;
2934                                                         centroids[j] = -1;
2935                                                         
2936                                                         weight[i] += weight[j];
2937                                                         weight[j] = 0.0;
2938                                                 }
2939                                         }
2940                                 }
2941                         }
2942                 }
2943         
2944         return 0;
2945         }
2946         catch(exception& e) {
2947                 m->errorOut(e, "ShhherCommand", "checkCentroids");
2948                 exit(1);        
2949         }               
2950 }
2951 /**************************************************************************************************/
2952
2953 void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<double>& dist, 
2954                                      vector<double>& weight, vector<short>& change, vector<int>& centroids,
2955                                      vector<vector<int> >& aaP, vector<double>& singleTau, vector<vector<int> >& aaI,   
2956                                      vector<int>& seqNumber, vector<int>& seqIndex,
2957                                      vector<short>& uniqueFlowgrams,
2958                                      vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
2959         
2960         try{
2961                 
2962                 int total = 0;
2963                 vector<double> newTau(numOTUs,0);
2964                 vector<double> norms(numSeqs, 0);
2965                 nSeqsPerOTU.assign(numOTUs, 0);
2966         
2967                 for(int i=0;i<numSeqs;i++){
2968                         
2969                         if (m->control_pressed) { break; }
2970                         
2971                         int indexOffset = i * numOTUs;
2972             
2973                         double offset = 1e8;
2974                         
2975                         for(int j=0;j<numOTUs;j++){
2976                 
2977                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
2978                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
2979                                 }
2980                 
2981                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
2982                                         offset = dist[indexOffset + j];
2983                                 }
2984                         }
2985             
2986                         for(int j=0;j<numOTUs;j++){
2987                                 if(weight[j] > MIN_WEIGHT){
2988                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
2989                                         norms[i] += newTau[j];
2990                                 }
2991                                 else{
2992                                         newTau[j] = 0.0;
2993                                 }
2994                         }
2995             
2996                         for(int j=0;j<numOTUs;j++){
2997                                 newTau[j] /= norms[i];
2998                         }
2999             
3000                         for(int j=0;j<numOTUs;j++){
3001                                 if(newTau[j] > MIN_TAU){
3002                                         
3003                                         int oldTotal = total;
3004                                         
3005                                         total++;
3006                                         
3007                                         singleTau.resize(total, 0);
3008                                         seqNumber.resize(total, 0);
3009                                         seqIndex.resize(total, 0);
3010                                         
3011                                         singleTau[oldTotal] = newTau[j];
3012                                         
3013                                         aaP[j][nSeqsPerOTU[j]] = oldTotal;
3014                                         aaI[j][nSeqsPerOTU[j]] = i;
3015                                         nSeqsPerOTU[j]++;
3016                                 }
3017                         }
3018             
3019                 }
3020         
3021         }
3022         catch(exception& e) {
3023                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
3024                 exit(1);        
3025         }               
3026 }
3027 /**************************************************************************************************/
3028
3029 int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3030         try {
3031                 int index = 0;
3032                 for(int i=0;i<numOTUs;i++){
3033                         
3034                         if (m->control_pressed) { return 0; }
3035                         
3036                         cumNumSeqs[i] = index;
3037                         for(int j=0;j<nSeqsPerOTU[i];j++){
3038                                 seqNumber[index] = aaP[i][j];
3039                                 seqIndex[index] = aaI[i][j];
3040                                 
3041                                 index++;
3042                         }
3043                 }
3044         
3045         return 0; 
3046         }
3047         catch(exception& e) {
3048                 m->errorOut(e, "ShhherCommand", "fill");
3049                 exit(1);        
3050         }               
3051 }
3052 /**************************************************************************************************/
3053
3054 void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
3055                             vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3056         
3057         try {
3058                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
3059                 
3060                 for(int i=0;i<numOTUs;i++){
3061                         
3062                         if (m->control_pressed) { break; }
3063                         
3064                         for(int j=0;j<nSeqsPerOTU[i];j++){
3065                                 int index = cumNumSeqs[i] + j;
3066                                 double tauValue = singleTau[seqNumber[index]];
3067                                 int sIndex = seqIndex[index];
3068                                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
3069                         }
3070                 }
3071                 
3072                 for(int i=0;i<numSeqs;i++){
3073                         double maxTau = -1.0000;
3074                         int maxOTU = -1;
3075                         for(int j=0;j<numOTUs;j++){
3076                                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
3077                                         maxTau = bigTauMatrix[i * numOTUs + j];
3078                                         maxOTU = j;
3079                                 }
3080                         }
3081                         
3082                         otuData[i] = maxOTU;
3083                 }
3084                 
3085                 nSeqsPerOTU.assign(numOTUs, 0);         
3086                 
3087                 for(int i=0;i<numSeqs;i++){
3088                         int index = otuData[i];
3089                         
3090                         singleTau[i] = 1.0000;
3091                         dist[i] = 0.0000;
3092                         
3093                         aaP[index][nSeqsPerOTU[index]] = i;
3094                         aaI[index][nSeqsPerOTU[index]] = i;
3095                         
3096                         nSeqsPerOTU[index]++;
3097                 }
3098         
3099                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);  
3100         }
3101         catch(exception& e) {
3102                 m->errorOut(e, "ShhherCommand", "setOTUs");
3103                 exit(1);        
3104         }               
3105 }
3106 /**************************************************************************************************/
3107
3108 void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string qualityFileName, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
3109                                    vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
3110                                    vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
3111         
3112         try {
3113         
3114                 ofstream qualityFile;
3115                 m->openOutputFile(qualityFileName, qualityFile);
3116         
3117                 qualityFile.setf(ios::fixed, ios::floatfield);
3118                 qualityFile.setf(ios::showpoint);
3119                 qualityFile << setprecision(6);
3120                 
3121                 vector<vector<int> > qualities(numOTUs);
3122                 vector<double> pr(HOMOPS, 0);
3123                 
3124                 
3125                 for(int i=0;i<numOTUs;i++){
3126                         
3127                         if (m->control_pressed) { break; }
3128                         
3129                         int index = 0;
3130                         int base = 0;
3131                         
3132                         if(nSeqsPerOTU[i] > 0){
3133                                 qualities[i].assign(1024, -1);
3134                                 
3135                                 while(index < numFlowCells){
3136                                         double maxPrValue = 1e8;
3137                                         short maxPrIndex = -1;
3138                                         double count = 0.0000;
3139                                         
3140                                         pr.assign(HOMOPS, 0);
3141                                         
3142                                         for(int j=0;j<nSeqsPerOTU[i];j++){
3143                                                 int lIndex = cumNumSeqs[i] + j;
3144                                                 double tauValue = singleTau[seqNumber[lIndex]];
3145                                                 int sequenceIndex = aaI[i][j];
3146                                                 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
3147                                                 
3148                                                 count += tauValue;
3149                                                 
3150                                                 for(int s=0;s<HOMOPS;s++){
3151                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
3152                                                 }
3153                                         }
3154                                         
3155                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
3156                                         maxPrValue = pr[maxPrIndex];
3157                                         
3158                                         if(count > MIN_COUNT){
3159                                                 double U = 0.0000;
3160                                                 double norm = 0.0000;
3161                                                 
3162                                                 for(int s=0;s<HOMOPS;s++){
3163                                                         norm += exp(-(pr[s] - maxPrValue));
3164                                                 }
3165                                                 
3166                                                 for(int s=1;s<=maxPrIndex;s++){
3167                                                         int value = 0;
3168                                                         double temp = 0.0000;
3169                                                         
3170                                                         U += exp(-(pr[s-1]-maxPrValue))/norm;
3171                                                         
3172                                                         if(U>0.00){
3173                                                                 temp = log10(U);
3174                                                         }
3175                                                         else{
3176                                                                 temp = -10.1;
3177                                                         }
3178                                                         temp = floor(-10 * temp);
3179                                                         value = (int)floor(temp);
3180                                                         if(value > 100){        value = 100;    }
3181                                                         
3182                                                         qualities[i][base] = (int)value;
3183                                                         base++;
3184                                                 }
3185                                         }
3186                                         
3187                                         index++;
3188                                 }
3189                         }
3190                         
3191                         
3192                         if(otuCounts[i] > 0){
3193                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
3194                         
3195                                 int j=4;        //need to get past the first four bases
3196                                 while(qualities[i][j] != -1){
3197                     qualityFile << qualities[i][j] << ' ';
3198                     if (j > qualities[i].size()) { break; }
3199                     j++;
3200                                 }
3201                                 qualityFile << endl;
3202                         }
3203                 }
3204                 qualityFile.close();
3205                 outputNames.push_back(qualityFileName);
3206         
3207         }
3208         catch(exception& e) {
3209                 m->errorOut(e, "ShhherCommand", "writeQualities");
3210                 exit(1);        
3211         }               
3212 }
3213
3214 /**************************************************************************************************/
3215
3216 void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string fastaFileName, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
3217         try {
3218                 
3219                 ofstream fastaFile;
3220                 m->openOutputFile(fastaFileName, fastaFile);
3221                 
3222                 vector<string> names(numOTUs, "");
3223                 
3224                 for(int i=0;i<numOTUs;i++){
3225                         
3226                         if (m->control_pressed) { break; }
3227                         
3228                         int index = centroids[i];
3229                         
3230                         if(otuCounts[i] > 0){
3231                                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
3232                                 
3233                                 string newSeq = "";
3234                                 
3235                                 for(int j=0;j<numFlowCells;j++){
3236                                         
3237                                         char base = flowOrder[j % 4];
3238                                         for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
3239                                                 newSeq += base;
3240                                         }
3241                                 }
3242                                 
3243                                 if (newSeq.length() >= 4) {  fastaFile << newSeq.substr(4) << endl;  }
3244                 else {  fastaFile << "NNNN" << endl;  }
3245                         }
3246                 }
3247                 fastaFile.close();
3248         
3249                 outputNames.push_back(fastaFileName);
3250         
3251                 if(thisCompositeFASTAFileName != ""){
3252                         m->appendFiles(fastaFileName, thisCompositeFASTAFileName);
3253                 }
3254         }
3255         catch(exception& e) {
3256                 m->errorOut(e, "ShhherCommand", "writeSequences");
3257                 exit(1);        
3258         }               
3259 }
3260
3261 /**************************************************************************************************/
3262
3263 void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string nameFileName, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
3264         try {
3265                 
3266                 ofstream nameFile;
3267                 m->openOutputFile(nameFileName, nameFile);
3268                 
3269                 for(int i=0;i<numOTUs;i++){
3270                         
3271                         if (m->control_pressed) { break; }
3272                         
3273                         if(otuCounts[i] > 0){
3274                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
3275                                 
3276                                 for(int j=1;j<nSeqsPerOTU[i];j++){
3277                                         nameFile << ',' << seqNameVector[aaI[i][j]];
3278                                 }
3279                                 
3280                                 nameFile << endl;
3281                         }
3282                 }
3283                 nameFile.close();
3284                 outputNames.push_back(nameFileName);
3285                 
3286                 
3287                 if(thisCompositeNamesFileName != ""){
3288                         m->appendFiles(nameFileName, thisCompositeNamesFileName);
3289                 }               
3290         }
3291         catch(exception& e) {
3292                 m->errorOut(e, "ShhherCommand", "writeNames");
3293                 exit(1);        
3294         }               
3295 }
3296
3297 /**************************************************************************************************/
3298
3299 void ShhherCommand::writeGroups(string groupFileName, string fileRoot, int numSeqs, vector<string>& seqNameVector){
3300         try {
3301         ofstream groupFile;
3302                 m->openOutputFile(groupFileName, groupFile);
3303                 
3304                 for(int i=0;i<numSeqs;i++){
3305                         if (m->control_pressed) { break; }
3306                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
3307                 }
3308                 groupFile.close();
3309                 outputNames.push_back(groupFileName);
3310         
3311         }
3312         catch(exception& e) {
3313                 m->errorOut(e, "ShhherCommand", "writeGroups");
3314                 exit(1);        
3315         }               
3316 }
3317
3318 /**************************************************************************************************/
3319
3320 void ShhherCommand::writeClusters(string otuCountsFileName, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
3321         try {
3322                 ofstream otuCountsFile;
3323                 m->openOutputFile(otuCountsFileName, otuCountsFile);
3324                 
3325                 string bases = flowOrder;
3326                 
3327                 for(int i=0;i<numOTUs;i++){
3328                         
3329                         if (m->control_pressed) {
3330                                 break;
3331                         }
3332                         //output the translated version of the centroid sequence for the otu
3333                         if(otuCounts[i] > 0){
3334                                 int index = centroids[i];
3335                                 
3336                                 otuCountsFile << "ideal\t";
3337                                 for(int j=8;j<numFlowCells;j++){
3338                                         char base = bases[j % 4];
3339                                         for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
3340                                                 otuCountsFile << base;
3341                                         }
3342                                 }
3343                                 otuCountsFile << endl;
3344                                 
3345                                 for(int j=0;j<nSeqsPerOTU[i];j++){
3346                                         int sequence = aaI[i][j];
3347                                         otuCountsFile << seqNameVector[sequence] << '\t';
3348                                         
3349                                         string newSeq = "";
3350                                         
3351                                         for(int k=0;k<lengths[sequence];k++){
3352                                                 char base = bases[k % 4];
3353                                                 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
3354                         
3355                                                 for(int s=0;s<freq;s++){
3356                                                         newSeq += base;
3357                                                         //otuCountsFile << base;
3358                                                 }
3359                                         }
3360                                         
3361                     if (newSeq.length() >= 4) {  otuCountsFile << newSeq.substr(4) << endl;  }
3362                     else {  otuCountsFile << "NNNN" << endl;  }
3363                                 }
3364                                 otuCountsFile << endl;
3365                         }
3366                 }
3367                 otuCountsFile.close();
3368                 outputNames.push_back(otuCountsFileName);
3369         
3370         }
3371         catch(exception& e) {
3372                 m->errorOut(e, "ShhherCommand", "writeClusters");
3373                 exit(1);        
3374         }               
3375 }
3376
3377 /**************************************************************************************************/
3378
3379 void ShhherCommand::getSingleLookUp(){
3380         try{
3381                 //      these are the -log probabilities that a signal corresponds to a particular homopolymer length
3382                 singleLookUp.assign(HOMOPS * NUMBINS, 0);
3383                 
3384                 int index = 0;
3385                 ifstream lookUpFile;
3386                 m->openInputFile(lookupFileName, lookUpFile);
3387                 
3388                 for(int i=0;i<HOMOPS;i++){
3389                         
3390                         if (m->control_pressed) { break; }
3391                         
3392                         float logFracFreq;
3393                         lookUpFile >> logFracFreq;
3394                         
3395                         for(int j=0;j<NUMBINS;j++)      {
3396                                 lookUpFile >> singleLookUp[index];
3397                                 index++;
3398                         }
3399                 }       
3400                 lookUpFile.close();
3401         }
3402         catch(exception& e) {
3403                 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
3404                 exit(1);
3405         }
3406 }
3407
3408 /**************************************************************************************************/
3409
3410 void ShhherCommand::getJointLookUp(){
3411         try{
3412                 
3413                 //      the most likely joint probability (-log) that two intenities have the same polymer length
3414                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
3415                 
3416                 for(int i=0;i<NUMBINS;i++){
3417                         
3418                         if (m->control_pressed) { break; }
3419                         
3420                         for(int j=0;j<NUMBINS;j++){             
3421                                 
3422                                 double minSum = 100000000;
3423                                 
3424                                 for(int k=0;k<HOMOPS;k++){
3425                                         double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
3426                                         
3427                                         if(sum < minSum)        {       minSum = sum;           }
3428                                 }       
3429                                 jointLookUp[i * NUMBINS + j] = minSum;
3430                         }
3431                 }
3432         }
3433         catch(exception& e) {
3434                 m->errorOut(e, "ShhherCommand", "getJointLookUp");
3435                 exit(1);
3436         }
3437 }
3438
3439 /**************************************************************************************************/
3440
3441 double ShhherCommand::getProbIntensity(int intIntensity){                          
3442         try{
3443
3444                 double minNegLogProb = 100000000; 
3445
3446                 
3447                 for(int i=0;i<HOMOPS;i++){//loop signal strength
3448                         
3449                         if (m->control_pressed) { break; }
3450                         
3451                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
3452                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
3453                 }
3454                 
3455                 return minNegLogProb;
3456         }
3457         catch(exception& e) {
3458                 m->errorOut(e, "ShhherCommand", "getProbIntensity");
3459                 exit(1);
3460         }
3461 }
3462
3463
3464
3465