]> git.donarmstrong.com Git - mothur.git/blob - shhhercommand.cpp
fixed bug with dist.shared subsampling. added mode parameter to dist.shared so...
[mothur.git] / shhhercommand.cpp
1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "shhhercommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> ShhherCommand::setParameters(){  
14         try {
15                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pflow);
16                 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pfile);
17                 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plookup);
18                 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff);
19                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
20                 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter);
21         CommandParameter plarge("large", "Number", "", "-1", "", "", "",false,false); parameters.push_back(plarge);
22                 CommandParameter psigma("sigma", "Number", "", "60", "", "", "",false,false); parameters.push_back(psigma);
23                 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "",false,false); parameters.push_back(pmindelta);
24                 CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
25                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
26                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
27                 
28                 vector<string> myArray;
29                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
30                 return myArray;
31         }
32         catch(exception& e) {
33                 m->errorOut(e, "ShhherCommand", "setParameters");
34                 exit(1);
35         }
36 }
37 //**********************************************************************************************************************
38 string ShhherCommand::getHelpString(){  
39         try {
40                 string helpString = "";
41                 helpString += "The shhh.flows command reads a file containing flowgrams and creates a file of corrected sequences.\n";
42                 return helpString;
43         }
44         catch(exception& e) {
45                 m->errorOut(e, "ShhherCommand", "getHelpString");
46                 exit(1);
47         }
48 }
49 //**********************************************************************************************************************
50
51 ShhherCommand::ShhherCommand(){ 
52         try {
53                 abort = true; calledHelp = true;
54                 setParameters();
55                 
56                 //initialize outputTypes
57 //              vector<string> tempOutNames;
58 //              outputTypes["pn.dist"] = tempOutNames;
59
60         }
61         catch(exception& e) {
62                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
63                 exit(1);
64         }
65 }
66
67 //**********************************************************************************************************************
68
69 ShhherCommand::ShhherCommand(string option) {
70         try {
71         
72 #ifdef USE_MPI
73                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
74                 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
75         
76                 if(pid == 0){
77 #endif
78                 abort = false; calledHelp = false;   
79                 
80                 //allow user to run help
81                 if(option == "help") { help(); abort = true; calledHelp = true; }
82                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
83                 
84                 else {
85                         vector<string> myArray = setParameters();
86                         
87                         OptionParser parser(option);
88                         map<string,string> parameters = parser.getParameters();
89                         
90                         ValidParameters validParameter;
91                         map<string,string>::iterator it;
92                         
93                         //check to make sure all parameters are valid for command
94                         for (it = parameters.begin(); it != parameters.end(); it++) { 
95                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
96                         }
97                         
98                         //initialize outputTypes
99                         vector<string> tempOutNames;
100 //                      outputTypes["pn.dist"] = tempOutNames;
101                         //                      outputTypes["fasta"] = tempOutNames;
102                         
103                         //if the user changes the input directory command factory will send this info to us in the output parameter 
104                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
105                         if (inputDir == "not found"){   inputDir = "";          }
106                         else {
107                                 string path;
108                                 it = parameters.find("flow");
109                                 //user has given a template file
110                                 if(it != parameters.end()){ 
111                                         path = m->hasPath(it->second);
112                                         //if the user has not given a path then, add inputdir. else leave path alone.
113                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
114                                 }
115                                 
116                                 it = parameters.find("lookup");
117                                 //user has given a template file
118                                 if(it != parameters.end()){ 
119                                         path = m->hasPath(it->second);
120                                         //if the user has not given a path then, add inputdir. else leave path alone.
121                                         if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
122                                 }
123
124                                 it = parameters.find("file");
125                                 //user has given a template file
126                                 if(it != parameters.end()){ 
127                                         path = m->hasPath(it->second);
128                                         //if the user has not given a path then, add inputdir. else leave path alone.
129                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
130                                 }
131                         }
132             
133             //if the user changes the output directory command factory will send this info to us in the output parameter 
134                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
135             
136                         //check for required parameters
137                         flowFileName = validParameter.validFile(parameters, "flow", true);
138                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
139                         if (flowFileName == "not found" && flowFilesFileName == "not found") {
140                                 m->mothurOut("values for either flow or file must be provided for the shhh.flows command.");
141                                 m->mothurOutEndLine();
142                                 abort = true; 
143                         }
144                         else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
145                         
146                         if(flowFileName != "not found"){
147                                 compositeFASTAFileName = "";    
148                                 compositeNamesFileName = "";    
149                         }
150                         else{
151                                 ofstream temp;
152                 
153                 string thisoutputDir = outputDir;
154                 if (outputDir == "") {  thisoutputDir =  m->hasPath(flowFilesFileName); } //if user entered a file with a path then preserve it
155                 
156                                 //we want to rip off .files, and also .flow if its there
157                 string fileroot = m->getRootName(m->getSimpleName(flowFilesFileName));
158                 if (fileroot[fileroot.length()-1] == '.') {  fileroot = fileroot.substr(0, fileroot.length()-1); } //rip off dot
159                 string extension = m->getExtension(fileroot);
160                 if (extension == ".flow") { fileroot = m->getRootName(fileroot);  }
161                 else { fileroot += "."; } //add back if needed
162                 
163                                 compositeFASTAFileName = thisoutputDir + fileroot + "shhh.fasta";
164                                 m->openOutputFile(compositeFASTAFileName, temp);
165                                 temp.close();
166                                 
167                                 compositeNamesFileName = thisoutputDir + fileroot + "shhh.names";
168                                 m->openOutputFile(compositeNamesFileName, temp);
169                                 temp.close();
170                         }
171             
172             if(flowFilesFileName != "not found"){
173                 string fName;
174                 
175                 ifstream flowFilesFile;
176                 m->openInputFile(flowFilesFileName, flowFilesFile);
177                 while(flowFilesFile){
178                     fName = m->getline(flowFilesFile);
179                     
180                     //test if file is valid
181                     ifstream in;
182                     int ableToOpen = m->openInputFile(fName, in, "noerror");
183                     in.close(); 
184                     if (ableToOpen == 1) {
185                         if (inputDir != "") { //default path is set
186                             string tryPath = inputDir + fName;
187                             m->mothurOut("Unable to open " + fName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
188                             ifstream in2;
189                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
190                             in2.close();
191                             fName = tryPath;
192                         }
193                     }
194                     
195                     if (ableToOpen == 1) {
196                         if (m->getDefaultPath() != "") { //default path is set
197                             string tryPath = m->getDefaultPath() + m->getSimpleName(fName);
198                             m->mothurOut("Unable to open " + fName + ". Trying default " + tryPath); m->mothurOutEndLine();
199                             ifstream in2;
200                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
201                             in2.close();
202                             fName = tryPath;
203                         }
204                     }
205                     
206                     //if you can't open it its not in current working directory or inputDir, try mothur excutable location
207                     if (ableToOpen == 1) {
208                         string exepath = m->argv;
209                         string tempPath = exepath;
210                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
211                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
212                         
213                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(fName);
214                         m->mothurOut("Unable to open " + fName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
215                         ifstream in2;
216                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
217                         in2.close();
218                         fName = tryPath;
219                     }
220                     
221                     if (ableToOpen == 1) {  m->mothurOut("Unable to open " + fName + ". Disregarding. "); m->mothurOutEndLine();  }
222                     else { flowFileVector.push_back(fName); }
223                     m->gobble(flowFilesFile);
224                 }
225                 flowFilesFile.close();
226                 if (flowFileVector.size() == 0) {  m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
227             }
228             else{
229                 if (outputDir == "") { outputDir = m->hasPath(flowFileName); }
230                 flowFileVector.push_back(flowFileName);
231             }
232                 
233                         //check for optional parameter and set defaults
234                         // ...at some point should added some additional type checking...
235                         string temp;
236                         temp = validParameter.validFile(parameters, "lookup", true);
237                         if (temp == "not found")        {       
238                                 lookupFileName = "LookUp_Titanium.pat"; 
239                                 
240                                 int ableToOpen;
241                                 ifstream in;
242                                 ableToOpen = m->openInputFile(lookupFileName, in, "noerror");
243                                 in.close();     
244                                 
245                                 //if you can't open it, try input location
246                                 if (ableToOpen == 1) {
247                                         if (inputDir != "") { //default path is set
248                                                 string tryPath = inputDir + lookupFileName;
249                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
250                                                 ifstream in2;
251                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
252                                                 in2.close();
253                                                 lookupFileName = tryPath;
254                                         }
255                                 }
256                                 
257                                 //if you can't open it, try default location
258                                 if (ableToOpen == 1) {
259                                         if (m->getDefaultPath() != "") { //default path is set
260                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName);
261                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine();
262                                                 ifstream in2;
263                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
264                                                 in2.close();
265                                                 lookupFileName = tryPath;
266                                         }
267                                 }
268                                 
269                                 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
270                                 if (ableToOpen == 1) {
271                                         string exepath = m->argv;
272                                         string tempPath = exepath;
273                                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
274                                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
275                                         
276                                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
277                                         m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
278                                         ifstream in2;
279                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
280                                         in2.close();
281                                         lookupFileName = tryPath;
282                                 }
283                                 
284                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
285                         }
286                         else if(temp == "not open")     {       
287                                 
288                                 lookupFileName = validParameter.validFile(parameters, "lookup", false);
289                                 
290                                 //if you can't open it its not inputDir, try mothur excutable location
291                                 string exepath = m->argv;
292                                 string tempPath = exepath;
293                                 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
294                                 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
295                                         
296                                 string tryPath = m->getFullPathName(exepath) + lookupFileName;
297                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
298                                 ifstream in2;
299                                 int ableToOpen = m->openInputFile(tryPath, in2, "noerror");
300                                 in2.close();
301                                 lookupFileName = tryPath;
302                                 
303                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
304                         }else                                           {       lookupFileName = temp;  }
305                         
306                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
307                         m->setProcessors(temp);
308                         m->mothurConvert(temp, processors);
309
310                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
311                         m->mothurConvert(temp, cutoff); 
312                         
313                         temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
314                         m->mothurConvert(temp, minDelta); 
315
316                         temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
317                         m->mothurConvert(temp, maxIters); 
318             
319             temp = validParameter.validFile(parameters, "large", false);        if (temp == "not found"){       temp = "0";             }
320                         m->mothurConvert(temp, largeSize); 
321             if (largeSize != 0) { large = true; }
322             else { large = false;  }
323             if (largeSize < 0) {  m->mothurOut("The value of the large cannot be negative.\n"); }
324             
325 #ifdef USE_MPI
326             if (large) { m->mothurOut("The large parameter is not available with the MPI-Enabled version.\n"); large=false; }                           
327 #endif
328             
329
330                         temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
331                         m->mothurConvert(temp, sigma); 
332                         
333                         flowOrder = validParameter.validFile(parameters, "order", false);
334                         if (flowOrder == "not found"){ flowOrder = "TACG";              }
335                         else if(flowOrder.length() != 4){
336                                 m->mothurOut("The value of the order option must be four bases long\n");
337                         }
338                         
339                 }
340 #ifdef USE_MPI
341                 }                               
342 #endif
343         }
344         catch(exception& e) {
345                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
346                 exit(1);
347         }
348 }
349 //**********************************************************************************************************************
350 #ifdef USE_MPI
351 int ShhherCommand::execute(){
352         try {
353                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
354                 
355                 int tag = 1976;
356                 MPI_Status status; 
357                         
358                 if(pid == 0){
359
360                         for(int i=1;i<ncpus;i++){
361                                 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
362                         }
363                         if(abort == 1){ return 0;       }
364
365                         processors = ncpus;
366                         
367                         m->mothurOut("\nGetting preliminary data...\n");
368                         getSingleLookUp();      if (m->control_pressed) { return 0; }
369                         getJointLookUp();       if (m->control_pressed) { return 0; }
370                         
371             vector<string> flowFileVector;
372                         if(flowFilesFileName != "not found"){
373                                 string fName;
374                 
375                                 ifstream flowFilesFile;
376                                 m->openInputFile(flowFilesFileName, flowFilesFile);
377                                 while(flowFilesFile){
378                                         fName = m->getline(flowFilesFile);
379                                         flowFileVector.push_back(fName);
380                                         m->gobble(flowFilesFile);
381                                 }
382                         }
383                         else{
384                                 flowFileVector.push_back(flowFileName);
385                         }
386             
387                         int numFiles = flowFileVector.size();
388
389                         for(int i=1;i<ncpus;i++){
390                                 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
391                         }
392                         
393                         for(int i=0;i<numFiles;i++){
394                                 
395                                 if (m->control_pressed) { break; }
396                                 
397                                 double begClock = clock();
398                                 unsigned long long begTime = time(NULL);
399
400                                 flowFileName = flowFileVector[i];
401                                 
402                                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
403                                 m->mothurOut("Reading flowgrams...\n");
404                                 getFlowData();
405                                 
406                                 if (m->control_pressed) { break; }
407
408                                 m->mothurOut("Identifying unique flowgrams...\n");
409                                 getUniques();
410                                 
411                                 if (m->control_pressed) { break; }
412
413                                 m->mothurOut("Calculating distances between flowgrams...\n");
414                                 char fileName[1024];
415                                 strcpy(fileName, flowFileName.c_str());
416
417                                 for(int i=1;i<ncpus;i++){
418                                         MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
419
420                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
421                                         MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
422                                         MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
423                                         MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
424                                         MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
425                                         MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
426                                         MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
427                                         MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
428                                         MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
429                                         MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
430                                 }                               
431                                                         
432                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
433                                 
434                                 if (m->control_pressed) { break; }
435                                 
436                                 int done;
437                                 for(int i=1;i<ncpus;i++){
438                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
439                                         
440                                         m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
441                                         m->mothurRemove((distFileName + ".temp." + toString(i)));
442                                 }
443
444                                 string namesFileName = createNamesFile();
445                                 
446                                 if (m->control_pressed) { break; }
447                                 
448                                 m->mothurOut("\nClustering flowgrams...\n");
449                                 string listFileName = cluster(distFileName, namesFileName);
450                                 
451                                 if (m->control_pressed) { break; }
452                                 
453                 
454                 
455                                 getOTUData(listFileName);
456
457                                 m->mothurRemove(distFileName);
458                                 m->mothurRemove(namesFileName);
459                                 m->mothurRemove(listFileName);
460                                 
461                                 if (m->control_pressed) { break; }
462                                 
463                                 initPyroCluster();
464
465                                 if (m->control_pressed) { break; }
466                                 
467             
468                                 for(int i=1;i<ncpus;i++){
469                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
470                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
471                                         MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
472                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
473                                 }
474                                 
475                                 if (m->control_pressed) { break; }
476                                 
477                                 double maxDelta = 0;
478                                 int iter = 0;
479                                 
480                                 int numOTUsOnCPU = numOTUs / ncpus;
481                                 int numSeqsOnCPU = numSeqs / ncpus;
482                                 m->mothurOut("\nDenoising flowgrams...\n");
483                                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
484                                 
485                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
486                                         
487                                         double cycClock = clock();
488                                         unsigned long long cycTime = time(NULL);
489                                         fill();
490                                         
491                                         if (m->control_pressed) { break; }
492
493                                         int total = singleTau.size();
494                                         for(int i=1;i<ncpus;i++){
495                                                 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
496                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
497                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
498                                                 
499                                                 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
500                                                 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
501                                                 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
502                                                 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
503                                                 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
504                                         }
505                                         
506                                         calcCentroidsDriver(0, numOTUsOnCPU);
507                                         
508                                         for(int i=1;i<ncpus;i++){
509                                                 int otuStart = i * numOTUs / ncpus;
510                                                 int otuStop = (i + 1) * numOTUs / ncpus;
511                                                 
512                                                 vector<int> tempCentroids(numOTUs, 0);
513                                                 vector<short> tempChange(numOTUs, 0);
514                                                 
515                                                 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
516                                                 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
517                                                 
518                                                 for(int j=otuStart;j<otuStop;j++){
519                                                         centroids[j] = tempCentroids[j];
520                                                         change[j] = tempChange[j];
521                                                 }
522                                         }
523                                                                         
524                                         maxDelta = getNewWeights(); if (m->control_pressed) { break; }
525                                         double nLL = getLikelihood(); if (m->control_pressed) { break; }
526                                         checkCentroids(); if (m->control_pressed) { break; }
527                                         
528                                         for(int i=1;i<ncpus;i++){
529                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
530                                                 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
531                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
532                                         }
533                                         
534                                         calcNewDistancesParent(0, numSeqsOnCPU);
535
536                                         total = singleTau.size();
537
538                                         for(int i=1;i<ncpus;i++){
539                                                 int childTotal;
540                                                 int seqStart = i * numSeqs / ncpus;
541                                                 int seqStop = (i + 1) * numSeqs / ncpus;
542                                                 
543                                                 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
544
545                                                 vector<int> childSeqIndex(childTotal, 0);
546                                                 vector<double> childSingleTau(childTotal, 0);
547                                                 vector<double> childDist(numSeqs * numOTUs, 0);
548                                                 vector<int> otuIndex(childTotal, 0);
549                                                 
550                                                 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
551                                                 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
552                                                 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
553                                                 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
554
555                                                 int oldTotal = total;
556                                                 total += childTotal;
557                                                 singleTau.resize(total, 0);
558                                                 seqIndex.resize(total, 0);
559                                                 seqNumber.resize(total, 0);
560                                                 
561                                                 int childIndex = 0;
562                                                 
563                                                 for(int j=oldTotal;j<total;j++){
564                                                         int otuI = otuIndex[childIndex];
565                                                         int seqI = childSeqIndex[childIndex];
566                                                         
567                                                         singleTau[j] = childSingleTau[childIndex];
568                                                         
569                                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
570                                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
571                                                         nSeqsPerOTU[otuI]++;
572                                                         childIndex++;
573                                                 }
574                                                 
575                                                 int index = seqStart * numOTUs;
576                                                 for(int j=seqStart;j<seqStop;j++){
577                                                         for(int k=0;k<numOTUs;k++){
578                                                                 dist[index] = childDist[index];
579                                                                 index++;
580                                                         }
581                                                 }                                       
582                                         }
583                                         
584                                         iter++;
585                                         
586                                         m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');                  
587
588                                         if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
589                                                 int live = 1;
590                                                 for(int i=1;i<ncpus;i++){
591                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
592                                                 }
593                                         }
594                                         else{
595                                                 int live = 0;
596                                                 for(int i=1;i<ncpus;i++){
597                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
598                                                 }
599                                         }
600                                         
601                                 }       
602                                 
603                                 if (m->control_pressed) { break; }
604                                 
605                                 m->mothurOut("\nFinalizing...\n");
606                                 fill();
607                                 
608                                 if (m->control_pressed) { break; }
609                                 
610                                 setOTUs();
611                                 
612                                 vector<int> otuCounts(numOTUs, 0);
613                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
614                                 calcCentroidsDriver(0, numOTUs);
615                                 
616                                 if (m->control_pressed) { break; }
617                                 
618                                 writeQualities(otuCounts);      if (m->control_pressed) { break; }
619                                 writeSequences(otuCounts);      if (m->control_pressed) { break; }
620                                 writeNames(otuCounts);          if (m->control_pressed) { break; }
621                                 writeClusters(otuCounts);       if (m->control_pressed) { break; }
622                                 writeGroups();                          if (m->control_pressed) { break; }
623                                 
624                                                                  
625                                 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');                 
626                         }
627                 }
628                 else{
629                         int abort = 1;
630
631                         MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
632                         if(abort){      return 0;       }
633
634                         int numFiles;
635                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
636
637                         for(int i=0;i<numFiles;i++){
638                                 
639                                 if (m->control_pressed) { break; }
640                                 
641                                 //Now into the pyrodist part
642                                 bool live = 1;
643
644                                 char fileName[1024];
645                                 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
646                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
647                                 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
648                                 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
649                                 
650                                 flowDataIntI.resize(numSeqs * numFlowCells);
651                                 flowDataPrI.resize(numSeqs * numFlowCells);
652                                 mapUniqueToSeq.resize(numSeqs);
653                                 mapSeqToUnique.resize(numSeqs);
654                                 lengths.resize(numSeqs);
655                                 jointLookUp.resize(NUMBINS * NUMBINS);
656
657                                 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
658                                 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
659                                 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
660                                 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
661                                 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
662                                 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
663                                 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
664
665                                 flowFileName = string(fileName);
666                                 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
667                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
668                                 
669                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
670                                 
671                                 if (m->control_pressed) { break; }
672                                 
673                                 int done = 1;
674                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
675
676                                 //Now into the pyronoise part
677                                 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
678                                 
679                                 singleLookUp.resize(HOMOPS * NUMBINS);
680                                 uniqueFlowgrams.resize(numUniques * numFlowCells);
681                                 weight.resize(numOTUs);
682                                 centroids.resize(numOTUs);
683                                 change.resize(numOTUs);
684                                 dist.assign(numOTUs * numSeqs, 0);
685                                 nSeqsPerOTU.resize(numOTUs);
686                                 cumNumSeqs.resize(numOTUs);
687
688                                 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
689                                 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
690                                 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
691                                 
692                                 int startOTU = pid * numOTUs / ncpus;
693                                 int endOTU = (pid + 1) * numOTUs / ncpus;
694
695                                 int startSeq = pid * numSeqs / ncpus;
696                                 int endSeq = (pid + 1) * numSeqs /ncpus;
697                                 
698                                 int total;
699
700                                 while(live){
701                                         
702                                         if (m->control_pressed) { break; }
703                                         
704                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
705                                         singleTau.assign(total, 0.0000);
706                                         seqNumber.assign(total, 0);
707                                         seqIndex.assign(total, 0);
708
709                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
710                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
711                                         MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
712                                         MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
713                                         MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
714                                         MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
715                                         MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
716
717                                         calcCentroidsDriver(startOTU, endOTU);
718
719                                         MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
720                                         MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
721
722                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
723                                         MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
724                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
725
726                                         vector<int> otuIndex(total, 0);
727                                         calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
728                                         total = otuIndex.size();
729                                         
730                                         MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
731                                         MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
732                                         MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
733                                         MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
734                                         MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
735                                 
736                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
737                                 }
738                         }
739                 }
740                 
741                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
742                 
743                 MPI_Barrier(MPI_COMM_WORLD);
744
745                 
746                 if(compositeFASTAFileName != ""){
747                         outputNames.push_back(compositeFASTAFileName);
748                         outputNames.push_back(compositeNamesFileName);
749                 }
750
751                 m->mothurOutEndLine();
752                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
753                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
754                 m->mothurOutEndLine();
755                 
756                 return 0;
757
758         }
759         catch(exception& e) {
760                 m->errorOut(e, "ShhherCommand", "execute");
761                 exit(1);
762         }
763 }
764 /**************************************************************************************************/
765 string ShhherCommand::createNamesFile(){
766         try{
767                 
768                 vector<string> duplicateNames(numUniques, "");
769                 for(int i=0;i<numSeqs;i++){
770                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
771                 }
772                 
773                 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
774                 
775                 ofstream nameFile;
776                 m->openOutputFile(nameFileName, nameFile);
777                 
778                 for(int i=0;i<numUniques;i++){
779                         
780                         if (m->control_pressed) { break; }
781                         
782             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
783                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
784                 }
785                 
786                 nameFile.close();
787                 return  nameFileName;
788         }
789         catch(exception& e) {
790                 m->errorOut(e, "ShhherCommand", "createNamesFile");
791                 exit(1);
792         }
793 }
794 /**************************************************************************************************/
795
796 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
797         try{            
798                 ostringstream outStream;
799                 outStream.setf(ios::fixed, ios::floatfield);
800                 outStream.setf(ios::dec, ios::basefield);
801                 outStream.setf(ios::showpoint);
802                 outStream.precision(6);
803                 
804                 int begTime = time(NULL);
805                 double begClock = clock();
806                 
807                 for(int i=startSeq;i<stopSeq;i++){
808                         
809                         if (m->control_pressed) { break; }
810                         
811                         for(int j=0;j<i;j++){
812                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
813                                 
814                                 if(flowDistance < 1e-6){
815                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
816                                 }
817                                 else if(flowDistance <= cutoff){
818                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
819                                 }
820                         }
821                         if(i % 100 == 0){
822                                 m->mothurOut(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
823                         }
824                 }
825                 
826                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
827                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
828                 
829                 if (m->control_pressed) { return fDistFileName; }
830                 
831                 m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
832
833                 ofstream distFile(fDistFileName.c_str());
834                 distFile << outStream.str();            
835                 distFile.close();
836                 
837                 return fDistFileName;
838         }
839         catch(exception& e) {
840                 m->errorOut(e, "ShhherCommand", "flowDistMPI");
841                 exit(1);
842         }
843 }
844 /**************************************************************************************************/
845  
846 void ShhherCommand::getOTUData(string listFileName){
847     try {
848         
849         ifstream listFile;
850         m->openInputFile(listFileName, listFile);
851         string label;
852         
853         listFile >> label >> numOTUs;
854         
855         otuData.assign(numSeqs, 0);
856         cumNumSeqs.assign(numOTUs, 0);
857         nSeqsPerOTU.assign(numOTUs, 0);
858         aaP.clear();aaP.resize(numOTUs);
859         
860         seqNumber.clear();
861         aaI.clear();
862         seqIndex.clear();
863         
864         string singleOTU = "";
865         
866         for(int i=0;i<numOTUs;i++){
867             
868             if (m->control_pressed) { break; }
869             
870             listFile >> singleOTU;
871             
872             istringstream otuString(singleOTU);
873             
874             while(otuString){
875                 
876                 string seqName = "";
877                 
878                 for(int j=0;j<singleOTU.length();j++){
879                     char letter = otuString.get();
880                     
881                     if(letter != ','){
882                         seqName += letter;
883                     }
884                     else{
885                         map<string,int>::iterator nmIt = nameMap.find(seqName);
886                         int index = nmIt->second;
887                         
888                         nameMap.erase(nmIt);
889                         
890                         otuData[index] = i;
891                         nSeqsPerOTU[i]++;
892                         aaP[i].push_back(index);
893                         seqName = "";
894                     }
895                 }
896                 
897                 map<string,int>::iterator nmIt = nameMap.find(seqName);
898                 
899                 int index = nmIt->second;
900                 nameMap.erase(nmIt);
901                 
902                 otuData[index] = i;
903                 nSeqsPerOTU[i]++;
904                 aaP[i].push_back(index);        
905                 
906                 otuString.get();
907             }
908             
909             sort(aaP[i].begin(), aaP[i].end());
910             for(int j=0;j<nSeqsPerOTU[i];j++){
911                 seqNumber.push_back(aaP[i][j]);
912             }
913             for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
914                 aaP[i].push_back(0);
915             }
916             
917             
918         }
919         
920         for(int i=1;i<numOTUs;i++){
921             cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
922         }
923         aaI = aaP;
924         seqIndex = seqNumber;
925         
926         listFile.close();       
927         
928     }
929     catch(exception& e) {
930         m->errorOut(e, "ShhherCommand", "getOTUData");
931         exit(1);        
932     }           
933 }
934
935 /**************************************************************************************************/
936
937 void ShhherCommand::initPyroCluster(){                          
938     try{
939         if (numOTUs < processors) { processors = 1; }
940         
941         if (m->debug) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
942         
943         dist.assign(numSeqs * numOTUs, 0);
944         change.assign(numOTUs, 1);
945         centroids.assign(numOTUs, -1);
946         weight.assign(numOTUs, 0);
947         singleTau.assign(numSeqs, 1.0);
948         
949         nSeqsBreaks.assign(processors+1, 0);
950         nOTUsBreaks.assign(processors+1, 0);
951         
952         if (m->debug) { m->mothurOut("[DEBUG]: made it through the memory allocation.\n"); }
953         
954         nSeqsBreaks[0] = 0;
955         for(int i=0;i<processors;i++){
956             nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
957             nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
958         }
959         nSeqsBreaks[processors] = numSeqs;
960         nOTUsBreaks[processors] = numOTUs;
961     }
962     catch(exception& e) {
963         m->errorOut(e, "ShhherCommand", "initPyroCluster");
964         exit(1);        
965     }
966 }
967
968 /**************************************************************************************************/
969
970 void ShhherCommand::fill(){
971     try {
972         int index = 0;
973         for(int i=0;i<numOTUs;i++){
974             
975             if (m->control_pressed) { break; }
976             
977             cumNumSeqs[i] = index;
978             for(int j=0;j<nSeqsPerOTU[i];j++){
979                 seqNumber[index] = aaP[i][j];
980                 seqIndex[index] = aaI[i][j];
981                 
982                 index++;
983             }
984         }
985     }
986     catch(exception& e) {
987         m->errorOut(e, "ShhherCommand", "fill");
988         exit(1);        
989     }           
990 }
991
992 /**************************************************************************************************/
993  
994 void ShhherCommand::getFlowData(){
995     try{
996         ifstream flowFile;
997         m->openInputFile(flowFileName, flowFile);
998         
999         string seqName;
1000         seqNameVector.clear();
1001         lengths.clear();
1002         flowDataIntI.clear();
1003         nameMap.clear();
1004         
1005         
1006         int currentNumFlowCells;
1007         
1008         float intensity;
1009         
1010         flowFile >> numFlowCells;
1011         int index = 0;//pcluster
1012         while(!flowFile.eof()){
1013             
1014             if (m->control_pressed) { break; }
1015             
1016             flowFile >> seqName >> currentNumFlowCells;
1017             lengths.push_back(currentNumFlowCells);
1018             
1019             seqNameVector.push_back(seqName);
1020             nameMap[seqName] = index++;//pcluster
1021             
1022             for(int i=0;i<numFlowCells;i++){
1023                 flowFile >> intensity;
1024                 if(intensity > 9.99)    {       intensity = 9.99;       }
1025                 int intI = int(100 * intensity + 0.0001);
1026                 flowDataIntI.push_back(intI);
1027             }
1028             m->gobble(flowFile);
1029         }
1030         flowFile.close();
1031         
1032         numSeqs = seqNameVector.size();         
1033         
1034         for(int i=0;i<numSeqs;i++){
1035             
1036             if (m->control_pressed) { break; }
1037             
1038             int iNumFlowCells = i * numFlowCells;
1039             for(int j=lengths[i];j<numFlowCells;j++){
1040                 flowDataIntI[iNumFlowCells + j] = 0;
1041             }
1042         }
1043         
1044     }
1045     catch(exception& e) {
1046         m->errorOut(e, "ShhherCommand", "getFlowData");
1047         exit(1);
1048     }
1049 }
1050 /**************************************************************************************************/
1051 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1052         
1053         try{
1054                 vector<double> newTau(numOTUs,0);
1055                 vector<double> norms(numSeqs, 0);
1056                 otuIndex.clear();
1057                 seqIndex.clear();
1058                 singleTau.clear();
1059                 
1060                 for(int i=startSeq;i<stopSeq;i++){
1061                         
1062                         if (m->control_pressed) { break; }
1063                         
1064                         double offset = 1e8;
1065                         int indexOffset = i * numOTUs;
1066                         
1067                         for(int j=0;j<numOTUs;j++){
1068                                 
1069                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1070                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1071                                 }
1072                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1073                                         offset = dist[indexOffset + j];
1074                                 }
1075                         }
1076                         
1077                         for(int j=0;j<numOTUs;j++){
1078                                 if(weight[j] > MIN_WEIGHT){
1079                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1080                                         norms[i] += newTau[j];
1081                                 }
1082                                 else{
1083                                         newTau[j] = 0.0;
1084                                 }
1085                         }
1086                         
1087                         for(int j=0;j<numOTUs;j++){
1088                 
1089                                 newTau[j] /= norms[i];
1090                                 
1091                                 if(newTau[j] > MIN_TAU){
1092                                         otuIndex.push_back(j);
1093                                         seqIndex.push_back(i);
1094                                         singleTau.push_back(newTau[j]);
1095                                 }
1096                         }
1097                         
1098                 }
1099         }
1100         catch(exception& e) {
1101                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1102                 exit(1);        
1103         }               
1104 }
1105
1106 /**************************************************************************************************/
1107
1108 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1109     
1110     try{
1111         
1112         int total = 0;
1113         vector<double> newTau(numOTUs,0);
1114         vector<double> norms(numSeqs, 0);
1115         nSeqsPerOTU.assign(numOTUs, 0);
1116         
1117         for(int i=startSeq;i<stopSeq;i++){
1118             
1119             if (m->control_pressed) { break; }
1120             
1121             int indexOffset = i * numOTUs;
1122             
1123             double offset = 1e8;
1124             
1125             for(int j=0;j<numOTUs;j++){
1126                 
1127                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1128                     dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1129                 }
1130                 
1131                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1132                     offset = dist[indexOffset + j];
1133                 }
1134             }
1135             
1136             for(int j=0;j<numOTUs;j++){
1137                 if(weight[j] > MIN_WEIGHT){
1138                     newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1139                     norms[i] += newTau[j];
1140                 }
1141                 else{
1142                     newTau[j] = 0.0;
1143                 }
1144             }
1145             
1146             for(int j=0;j<numOTUs;j++){
1147                 newTau[j] /= norms[i];
1148             }
1149             
1150             for(int j=0;j<numOTUs;j++){
1151                 if(newTau[j] > MIN_TAU){
1152                     
1153                     int oldTotal = total;
1154                     
1155                     total++;
1156                     
1157                     singleTau.resize(total, 0);
1158                     seqNumber.resize(total, 0);
1159                     seqIndex.resize(total, 0);
1160                     
1161                     singleTau[oldTotal] = newTau[j];
1162                     
1163                     aaP[j][nSeqsPerOTU[j]] = oldTotal;
1164                     aaI[j][nSeqsPerOTU[j]] = i;
1165                     nSeqsPerOTU[j]++;
1166                 }
1167             }
1168             
1169         }
1170         
1171     }
1172     catch(exception& e) {
1173         m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1174         exit(1);        
1175     }           
1176 }
1177
1178 /**************************************************************************************************/
1179
1180 void ShhherCommand::setOTUs(){
1181     
1182     try {
1183         vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1184         
1185         for(int i=0;i<numOTUs;i++){
1186             
1187             if (m->control_pressed) { break; }
1188             
1189             for(int j=0;j<nSeqsPerOTU[i];j++){
1190                 int index = cumNumSeqs[i] + j;
1191                 double tauValue = singleTau[seqNumber[index]];
1192                 int sIndex = seqIndex[index];
1193                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
1194             }
1195         }
1196         
1197         for(int i=0;i<numSeqs;i++){
1198             double maxTau = -1.0000;
1199             int maxOTU = -1;
1200             for(int j=0;j<numOTUs;j++){
1201                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1202                     maxTau = bigTauMatrix[i * numOTUs + j];
1203                     maxOTU = j;
1204                 }
1205             }
1206             
1207             otuData[i] = maxOTU;
1208         }
1209         
1210         nSeqsPerOTU.assign(numOTUs, 0);         
1211         
1212         for(int i=0;i<numSeqs;i++){
1213             int index = otuData[i];
1214             
1215             singleTau[i] = 1.0000;
1216             dist[i] = 0.0000;
1217             
1218             aaP[index][nSeqsPerOTU[index]] = i;
1219             aaI[index][nSeqsPerOTU[index]] = i;
1220             
1221             nSeqsPerOTU[index]++;
1222         }
1223         fill(); 
1224     }
1225     catch(exception& e) {
1226         m->errorOut(e, "ShhherCommand", "setOTUs");
1227         exit(1);        
1228     }           
1229 }
1230
1231 /**************************************************************************************************/
1232  
1233 void ShhherCommand::getUniques(){
1234     try{
1235         
1236         
1237         numUniques = 0;
1238         uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
1239         uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
1240         uniqueLengths.assign(numSeqs, 0);
1241         mapSeqToUnique.assign(numSeqs, -1);
1242         mapUniqueToSeq.assign(numSeqs, -1);
1243         
1244         vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
1245         
1246         for(int i=0;i<numSeqs;i++){
1247             
1248             if (m->control_pressed) { break; }
1249             
1250             int index = 0;
1251             
1252             vector<short> current(numFlowCells);
1253             for(int j=0;j<numFlowCells;j++){
1254                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
1255             }
1256             
1257             for(int j=0;j<numUniques;j++){
1258                 int offset = j * numFlowCells;
1259                 bool toEnd = 1;
1260                 
1261                 int shorterLength;
1262                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
1263                 else                                                            {       shorterLength = uniqueLengths[j];       }
1264                 
1265                 for(int k=0;k<shorterLength;k++){
1266                     if(current[k] != uniqueFlowgrams[offset + k]){
1267                         toEnd = 0;
1268                         break;
1269                     }
1270                 }
1271                 
1272                 if(toEnd){
1273                     mapSeqToUnique[i] = j;
1274                     uniqueCount[j]++;
1275                     index = j;
1276                     if(lengths[i] > uniqueLengths[j])   {       uniqueLengths[j] = lengths[i];  }
1277                     break;
1278                 }
1279                 index++;
1280             }
1281             
1282             if(index == numUniques){
1283                 uniqueLengths[numUniques] = lengths[i];
1284                 uniqueCount[numUniques] = 1;
1285                 mapSeqToUnique[i] = numUniques;//anMap
1286                 mapUniqueToSeq[numUniques] = i;//anF
1287                 
1288                 for(int k=0;k<numFlowCells;k++){
1289                     uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
1290                     uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
1291                 }
1292                 
1293                 numUniques++;
1294             }
1295         }
1296         uniqueFlowDataIntI.resize(numFlowCells * numUniques);
1297         uniqueLengths.resize(numUniques);       
1298         
1299         flowDataPrI.resize(numSeqs * numFlowCells, 0);
1300         for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
1301     }
1302     catch(exception& e) {
1303         m->errorOut(e, "ShhherCommand", "getUniques");
1304         exit(1);
1305     }
1306 }
1307
1308 /**************************************************************************************************/
1309
1310 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
1311     try{
1312         int minLength = lengths[mapSeqToUnique[seqA]];
1313         if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
1314         
1315         int ANumFlowCells = seqA * numFlowCells;
1316         int BNumFlowCells = seqB * numFlowCells;
1317         
1318         float dist = 0;
1319         
1320         for(int i=0;i<minLength;i++){
1321             
1322             if (m->control_pressed) { break; }
1323             
1324             int flowAIntI = flowDataIntI[ANumFlowCells + i];
1325             float flowAPrI = flowDataPrI[ANumFlowCells + i];
1326             
1327             int flowBIntI = flowDataIntI[BNumFlowCells + i];
1328             float flowBPrI = flowDataPrI[BNumFlowCells + i];
1329             dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
1330         }
1331         
1332         dist /= (float) minLength;
1333         return dist;
1334     }
1335     catch(exception& e) {
1336         m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
1337         exit(1);
1338     }
1339 }
1340
1341 //**********************************************************************************************************************/
1342
1343 string ShhherCommand::cluster(string distFileName, string namesFileName){
1344     try {
1345         
1346         ReadMatrix* read = new ReadColumnMatrix(distFileName);  
1347         read->setCutoff(cutoff);
1348         
1349         NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1350         clusterNameMap->readMap();
1351         read->read(clusterNameMap);
1352         
1353         ListVector* list = read->getListVector();
1354         SparseMatrix* matrix = read->getMatrix();
1355         
1356         delete read; 
1357         delete clusterNameMap; 
1358         
1359         RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1360         
1361         Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
1362         string tag = cluster->getTag();
1363         
1364         double clusterCutoff = cutoff;
1365         while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1366             
1367             if (m->control_pressed) { break; }
1368             
1369             cluster->update(clusterCutoff);
1370         }
1371         
1372         list->setLabel(toString(cutoff));
1373         
1374         string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
1375         ofstream listFile;
1376         m->openOutputFile(listFileName, listFile);
1377         list->print(listFile);
1378         listFile.close();
1379         
1380         delete matrix;  delete cluster; delete rabund; delete list;
1381         
1382         return listFileName;
1383     }
1384     catch(exception& e) {
1385         m->errorOut(e, "ShhherCommand", "cluster");
1386         exit(1);        
1387     }           
1388 }
1389
1390 /**************************************************************************************************/
1391
1392 void ShhherCommand::calcCentroidsDriver(int start, int finish){                          
1393     
1394     //this function gets the most likely homopolymer length at a flow position for a group of sequences
1395     //within an otu
1396     
1397     try{
1398         
1399         for(int i=start;i<finish;i++){
1400             
1401             if (m->control_pressed) { break; }
1402             
1403             double count = 0;
1404             int position = 0;
1405             int minFlowGram = 100000000;
1406             double minFlowValue = 1e8;
1407             change[i] = 0; //FALSE
1408             
1409             for(int j=0;j<nSeqsPerOTU[i];j++){
1410                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1411             }
1412             
1413             if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1414                 vector<double> adF(nSeqsPerOTU[i]);
1415                 vector<int> anL(nSeqsPerOTU[i]);
1416                 
1417                 for(int j=0;j<nSeqsPerOTU[i];j++){
1418                     int index = cumNumSeqs[i] + j;
1419                     int nI = seqIndex[index];
1420                     int nIU = mapSeqToUnique[nI];
1421                     
1422                     int k;
1423                     for(k=0;k<position;k++){
1424                         if(nIU == anL[k]){
1425                             break;
1426                         }
1427                     }
1428                     if(k == position){
1429                         anL[position] = nIU;
1430                         adF[position] = 0.0000;
1431                         position++;
1432                     }                                           
1433                 }
1434                 
1435                 for(int j=0;j<nSeqsPerOTU[i];j++){
1436                     int index = cumNumSeqs[i] + j;
1437                     int nI = seqIndex[index];
1438                     
1439                     double tauValue = singleTau[seqNumber[index]];
1440                     
1441                     for(int k=0;k<position;k++){
1442                         double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1443                         adF[k] += dist * tauValue;
1444                     }
1445                 }
1446                 
1447                 for(int j=0;j<position;j++){
1448                     if(adF[j] < minFlowValue){
1449                         minFlowGram = j;
1450                         minFlowValue = adF[j];
1451                     }
1452                 }
1453                 
1454                 if(centroids[i] != anL[minFlowGram]){
1455                     change[i] = 1;
1456                     centroids[i] = anL[minFlowGram];
1457                 }
1458             }
1459             else if(centroids[i] != -1){
1460                 change[i] = 1;
1461                 centroids[i] = -1;                      
1462             }
1463         }
1464     }
1465     catch(exception& e) {
1466         m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1467         exit(1);        
1468     }           
1469 }
1470
1471 /**************************************************************************************************/
1472
1473 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1474     try{
1475         
1476         int flowAValue = cent * numFlowCells;
1477         int flowBValue = flow * numFlowCells;
1478         
1479         double dist = 0;
1480         
1481         for(int i=0;i<length;i++){
1482             dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1483             flowAValue++;
1484             flowBValue++;
1485         }
1486         
1487         return dist / (double)length;
1488     }
1489     catch(exception& e) {
1490         m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1491         exit(1);        
1492     }           
1493 }
1494
1495 /**************************************************************************************************/
1496
1497 double ShhherCommand::getNewWeights(){
1498     try{
1499         
1500         double maxChange = 0;
1501         
1502         for(int i=0;i<numOTUs;i++){
1503             
1504             if (m->control_pressed) { break; }
1505             
1506             double difference = weight[i];
1507             weight[i] = 0;
1508             
1509             for(int j=0;j<nSeqsPerOTU[i];j++){
1510                 int index = cumNumSeqs[i] + j;
1511                 double tauValue = singleTau[seqNumber[index]];
1512                 weight[i] += tauValue;
1513             }
1514             
1515             difference = fabs(weight[i] - difference);
1516             if(difference > maxChange){ maxChange = difference; }
1517         }
1518         return maxChange;
1519     }
1520     catch(exception& e) {
1521         m->errorOut(e, "ShhherCommand", "getNewWeights");
1522         exit(1);        
1523     }           
1524 }
1525  
1526  /**************************************************************************************************/
1527  
1528 double ShhherCommand::getLikelihood(){
1529     
1530     try{
1531         
1532         vector<long double> P(numSeqs, 0);
1533         int effNumOTUs = 0;
1534         
1535         for(int i=0;i<numOTUs;i++){
1536             if(weight[i] > MIN_WEIGHT){
1537                 effNumOTUs++;
1538             }
1539         }
1540         
1541         string hold;
1542         for(int i=0;i<numOTUs;i++){
1543             
1544             if (m->control_pressed) { break; }
1545             
1546             for(int j=0;j<nSeqsPerOTU[i];j++){
1547                 int index = cumNumSeqs[i] + j;
1548                 int nI = seqIndex[index];
1549                 double singleDist = dist[seqNumber[index]];
1550                 
1551                 P[nI] += weight[i] * exp(-singleDist * sigma);
1552             }
1553         }
1554         double nLL = 0.00;
1555         for(int i=0;i<numSeqs;i++){
1556             if(P[i] == 0){      P[i] = DBL_EPSILON;     }
1557             
1558             nLL += -log(P[i]);
1559         }
1560         
1561         nLL = nLL -(double)numSeqs * log(sigma);
1562         
1563         return nLL; 
1564     }
1565     catch(exception& e) {
1566         m->errorOut(e, "ShhherCommand", "getNewWeights");
1567         exit(1);        
1568     }           
1569 }
1570
1571 /**************************************************************************************************/
1572
1573 void ShhherCommand::checkCentroids(){
1574     try{
1575         vector<int> unique(numOTUs, 1);
1576         
1577         for(int i=0;i<numOTUs;i++){
1578             if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1579                 unique[i] = -1;
1580             }
1581         }
1582         
1583         for(int i=0;i<numOTUs;i++){
1584             
1585             if (m->control_pressed) { break; }
1586             
1587             if(unique[i] == 1){
1588                 for(int j=i+1;j<numOTUs;j++){
1589                     if(unique[j] == 1){
1590                         
1591                         if(centroids[j] == centroids[i]){
1592                             unique[j] = 0;
1593                             centroids[j] = -1;
1594                             
1595                             weight[i] += weight[j];
1596                             weight[j] = 0.0;
1597                         }
1598                     }
1599                 }
1600             }
1601         }
1602     }
1603     catch(exception& e) {
1604         m->errorOut(e, "ShhherCommand", "checkCentroids");
1605         exit(1);        
1606     }           
1607 }
1608  /**************************************************************************************************/
1609
1610
1611  
1612 void ShhherCommand::writeQualities(vector<int> otuCounts){
1613     
1614     try {
1615         string thisOutputDir = outputDir;
1616         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1617         string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.qual";
1618         
1619         ofstream qualityFile;
1620         m->openOutputFile(qualityFileName, qualityFile);
1621         
1622         qualityFile.setf(ios::fixed, ios::floatfield);
1623         qualityFile.setf(ios::showpoint);
1624         qualityFile << setprecision(6);
1625         
1626         vector<vector<int> > qualities(numOTUs);
1627         vector<double> pr(HOMOPS, 0);
1628         
1629         
1630         for(int i=0;i<numOTUs;i++){
1631             
1632             if (m->control_pressed) { break; }
1633             
1634             int index = 0;
1635             int base = 0;
1636             
1637             if(nSeqsPerOTU[i] > 0){
1638                 qualities[i].assign(1024, -1);
1639                 
1640                 while(index < numFlowCells){
1641                     double maxPrValue = 1e8;
1642                     short maxPrIndex = -1;
1643                     double count = 0.0000;
1644                     
1645                     pr.assign(HOMOPS, 0);
1646                     
1647                     for(int j=0;j<nSeqsPerOTU[i];j++){
1648                         int lIndex = cumNumSeqs[i] + j;
1649                         double tauValue = singleTau[seqNumber[lIndex]];
1650                         int sequenceIndex = aaI[i][j];
1651                         short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1652                         
1653                         count += tauValue;
1654                         
1655                         for(int s=0;s<HOMOPS;s++){
1656                             pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1657                         }
1658                     }
1659                     
1660                     maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1661                     maxPrValue = pr[maxPrIndex];
1662                     
1663                     if(count > MIN_COUNT){
1664                         double U = 0.0000;
1665                         double norm = 0.0000;
1666                         
1667                         for(int s=0;s<HOMOPS;s++){
1668                             norm += exp(-(pr[s] - maxPrValue));
1669                         }
1670                         
1671                         for(int s=1;s<=maxPrIndex;s++){
1672                             int value = 0;
1673                             double temp = 0.0000;
1674                             
1675                             U += exp(-(pr[s-1]-maxPrValue))/norm;
1676                             
1677                             if(U>0.00){
1678                                 temp = log10(U);
1679                             }
1680                             else{
1681                                 temp = -10.1;
1682                             }
1683                             temp = floor(-10 * temp);
1684                             value = (int)floor(temp);
1685                             if(value > 100){    value = 100;    }
1686                             
1687                             qualities[i][base] = (int)value;
1688                             base++;
1689                         }
1690                     }
1691                     
1692                     index++;
1693                 }
1694             }
1695             
1696             
1697             if(otuCounts[i] > 0){
1698                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1699                 
1700                 int j=4;        //need to get past the first four bases
1701                 while(qualities[i][j] != -1){
1702                     qualityFile << qualities[i][j] << ' ';
1703                     j++;
1704                 }
1705                 qualityFile << endl;
1706             }
1707         }
1708         qualityFile.close();
1709         outputNames.push_back(qualityFileName);
1710         
1711     }
1712     catch(exception& e) {
1713         m->errorOut(e, "ShhherCommand", "writeQualities");
1714         exit(1);        
1715     }           
1716 }
1717
1718 /**************************************************************************************************/
1719
1720 void ShhherCommand::writeSequences(vector<int> otuCounts){
1721     try {
1722         string thisOutputDir = outputDir;
1723         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1724         string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.fasta";
1725         ofstream fastaFile;
1726         m->openOutputFile(fastaFileName, fastaFile);
1727         
1728         vector<string> names(numOTUs, "");
1729         
1730         for(int i=0;i<numOTUs;i++){
1731             
1732             if (m->control_pressed) { break; }
1733             
1734             int index = centroids[i];
1735             
1736             if(otuCounts[i] > 0){
1737                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1738                 
1739                 string newSeq = "";
1740                 
1741                 for(int j=0;j<numFlowCells;j++){
1742                     
1743                     char base = flowOrder[j % 4];
1744                     for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1745                         newSeq += base;
1746                     }
1747                 }
1748                 
1749                 fastaFile << newSeq.substr(4) << endl;
1750             }
1751         }
1752         fastaFile.close();
1753         
1754         outputNames.push_back(fastaFileName);
1755         
1756         if(compositeFASTAFileName != ""){
1757             m->appendFiles(fastaFileName, compositeFASTAFileName);
1758         }
1759     }
1760     catch(exception& e) {
1761         m->errorOut(e, "ShhherCommand", "writeSequences");
1762         exit(1);        
1763     }           
1764 }
1765
1766 /**************************************************************************************************/
1767
1768 void ShhherCommand::writeNames(vector<int> otuCounts){
1769     try {
1770         string thisOutputDir = outputDir;
1771         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1772         string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.names";
1773         ofstream nameFile;
1774         m->openOutputFile(nameFileName, nameFile);
1775         
1776         for(int i=0;i<numOTUs;i++){
1777             
1778             if (m->control_pressed) { break; }
1779             
1780             if(otuCounts[i] > 0){
1781                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1782                 
1783                 for(int j=1;j<nSeqsPerOTU[i];j++){
1784                     nameFile << ',' << seqNameVector[aaI[i][j]];
1785                 }
1786                 
1787                 nameFile << endl;
1788             }
1789         }
1790         nameFile.close();
1791         outputNames.push_back(nameFileName);
1792         
1793         
1794         if(compositeNamesFileName != ""){
1795             m->appendFiles(nameFileName, compositeNamesFileName);
1796         }               
1797     }
1798     catch(exception& e) {
1799         m->errorOut(e, "ShhherCommand", "writeNames");
1800         exit(1);        
1801     }           
1802 }
1803
1804 /**************************************************************************************************/
1805
1806 void ShhherCommand::writeGroups(){
1807     try {
1808         string thisOutputDir = outputDir;
1809         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1810         string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1811         string groupFileName = fileRoot + "shhh.groups";
1812         ofstream groupFile;
1813         m->openOutputFile(groupFileName, groupFile);
1814         
1815         for(int i=0;i<numSeqs;i++){
1816             if (m->control_pressed) { break; }
1817             groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
1818         }
1819         groupFile.close();
1820         outputNames.push_back(groupFileName);
1821         
1822     }
1823     catch(exception& e) {
1824         m->errorOut(e, "ShhherCommand", "writeGroups");
1825         exit(1);        
1826     }           
1827 }
1828
1829 /**************************************************************************************************/
1830
1831 void ShhherCommand::writeClusters(vector<int> otuCounts){
1832     try {
1833         string thisOutputDir = outputDir;
1834         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1835         string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.counts";
1836         ofstream otuCountsFile;
1837         m->openOutputFile(otuCountsFileName, otuCountsFile);
1838         
1839         string bases = flowOrder;
1840         
1841         for(int i=0;i<numOTUs;i++){
1842             
1843             if (m->control_pressed) {
1844                 break;
1845             }
1846             //output the translated version of the centroid sequence for the otu
1847             if(otuCounts[i] > 0){
1848                 int index = centroids[i];
1849                 
1850                 otuCountsFile << "ideal\t";
1851                 for(int j=8;j<numFlowCells;j++){
1852                     char base = bases[j % 4];
1853                     for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1854                         otuCountsFile << base;
1855                     }
1856                 }
1857                 otuCountsFile << endl;
1858                 
1859                 for(int j=0;j<nSeqsPerOTU[i];j++){
1860                     int sequence = aaI[i][j];
1861                     otuCountsFile << seqNameVector[sequence] << '\t';
1862                     
1863                     string newSeq = "";
1864                     
1865                     for(int k=0;k<lengths[sequence];k++){
1866                         char base = bases[k % 4];
1867                         int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1868                         
1869                         for(int s=0;s<freq;s++){
1870                             newSeq += base;
1871                             //otuCountsFile << base;
1872                         }
1873                     }
1874                     otuCountsFile << newSeq.substr(4) << endl;
1875                 }
1876                 otuCountsFile << endl;
1877             }
1878         }
1879         otuCountsFile.close();
1880         outputNames.push_back(otuCountsFileName);
1881         
1882     }
1883     catch(exception& e) {
1884         m->errorOut(e, "ShhherCommand", "writeClusters");
1885         exit(1);        
1886     }           
1887 }
1888
1889 #else
1890 //**********************************************************************************************************************
1891
1892 int ShhherCommand::execute(){
1893         try {
1894                 if (abort == true) { return 0; }
1895                 
1896                 getSingleLookUp();      if (m->control_pressed) { return 0; }
1897                 getJointLookUp();       if (m->control_pressed) { return 0; }
1898                 
1899         int numFiles = flowFileVector.size();
1900                 
1901         if (numFiles < processors) { processors = numFiles; }
1902         
1903 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1904         if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size()); }
1905         else { createProcesses(flowFileVector); } //each processor processes one file
1906 #else
1907         driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size());
1908 #endif
1909         
1910                 if(compositeFASTAFileName != ""){
1911                         outputNames.push_back(compositeFASTAFileName);
1912                         outputNames.push_back(compositeNamesFileName);
1913                 }
1914
1915                 m->mothurOutEndLine();
1916                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
1917                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
1918                 m->mothurOutEndLine();
1919                 
1920                 return 0;
1921         }
1922         catch(exception& e) {
1923                 m->errorOut(e, "ShhherCommand", "execute");
1924                 exit(1);
1925         }
1926 }
1927 #endif
1928 /**************************************************************************************************/
1929
1930 int ShhherCommand::createProcesses(vector<string> filenames){
1931     try {
1932         vector<int> processIDS;
1933                 int process = 1;
1934                 int num = 0;
1935                 
1936                 //sanity check
1937                 if (filenames.size() < processors) { processors = filenames.size(); }
1938                 
1939                 //divide the groups between the processors
1940                 vector<linePair> lines;
1941         vector<int> numFilesToComplete;
1942                 int numFilesPerProcessor = filenames.size() / processors;
1943                 for (int i = 0; i < processors; i++) {
1944                         int startIndex =  i * numFilesPerProcessor;
1945                         int endIndex = (i+1) * numFilesPerProcessor;
1946                         if(i == (processors - 1)){      endIndex = filenames.size();    }
1947                         lines.push_back(linePair(startIndex, endIndex));
1948             numFilesToComplete.push_back((endIndex-startIndex));
1949                 }
1950                 
1951         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)          
1952                 
1953                 //loop through and create all the processes you want
1954                 while (process != processors) {
1955                         int pid = fork();
1956                         
1957                         if (pid > 0) {
1958                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1959                                 process++;
1960                         }else if (pid == 0){
1961                                 num = driver(filenames, compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName  + toString(getpid()) + ".temp", lines[process].start, lines[process].end);
1962                 
1963                 //pass numSeqs to parent
1964                                 ofstream out;
1965                                 string tempFile = compositeFASTAFileName + toString(getpid()) + ".num.temp";
1966                                 m->openOutputFile(tempFile, out);
1967                                 out << num << endl;
1968                                 out.close();
1969                 
1970                                 exit(0);
1971                         }else { 
1972                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
1973                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1974                                 exit(0);
1975                         }
1976                 }
1977                 
1978                 //do my part
1979                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[0].start, lines[0].end);
1980                 
1981                 //force parent to wait until all the processes are done
1982                 for (int i=0;i<processIDS.size();i++) { 
1983                         int temp = processIDS[i];
1984                         wait(&temp);
1985                 }
1986         
1987         #else
1988         
1989         //////////////////////////////////////////////////////////////////////////////////////////////////////
1990         
1991         /////////////////////// NOT WORKING, ACCESS VIOLATION ON READ OF FLOWGRAMS IN THREAD /////////////////
1992         
1993         //////////////////////////////////////////////////////////////////////////////////////////////////////
1994                 //Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct. 
1995                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
1996                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1997                 
1998                 vector<shhhFlowsData*> pDataArray; 
1999                 DWORD   dwThreadIdArray[processors-1];
2000                 HANDLE  hThreadArray[processors-1]; 
2001                 
2002                 //Create processor worker threads.
2003                 for( int i=0; i<processors-1; i++ ){
2004                         // Allocate memory for thread data.
2005                         string extension = "";
2006                         if (i != 0) { extension = toString(i) + ".temp"; }
2007                         
2008             shhhFlowsData* tempFlow = new shhhFlowsData(filenames, (compositeFASTAFileName + extension), (compositeNamesFileName + extension), outputDir, flowOrder, jointLookUp, singleLookUp, m, lines[i].start, lines[i].end, cutoff, sigma, minDelta, maxIters, i);
2009                         pDataArray.push_back(tempFlow);
2010                         processIDS.push_back(i);
2011             
2012                         hThreadArray[i] = CreateThread(NULL, 0, ShhhFlowsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
2013                 }
2014                 
2015                 //using the main process as a worker saves time and memory
2016                 //do my part
2017                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[processors-1].start, lines[processors-1].end);
2018                 
2019                 //Wait until all threads have terminated.
2020                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
2021                 
2022                 //Close all thread handles and free memory allocations.
2023                 for(int i=0; i < pDataArray.size(); i++){
2024                         for(int j=0; j < pDataArray[i]->outputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); }
2025                         CloseHandle(hThreadArray[i]);
2026                         delete pDataArray[i];
2027                 }
2028                 
2029         #endif
2030         
2031         for (int i=0;i<processIDS.size();i++) { 
2032             ifstream in;
2033                         string tempFile =  compositeFASTAFileName + toString(processIDS[i]) + ".num.temp";
2034                         m->openInputFile(tempFile, in);
2035                         if (!in.eof()) { 
2036                 int tempNum = 0; 
2037                 in >> tempNum; 
2038                 if (tempNum != numFilesToComplete[i+1]) {
2039                     m->mothurOut("[ERROR]: main process expected " + toString(processIDS[i]) + " to complete " + toString(numFilesToComplete[i+1]) + " files, and it only reported completing " + toString(tempNum) + ". This will cause file mismatches.  The flow files may be too large to process with multiple processors. \n");
2040                 }
2041             }
2042                         in.close(); m->mothurRemove(tempFile);
2043             
2044             if (compositeFASTAFileName != "") {
2045                 m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
2046                 m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
2047                 m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp"));
2048                 m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp"));
2049             }
2050         }
2051         
2052         return 0;
2053         
2054     }
2055         catch(exception& e) {
2056                 m->errorOut(e, "ShhherCommand", "createProcesses");
2057                 exit(1);
2058         }
2059 }
2060 /**************************************************************************************************/
2061
2062 vector<string> ShhherCommand::parseFlowFiles(string filename){
2063     try {
2064         vector<string> files;
2065         int count = 0;
2066         
2067         ifstream in;
2068         m->openInputFile(filename, in);
2069         
2070         int thisNumFLows = 0;
2071         in >> thisNumFLows; m->gobble(in);
2072         
2073         while (!in.eof()) {
2074             if (m->control_pressed) { break; }
2075             
2076             ofstream out;
2077             string outputFileName = filename + toString(count) + ".temp";
2078             m->openOutputFile(outputFileName, out);
2079             out << thisNumFLows << endl;
2080             files.push_back(outputFileName);
2081             
2082             int numLinesWrote = 0;
2083             for (int i = 0; i < largeSize; i++) {
2084                 if (in.eof()) { break; }
2085                 string line = m->getline(in); m->gobble(in);
2086                 out << line << endl;
2087                 numLinesWrote++;
2088             }
2089             out.close();
2090             
2091             if (numLinesWrote == 0) {  m->mothurRemove(outputFileName); files.pop_back();  }
2092             count++;
2093         }
2094         in.close();
2095         
2096         if (m->control_pressed) { for (int i = 0; i < files.size(); i++) { m->mothurRemove(files[i]); }  files.clear(); }
2097         
2098         m->mothurOut("\nDivided " + filename + " into " + toString(files.size()) + " files.\n\n"); 
2099         
2100         return files;
2101     }
2102         catch(exception& e) {
2103                 m->errorOut(e, "ShhherCommand", "parseFlowFiles");
2104                 exit(1);
2105         }
2106 }
2107 /**************************************************************************************************/
2108
2109 int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName, int start, int end){
2110     try {
2111         
2112         int numCompleted = 0;
2113         
2114         for(int i=start;i<end;i++){
2115                         
2116                         if (m->control_pressed) { break; }
2117                         
2118             vector<string> theseFlowFileNames; theseFlowFileNames.push_back(filenames[i]);
2119             if (large) {  theseFlowFileNames = parseFlowFiles(filenames[i]);  }
2120             
2121             if (m->control_pressed) { break; }
2122             
2123             double begClock = clock();
2124             unsigned long long begTime;
2125             
2126             for (int g = 0; g < theseFlowFileNames.size(); g++) {
2127                 
2128                 string flowFileName = theseFlowFileNames[g];
2129                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
2130                 m->mothurOut("Reading flowgrams...\n");
2131                 
2132                 vector<string> seqNameVector;
2133                 vector<int> lengths;
2134                 vector<short> flowDataIntI;
2135                 vector<double> flowDataPrI;
2136                 map<string, int> nameMap;
2137                 vector<short> uniqueFlowgrams;
2138                 vector<int> uniqueCount;
2139                 vector<int> mapSeqToUnique;
2140                 vector<int> mapUniqueToSeq;
2141                 vector<int> uniqueLengths;
2142                 int numFlowCells;
2143                 
2144                 if (m->debug) { m->mothurOut("[DEBUG]: About to read flowgrams.\n"); }
2145                 int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
2146                 
2147                 if (m->control_pressed) { break; }
2148                 
2149                 m->mothurOut("Identifying unique flowgrams...\n");
2150                 int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
2151                 
2152                 if (m->control_pressed) { break; }
2153                 
2154                 m->mothurOut("Calculating distances between flowgrams...\n");
2155                 string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
2156                 begTime = time(NULL);
2157                
2158                 
2159                 flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI); 
2160                 
2161                 m->mothurOutEndLine();
2162                 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
2163                 
2164                 
2165                 string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
2166                 createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
2167                 
2168                 if (m->control_pressed) { break; }
2169                 
2170                 m->mothurOut("\nClustering flowgrams...\n");
2171                 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
2172                 cluster(listFileName, distFileName, namesFileName);
2173                 
2174                 if (m->control_pressed) { break; }
2175                 
2176                 vector<int> otuData;
2177                 vector<int> cumNumSeqs;
2178                 vector<int> nSeqsPerOTU;
2179                 vector<vector<int> > aaP;       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2180                 vector<vector<int> > aaI;       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2181                 vector<int> seqNumber;          //tMaster->anP:         the sequence id number sorted by OTU
2182                 vector<int> seqIndex;           //tMaster->anI;         the index that corresponds to seqNumber
2183                 
2184                 
2185                 int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
2186                 
2187                 if (m->control_pressed) { break; }
2188                 
2189                 m->mothurRemove(distFileName);
2190                 m->mothurRemove(namesFileName);
2191                 m->mothurRemove(listFileName);
2192                 
2193                 vector<double> dist;            //adDist - distance of sequences to centroids
2194                 vector<short> change;           //did the centroid sequence change? 0 = no; 1 = yes
2195                 vector<int> centroids;          //the representative flowgram for each cluster m
2196                 vector<double> weight;
2197                 vector<double> singleTau;       //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2198                 vector<int> nSeqsBreaks;
2199                 vector<int> nOTUsBreaks;
2200                 
2201                 if (m->debug) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
2202                 
2203                 dist.assign(numSeqs * numOTUs, 0);
2204                 change.assign(numOTUs, 1);
2205                 centroids.assign(numOTUs, -1);
2206                 weight.assign(numOTUs, 0);
2207                 singleTau.assign(numSeqs, 1.0);
2208                 
2209                 nSeqsBreaks.assign(2, 0);
2210                 nOTUsBreaks.assign(2, 0);
2211                 
2212                 nSeqsBreaks[0] = 0;
2213                 nSeqsBreaks[1] = numSeqs;
2214                 nOTUsBreaks[1] = numOTUs;
2215                 
2216                 if (m->debug) { m->mothurOut("[DEBUG]: done allocating memory, about to denoise.\n"); }
2217                 
2218                 if (m->control_pressed) { break; }
2219                 
2220                 double maxDelta = 0;
2221                 int iter = 0;
2222                 
2223                 begClock = clock();
2224                 begTime = time(NULL);
2225                 
2226                 m->mothurOut("\nDenoising flowgrams...\n");
2227                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
2228                 
2229                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
2230                     
2231                     if (m->control_pressed) { break; }
2232                     
2233                     double cycClock = clock();
2234                     unsigned long long cycTime = time(NULL);
2235                     fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2236                     
2237                     if (m->control_pressed) { break; }
2238                     
2239                     calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2240                     
2241                     if (m->control_pressed) { break; }
2242                     
2243                     maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);  
2244                     
2245                     if (m->control_pressed) { break; }
2246                     
2247                     double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight); 
2248                     
2249                     if (m->control_pressed) { break; }
2250                     
2251                     checkCentroids(numOTUs, centroids, weight);
2252                     
2253                     if (m->control_pressed) { break; }
2254                     
2255                     calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU,  dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
2256                     
2257                     if (m->control_pressed) { break; }
2258                     
2259                     iter++;
2260                     
2261                     m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
2262                     
2263                 }       
2264                 
2265                 if (m->control_pressed) { break; }
2266                 
2267                 m->mothurOut("\nFinalizing...\n");
2268                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2269                 
2270                 if (m->control_pressed) { break; }
2271                 
2272                 setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
2273                 
2274                 if (m->control_pressed) { break; }
2275                 
2276                 vector<int> otuCounts(numOTUs, 0);
2277                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
2278                 
2279                 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber); 
2280                 
2281                 if (m->control_pressed) { break; }
2282                 
2283                 if ((large) && (g == 0)) {  flowFileName = filenames[i]; theseFlowFileNames[0] = filenames[i]; }
2284                 string thisOutputDir = outputDir;
2285                 if (outputDir == "") {  thisOutputDir = m->hasPath(flowFileName);  }
2286                 string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.qual";
2287                 string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.fasta";
2288                 string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.names";
2289                 string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.counts";
2290                 string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
2291                 string groupFileName = fileRoot + "shhh.groups";
2292
2293                 
2294                 writeQualities(numOTUs, numFlowCells, qualityFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
2295                 writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, fastaFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
2296                 writeNames(thisCompositeNamesFileName, numOTUs, nameFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);                              if (m->control_pressed) { break; }
2297                 writeClusters(otuCountsFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);                  if (m->control_pressed) { break; }
2298                 writeGroups(groupFileName, fileRoot, numSeqs, seqNameVector);                                           if (m->control_pressed) { break; }
2299                 
2300                 if (large) {
2301                     if (g > 0) {
2302                         m->appendFiles(qualityFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.qual"));
2303                         m->mothurRemove(qualityFileName);
2304                         m->appendFiles(fastaFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.fasta"));
2305                         m->mothurRemove(fastaFileName);
2306                         m->appendFiles(nameFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.names"));
2307                         m->mothurRemove(nameFileName);
2308                         m->appendFiles(otuCountsFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.counts"));
2309                         m->mothurRemove(otuCountsFileName);
2310                         m->appendFiles(groupFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.groups"));
2311                         m->mothurRemove(groupFileName);
2312                     }
2313                     m->mothurRemove(theseFlowFileNames[g]);
2314                 }
2315                         }
2316             
2317             numCompleted++;
2318                         m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
2319                 }
2320                 
2321         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
2322         
2323         return numCompleted;
2324         
2325     }catch(exception& e) {
2326             m->errorOut(e, "ShhherCommand", "driver");
2327             exit(1);
2328     }
2329 }
2330
2331 /**************************************************************************************************/
2332 int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
2333         try{
2334        
2335                 ifstream flowFile;
2336        
2337                 m->openInputFile(filename, flowFile);
2338                 
2339                 string seqName;
2340                 int currentNumFlowCells;
2341                 float intensity;
2342         thisSeqNameVector.clear();
2343                 thisLengths.clear();
2344                 thisFlowDataIntI.clear();
2345                 thisNameMap.clear();
2346                 
2347                 flowFile >> numFlowCells;
2348         if (m->debug) { m->mothurOut("[DEBUG]: numFlowCells = " + toString(numFlowCells) + ".\n"); }
2349                 int index = 0;//pcluster
2350                 while(!flowFile.eof()){
2351                         
2352                         if (m->control_pressed) { break; }
2353                         
2354                         flowFile >> seqName >> currentNumFlowCells;
2355             
2356                         thisLengths.push_back(currentNumFlowCells);
2357            
2358                         thisSeqNameVector.push_back(seqName);
2359                         thisNameMap[seqName] = index++;//pcluster
2360             
2361             if (m->debug) { m->mothurOut("[DEBUG]: seqName = " + seqName + " length = " + toString(currentNumFlowCells) + " index = " + toString(index) + "\n"); }
2362             
2363                         for(int i=0;i<numFlowCells;i++){
2364                                 flowFile >> intensity;
2365                                 if(intensity > 9.99)    {       intensity = 9.99;       }
2366                                 int intI = int(100 * intensity + 0.0001);
2367                                 thisFlowDataIntI.push_back(intI);
2368                         }
2369                         m->gobble(flowFile);
2370                 }
2371                 flowFile.close();
2372                 
2373                 int numSeqs = thisSeqNameVector.size();         
2374                 
2375                 for(int i=0;i<numSeqs;i++){
2376                         
2377                         if (m->control_pressed) { break; }
2378                         
2379                         int iNumFlowCells = i * numFlowCells;
2380                         for(int j=thisLengths[i];j<numFlowCells;j++){
2381                                 thisFlowDataIntI[iNumFlowCells + j] = 0;
2382                         }
2383                 }
2384         
2385         return numSeqs;
2386                 
2387         }
2388         catch(exception& e) {
2389                 m->errorOut(e, "ShhherCommand", "getFlowData");
2390                 exit(1);
2391         }
2392 }
2393 /**************************************************************************************************/
2394
2395 int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2396         try{            
2397         
2398                 ostringstream outStream;
2399                 outStream.setf(ios::fixed, ios::floatfield);
2400                 outStream.setf(ios::dec, ios::basefield);
2401                 outStream.setf(ios::showpoint);
2402                 outStream.precision(6);
2403                 
2404                 int begTime = time(NULL);
2405                 double begClock = clock();
2406         
2407                 for(int i=0;i<stopSeq;i++){
2408                         
2409                         if (m->control_pressed) { break; }
2410                         
2411                         for(int j=0;j<i;j++){
2412                                 float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2413                 
2414                                 if(flowDistance < 1e-6){
2415                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
2416                                 }
2417                                 else if(flowDistance <= cutoff){
2418                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
2419                                 }
2420                         }
2421                         if(i % 100 == 0){
2422                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
2423                                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2424                                 m->mothurOutEndLine();
2425                         }
2426                 }
2427                 
2428                 ofstream distFile(distFileName.c_str());
2429                 distFile << outStream.str();            
2430                 distFile.close();
2431                 
2432                 if (m->control_pressed) {}
2433                 else {
2434                         m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
2435                         m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2436                         m->mothurOutEndLine();
2437                 }
2438         
2439         return 0;
2440         }
2441         catch(exception& e) {
2442                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
2443                 exit(1);
2444         }
2445 }
2446 /**************************************************************************************************/
2447
2448 float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2449         try{
2450                 int minLength = lengths[mapSeqToUnique[seqA]];
2451                 if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
2452                 
2453                 int ANumFlowCells = seqA * numFlowCells;
2454                 int BNumFlowCells = seqB * numFlowCells;
2455                 
2456                 float dist = 0;
2457                 
2458                 for(int i=0;i<minLength;i++){
2459                         
2460                         if (m->control_pressed) { break; }
2461                         
2462                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
2463                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
2464                         
2465                         int flowBIntI = flowDataIntI[BNumFlowCells + i];
2466                         float flowBPrI = flowDataPrI[BNumFlowCells + i];
2467                         dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
2468                 }
2469                 
2470                 dist /= (float) minLength;
2471                 return dist;
2472         }
2473         catch(exception& e) {
2474                 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
2475                 exit(1);
2476         }
2477 }
2478
2479 /**************************************************************************************************/
2480
2481 int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2482         try{
2483                 int numUniques = 0;
2484                 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
2485                 uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
2486                 uniqueLengths.assign(numSeqs, 0);
2487                 mapSeqToUnique.assign(numSeqs, -1);
2488                 mapUniqueToSeq.assign(numSeqs, -1);
2489                 
2490                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
2491                 
2492                 for(int i=0;i<numSeqs;i++){
2493                         
2494                         if (m->control_pressed) { break; }
2495                         
2496                         int index = 0;
2497                         
2498                         vector<short> current(numFlowCells);
2499                         for(int j=0;j<numFlowCells;j++){
2500                                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
2501                         }
2502             
2503                         for(int j=0;j<numUniques;j++){
2504                                 int offset = j * numFlowCells;
2505                                 bool toEnd = 1;
2506                                 
2507                                 int shorterLength;
2508                                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
2509                                 else                                                            {       shorterLength = uniqueLengths[j];       }
2510                 
2511                                 for(int k=0;k<shorterLength;k++){
2512                                         if(current[k] != uniqueFlowgrams[offset + k]){
2513                                                 toEnd = 0;
2514                                                 break;
2515                                         }
2516                                 }
2517                                 
2518                                 if(toEnd){
2519                                         mapSeqToUnique[i] = j;
2520                                         uniqueCount[j]++;
2521                                         index = j;
2522                                         if(lengths[i] > uniqueLengths[j])       {       uniqueLengths[j] = lengths[i];  }
2523                                         break;
2524                                 }
2525                                 index++;
2526                         }
2527                         
2528                         if(index == numUniques){
2529                                 uniqueLengths[numUniques] = lengths[i];
2530                                 uniqueCount[numUniques] = 1;
2531                                 mapSeqToUnique[i] = numUniques;//anMap
2532                                 mapUniqueToSeq[numUniques] = i;//anF
2533                                 
2534                                 for(int k=0;k<numFlowCells;k++){
2535                                         uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
2536                                         uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
2537                                 }
2538                                 
2539                                 numUniques++;
2540                         }
2541                 }
2542                 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
2543                 uniqueLengths.resize(numUniques);       
2544                 
2545                 flowDataPrI.resize(numSeqs * numFlowCells, 0);
2546                 for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
2547         
2548         return numUniques;
2549         }
2550         catch(exception& e) {
2551                 m->errorOut(e, "ShhherCommand", "getUniques");
2552                 exit(1);
2553         }
2554 }
2555 /**************************************************************************************************/
2556 int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
2557         try{
2558                 
2559                 vector<string> duplicateNames(numUniques, "");
2560                 for(int i=0;i<numSeqs;i++){
2561                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
2562                 }
2563                 
2564                 ofstream nameFile;
2565                 m->openOutputFile(filename, nameFile);
2566                 
2567                 for(int i=0;i<numUniques;i++){
2568                         
2569                         if (m->control_pressed) { break; }
2570                         
2571             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2572                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2573                 }
2574                 
2575                 nameFile.close();
2576         
2577                 return 0;
2578         }
2579         catch(exception& e) {
2580                 m->errorOut(e, "ShhherCommand", "createNamesFile");
2581                 exit(1);
2582         }
2583 }
2584 //**********************************************************************************************************************
2585
2586 int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
2587         try {
2588                 
2589                 ReadMatrix* read = new ReadColumnMatrix(distFileName);  
2590                 read->setCutoff(cutoff);
2591                 
2592                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
2593                 clusterNameMap->readMap();
2594                 read->read(clusterNameMap);
2595         
2596                 ListVector* list = read->getListVector();
2597                 SparseMatrix* matrix = read->getMatrix();
2598                 
2599                 delete read; 
2600                 delete clusterNameMap; 
2601         
2602                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
2603                 
2604                 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
2605                 string tag = cluster->getTag();
2606                 
2607                 double clusterCutoff = cutoff;
2608                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
2609                         
2610                         if (m->control_pressed) { break; }
2611                         
2612                         cluster->update(clusterCutoff);
2613                 }
2614                 
2615                 list->setLabel(toString(cutoff));
2616                 
2617                 ofstream listFile;
2618                 m->openOutputFile(filename, listFile);
2619                 list->print(listFile);
2620                 listFile.close();
2621                 
2622                 delete matrix;  delete cluster; delete rabund; delete list;
2623         
2624                 return 0;
2625         }
2626         catch(exception& e) {
2627                 m->errorOut(e, "ShhherCommand", "cluster");
2628                 exit(1);        
2629         }               
2630 }
2631 /**************************************************************************************************/
2632
2633 int ShhherCommand::getOTUData(int numSeqs, string fileName,  vector<int>& otuData,
2634                                vector<int>& cumNumSeqs,
2635                                vector<int>& nSeqsPerOTU,
2636                                vector<vector<int> >& aaP,       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2637                                vector<vector<int> >& aaI,       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2638                                vector<int>& seqNumber,          //tMaster->anP:         the sequence id number sorted by OTU
2639                                vector<int>& seqIndex,
2640                                map<string, int>& nameMap){
2641         try {
2642         
2643                 ifstream listFile;
2644                 m->openInputFile(fileName, listFile);
2645                 string label;
2646         int numOTUs;
2647                 
2648                 listFile >> label >> numOTUs;
2649         
2650         if (m->debug) { m->mothurOut("[DEBUG]: Getting OTU Data...\n"); }
2651         
2652                 otuData.assign(numSeqs, 0);
2653                 cumNumSeqs.assign(numOTUs, 0);
2654                 nSeqsPerOTU.assign(numOTUs, 0);
2655                 aaP.clear();aaP.resize(numOTUs);
2656                 
2657                 seqNumber.clear();
2658                 aaI.clear();
2659                 seqIndex.clear();
2660                 
2661                 string singleOTU = "";
2662                 
2663                 for(int i=0;i<numOTUs;i++){
2664                         
2665                         if (m->control_pressed) { break; }
2666             if (m->debug) { m->mothurOut("[DEBUG]: processing OTU " + toString(i) + ".\n"); }
2667             
2668                         listFile >> singleOTU;
2669                         
2670                         istringstream otuString(singleOTU);
2671             
2672                         while(otuString){
2673                                 
2674                                 string seqName = "";
2675                                 
2676                                 for(int j=0;j<singleOTU.length();j++){
2677                                         char letter = otuString.get();
2678                                         
2679                                         if(letter != ','){
2680                                                 seqName += letter;
2681                                         }
2682                                         else{
2683                                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2684                                                 int index = nmIt->second;
2685                                                 
2686                                                 nameMap.erase(nmIt);
2687                                                 
2688                                                 otuData[index] = i;
2689                                                 nSeqsPerOTU[i]++;
2690                                                 aaP[i].push_back(index);
2691                                                 seqName = "";
2692                                         }
2693                                 }
2694                                 
2695                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2696                 
2697                                 int index = nmIt->second;
2698                                 nameMap.erase(nmIt);
2699                 
2700                                 otuData[index] = i;
2701                                 nSeqsPerOTU[i]++;
2702                                 aaP[i].push_back(index);        
2703                                 
2704                                 otuString.get();
2705                         }
2706                         
2707                         sort(aaP[i].begin(), aaP[i].end());
2708                         for(int j=0;j<nSeqsPerOTU[i];j++){
2709                                 seqNumber.push_back(aaP[i][j]);
2710                         }
2711                         for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
2712                                 aaP[i].push_back(0);
2713                         }
2714                         
2715                         
2716                 }
2717                 
2718                 for(int i=1;i<numOTUs;i++){
2719                         cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
2720                 }
2721                 aaI = aaP;
2722                 seqIndex = seqNumber;
2723                 
2724                 listFile.close();       
2725         
2726         return numOTUs;
2727                 
2728         }
2729         catch(exception& e) {
2730                 m->errorOut(e, "ShhherCommand", "getOTUData");
2731                 exit(1);        
2732         }               
2733 }
2734 /**************************************************************************************************/
2735
2736 int ShhherCommand::calcCentroidsDriver(int numOTUs, 
2737                                           vector<int>& cumNumSeqs,
2738                                           vector<int>& nSeqsPerOTU,
2739                                           vector<int>& seqIndex,
2740                                           vector<short>& change,                //did the centroid sequence change? 0 = no; 1 = yes
2741                                           vector<int>& centroids,               //the representative flowgram for each cluster m
2742                                           vector<double>& singleTau,    //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2743                                           vector<int>& mapSeqToUnique,
2744                                           vector<short>& uniqueFlowgrams,
2745                                           vector<short>& flowDataIntI,
2746                                           vector<int>& lengths,
2747                                           int numFlowCells,
2748                                           vector<int>& seqNumber){                          
2749         
2750         //this function gets the most likely homopolymer length at a flow position for a group of sequences
2751         //within an otu
2752         
2753         try{
2754                 
2755                 for(int i=0;i<numOTUs;i++){
2756                         
2757                         if (m->control_pressed) { break; }
2758                         
2759                         double count = 0;
2760                         int position = 0;
2761                         int minFlowGram = 100000000;
2762                         double minFlowValue = 1e8;
2763                         change[i] = 0; //FALSE
2764                         
2765                         for(int j=0;j<nSeqsPerOTU[i];j++){
2766                                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
2767                         }
2768             
2769                         if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
2770                                 vector<double> adF(nSeqsPerOTU[i]);
2771                                 vector<int> anL(nSeqsPerOTU[i]);
2772                                 
2773                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2774                                         int index = cumNumSeqs[i] + j;
2775                                         int nI = seqIndex[index];
2776                                         int nIU = mapSeqToUnique[nI];
2777                                         
2778                                         int k;
2779                                         for(k=0;k<position;k++){
2780                                                 if(nIU == anL[k]){
2781                                                         break;
2782                                                 }
2783                                         }
2784                                         if(k == position){
2785                                                 anL[position] = nIU;
2786                                                 adF[position] = 0.0000;
2787                                                 position++;
2788                                         }                                               
2789                                 }
2790                                 
2791                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2792                                         int index = cumNumSeqs[i] + j;
2793                                         int nI = seqIndex[index];
2794                                         
2795                                         double tauValue = singleTau[seqNumber[index]];
2796                                         
2797                                         for(int k=0;k<position;k++){
2798                                                 double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
2799                                                 adF[k] += dist * tauValue;
2800                                         }
2801                                 }
2802                                 
2803                                 for(int j=0;j<position;j++){
2804                                         if(adF[j] < minFlowValue){
2805                                                 minFlowGram = j;
2806                                                 minFlowValue = adF[j];
2807                                         }
2808                                 }
2809                                 
2810                                 if(centroids[i] != anL[minFlowGram]){
2811                                         change[i] = 1;
2812                                         centroids[i] = anL[minFlowGram];
2813                                 }
2814                         }
2815                         else if(centroids[i] != -1){
2816                                 change[i] = 1;
2817                                 centroids[i] = -1;                      
2818                         }
2819                 }
2820         
2821         return 0;
2822         }
2823         catch(exception& e) {
2824                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
2825                 exit(1);        
2826         }               
2827 }
2828 /**************************************************************************************************/
2829
2830 double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
2831                                         vector<short>& flowDataIntI, int numFlowCells){
2832         try{
2833                 
2834                 int flowAValue = cent * numFlowCells;
2835                 int flowBValue = flow * numFlowCells;
2836                 
2837                 double dist = 0;
2838         
2839                 for(int i=0;i<length;i++){
2840                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
2841                         flowAValue++;
2842                         flowBValue++;
2843                 }
2844                 
2845                 return dist / (double)length;
2846         }
2847         catch(exception& e) {
2848                 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
2849                 exit(1);        
2850         }               
2851 }
2852 /**************************************************************************************************/
2853
2854 double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
2855         try{
2856                 
2857                 double maxChange = 0;
2858                 
2859                 for(int i=0;i<numOTUs;i++){
2860                         
2861                         if (m->control_pressed) { break; }
2862                         
2863                         double difference = weight[i];
2864                         weight[i] = 0;
2865                         
2866                         for(int j=0;j<nSeqsPerOTU[i];j++){
2867                                 int index = cumNumSeqs[i] + j;
2868                                 double tauValue = singleTau[seqNumber[index]];
2869                                 weight[i] += tauValue;
2870                         }
2871                         
2872                         difference = fabs(weight[i] - difference);
2873                         if(difference > maxChange){     maxChange = difference; }
2874                 }
2875                 return maxChange;
2876         }
2877         catch(exception& e) {
2878                 m->errorOut(e, "ShhherCommand", "getNewWeights");
2879                 exit(1);        
2880         }               
2881 }
2882
2883 /**************************************************************************************************/
2884
2885 double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
2886         
2887         try{
2888                 
2889                 vector<long double> P(numSeqs, 0);
2890                 int effNumOTUs = 0;
2891                 
2892                 for(int i=0;i<numOTUs;i++){
2893                         if(weight[i] > MIN_WEIGHT){
2894                                 effNumOTUs++;
2895                         }
2896                 }
2897                 
2898                 string hold;
2899                 for(int i=0;i<numOTUs;i++){
2900                         
2901                         if (m->control_pressed) { break; }
2902                         
2903                         for(int j=0;j<nSeqsPerOTU[i];j++){
2904                                 int index = cumNumSeqs[i] + j;
2905                                 int nI = seqIndex[index];
2906                                 double singleDist = dist[seqNumber[index]];
2907                                 
2908                                 P[nI] += weight[i] * exp(-singleDist * sigma);
2909                         }
2910                 }
2911                 double nLL = 0.00;
2912                 for(int i=0;i<numSeqs;i++){
2913                         if(P[i] == 0){  P[i] = DBL_EPSILON;     }
2914             
2915                         nLL += -log(P[i]);
2916                 }
2917                 
2918                 nLL = nLL -(double)numSeqs * log(sigma);
2919         
2920                 return nLL; 
2921         }
2922         catch(exception& e) {
2923                 m->errorOut(e, "ShhherCommand", "getNewWeights");
2924                 exit(1);        
2925         }               
2926 }
2927
2928 /**************************************************************************************************/
2929
2930 int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
2931         try{
2932                 vector<int> unique(numOTUs, 1);
2933                 
2934                 for(int i=0;i<numOTUs;i++){
2935                         if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
2936                                 unique[i] = -1;
2937                         }
2938                 }
2939                 
2940                 for(int i=0;i<numOTUs;i++){
2941                         
2942                         if (m->control_pressed) { break; }
2943                         
2944                         if(unique[i] == 1){
2945                                 for(int j=i+1;j<numOTUs;j++){
2946                                         if(unique[j] == 1){
2947                                                 
2948                                                 if(centroids[j] == centroids[i]){
2949                                                         unique[j] = 0;
2950                                                         centroids[j] = -1;
2951                                                         
2952                                                         weight[i] += weight[j];
2953                                                         weight[j] = 0.0;
2954                                                 }
2955                                         }
2956                                 }
2957                         }
2958                 }
2959         
2960         return 0;
2961         }
2962         catch(exception& e) {
2963                 m->errorOut(e, "ShhherCommand", "checkCentroids");
2964                 exit(1);        
2965         }               
2966 }
2967 /**************************************************************************************************/
2968
2969 void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<double>& dist, 
2970                                      vector<double>& weight, vector<short>& change, vector<int>& centroids,
2971                                      vector<vector<int> >& aaP, vector<double>& singleTau, vector<vector<int> >& aaI,   
2972                                      vector<int>& seqNumber, vector<int>& seqIndex,
2973                                      vector<short>& uniqueFlowgrams,
2974                                      vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
2975         
2976         try{
2977                 
2978                 int total = 0;
2979                 vector<double> newTau(numOTUs,0);
2980                 vector<double> norms(numSeqs, 0);
2981                 nSeqsPerOTU.assign(numOTUs, 0);
2982         
2983                 for(int i=0;i<numSeqs;i++){
2984                         
2985                         if (m->control_pressed) { break; }
2986                         
2987                         int indexOffset = i * numOTUs;
2988             
2989                         double offset = 1e8;
2990                         
2991                         for(int j=0;j<numOTUs;j++){
2992                 
2993                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
2994                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
2995                                 }
2996                 
2997                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
2998                                         offset = dist[indexOffset + j];
2999                                 }
3000                         }
3001             
3002                         for(int j=0;j<numOTUs;j++){
3003                                 if(weight[j] > MIN_WEIGHT){
3004                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
3005                                         norms[i] += newTau[j];
3006                                 }
3007                                 else{
3008                                         newTau[j] = 0.0;
3009                                 }
3010                         }
3011             
3012                         for(int j=0;j<numOTUs;j++){
3013                                 newTau[j] /= norms[i];
3014                         }
3015             
3016                         for(int j=0;j<numOTUs;j++){
3017                                 if(newTau[j] > MIN_TAU){
3018                                         
3019                                         int oldTotal = total;
3020                                         
3021                                         total++;
3022                                         
3023                                         singleTau.resize(total, 0);
3024                                         seqNumber.resize(total, 0);
3025                                         seqIndex.resize(total, 0);
3026                                         
3027                                         singleTau[oldTotal] = newTau[j];
3028                                         
3029                                         aaP[j][nSeqsPerOTU[j]] = oldTotal;
3030                                         aaI[j][nSeqsPerOTU[j]] = i;
3031                                         nSeqsPerOTU[j]++;
3032                                 }
3033                         }
3034             
3035                 }
3036         
3037         }
3038         catch(exception& e) {
3039                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
3040                 exit(1);        
3041         }               
3042 }
3043 /**************************************************************************************************/
3044
3045 int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3046         try {
3047                 int index = 0;
3048                 for(int i=0;i<numOTUs;i++){
3049                         
3050                         if (m->control_pressed) { return 0; }
3051                         
3052                         cumNumSeqs[i] = index;
3053                         for(int j=0;j<nSeqsPerOTU[i];j++){
3054                                 seqNumber[index] = aaP[i][j];
3055                                 seqIndex[index] = aaI[i][j];
3056                                 
3057                                 index++;
3058                         }
3059                 }
3060         
3061         return 0; 
3062         }
3063         catch(exception& e) {
3064                 m->errorOut(e, "ShhherCommand", "fill");
3065                 exit(1);        
3066         }               
3067 }
3068 /**************************************************************************************************/
3069
3070 void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
3071                             vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3072         
3073         try {
3074                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
3075                 
3076                 for(int i=0;i<numOTUs;i++){
3077                         
3078                         if (m->control_pressed) { break; }
3079                         
3080                         for(int j=0;j<nSeqsPerOTU[i];j++){
3081                                 int index = cumNumSeqs[i] + j;
3082                                 double tauValue = singleTau[seqNumber[index]];
3083                                 int sIndex = seqIndex[index];
3084                                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
3085                         }
3086                 }
3087                 
3088                 for(int i=0;i<numSeqs;i++){
3089                         double maxTau = -1.0000;
3090                         int maxOTU = -1;
3091                         for(int j=0;j<numOTUs;j++){
3092                                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
3093                                         maxTau = bigTauMatrix[i * numOTUs + j];
3094                                         maxOTU = j;
3095                                 }
3096                         }
3097                         
3098                         otuData[i] = maxOTU;
3099                 }
3100                 
3101                 nSeqsPerOTU.assign(numOTUs, 0);         
3102                 
3103                 for(int i=0;i<numSeqs;i++){
3104                         int index = otuData[i];
3105                         
3106                         singleTau[i] = 1.0000;
3107                         dist[i] = 0.0000;
3108                         
3109                         aaP[index][nSeqsPerOTU[index]] = i;
3110                         aaI[index][nSeqsPerOTU[index]] = i;
3111                         
3112                         nSeqsPerOTU[index]++;
3113                 }
3114         
3115                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);  
3116         }
3117         catch(exception& e) {
3118                 m->errorOut(e, "ShhherCommand", "setOTUs");
3119                 exit(1);        
3120         }               
3121 }
3122 /**************************************************************************************************/
3123
3124 void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string qualityFileName, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
3125                                    vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
3126                                    vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
3127         
3128         try {
3129         
3130                 ofstream qualityFile;
3131                 m->openOutputFile(qualityFileName, qualityFile);
3132         
3133                 qualityFile.setf(ios::fixed, ios::floatfield);
3134                 qualityFile.setf(ios::showpoint);
3135                 qualityFile << setprecision(6);
3136                 
3137                 vector<vector<int> > qualities(numOTUs);
3138                 vector<double> pr(HOMOPS, 0);
3139                 
3140                 
3141                 for(int i=0;i<numOTUs;i++){
3142                         
3143                         if (m->control_pressed) { break; }
3144                         
3145                         int index = 0;
3146                         int base = 0;
3147                         
3148                         if(nSeqsPerOTU[i] > 0){
3149                                 qualities[i].assign(1024, -1);
3150                                 
3151                                 while(index < numFlowCells){
3152                                         double maxPrValue = 1e8;
3153                                         short maxPrIndex = -1;
3154                                         double count = 0.0000;
3155                                         
3156                                         pr.assign(HOMOPS, 0);
3157                                         
3158                                         for(int j=0;j<nSeqsPerOTU[i];j++){
3159                                                 int lIndex = cumNumSeqs[i] + j;
3160                                                 double tauValue = singleTau[seqNumber[lIndex]];
3161                                                 int sequenceIndex = aaI[i][j];
3162                                                 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
3163                                                 
3164                                                 count += tauValue;
3165                                                 
3166                                                 for(int s=0;s<HOMOPS;s++){
3167                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
3168                                                 }
3169                                         }
3170                                         
3171                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
3172                                         maxPrValue = pr[maxPrIndex];
3173                                         
3174                                         if(count > MIN_COUNT){
3175                                                 double U = 0.0000;
3176                                                 double norm = 0.0000;
3177                                                 
3178                                                 for(int s=0;s<HOMOPS;s++){
3179                                                         norm += exp(-(pr[s] - maxPrValue));
3180                                                 }
3181                                                 
3182                                                 for(int s=1;s<=maxPrIndex;s++){
3183                                                         int value = 0;
3184                                                         double temp = 0.0000;
3185                                                         
3186                                                         U += exp(-(pr[s-1]-maxPrValue))/norm;
3187                                                         
3188                                                         if(U>0.00){
3189                                                                 temp = log10(U);
3190                                                         }
3191                                                         else{
3192                                                                 temp = -10.1;
3193                                                         }
3194                                                         temp = floor(-10 * temp);
3195                                                         value = (int)floor(temp);
3196                                                         if(value > 100){        value = 100;    }
3197                                                         
3198                                                         qualities[i][base] = (int)value;
3199                                                         base++;
3200                                                 }
3201                                         }
3202                                         
3203                                         index++;
3204                                 }
3205                         }
3206                         
3207                         
3208                         if(otuCounts[i] > 0){
3209                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
3210                         
3211                                 int j=4;        //need to get past the first four bases
3212                                 while(qualities[i][j] != -1){
3213                     qualityFile << qualities[i][j] << ' ';
3214                     if (j > qualities[i].size()) { break; }
3215                     j++;
3216                                 }
3217                                 qualityFile << endl;
3218                         }
3219                 }
3220                 qualityFile.close();
3221                 outputNames.push_back(qualityFileName);
3222         
3223         }
3224         catch(exception& e) {
3225                 m->errorOut(e, "ShhherCommand", "writeQualities");
3226                 exit(1);        
3227         }               
3228 }
3229
3230 /**************************************************************************************************/
3231
3232 void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string fastaFileName, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
3233         try {
3234                 
3235                 ofstream fastaFile;
3236                 m->openOutputFile(fastaFileName, fastaFile);
3237                 
3238                 vector<string> names(numOTUs, "");
3239                 
3240                 for(int i=0;i<numOTUs;i++){
3241                         
3242                         if (m->control_pressed) { break; }
3243                         
3244                         int index = centroids[i];
3245                         
3246                         if(otuCounts[i] > 0){
3247                                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
3248                                 
3249                                 string newSeq = "";
3250                                 
3251                                 for(int j=0;j<numFlowCells;j++){
3252                                         
3253                                         char base = flowOrder[j % 4];
3254                                         for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
3255                                                 newSeq += base;
3256                                         }
3257                                 }
3258                                 
3259                                 if (newSeq.length() >= 4) {  fastaFile << newSeq.substr(4) << endl;  }
3260                 else {  fastaFile << "NNNN" << endl;  }
3261                         }
3262                 }
3263                 fastaFile.close();
3264         
3265                 outputNames.push_back(fastaFileName);
3266         
3267                 if(thisCompositeFASTAFileName != ""){
3268                         m->appendFiles(fastaFileName, thisCompositeFASTAFileName);
3269                 }
3270         }
3271         catch(exception& e) {
3272                 m->errorOut(e, "ShhherCommand", "writeSequences");
3273                 exit(1);        
3274         }               
3275 }
3276
3277 /**************************************************************************************************/
3278
3279 void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string nameFileName, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
3280         try {
3281                 
3282                 ofstream nameFile;
3283                 m->openOutputFile(nameFileName, nameFile);
3284                 
3285                 for(int i=0;i<numOTUs;i++){
3286                         
3287                         if (m->control_pressed) { break; }
3288                         
3289                         if(otuCounts[i] > 0){
3290                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
3291                                 
3292                                 for(int j=1;j<nSeqsPerOTU[i];j++){
3293                                         nameFile << ',' << seqNameVector[aaI[i][j]];
3294                                 }
3295                                 
3296                                 nameFile << endl;
3297                         }
3298                 }
3299                 nameFile.close();
3300                 outputNames.push_back(nameFileName);
3301                 
3302                 
3303                 if(thisCompositeNamesFileName != ""){
3304                         m->appendFiles(nameFileName, thisCompositeNamesFileName);
3305                 }               
3306         }
3307         catch(exception& e) {
3308                 m->errorOut(e, "ShhherCommand", "writeNames");
3309                 exit(1);        
3310         }               
3311 }
3312
3313 /**************************************************************************************************/
3314
3315 void ShhherCommand::writeGroups(string groupFileName, string fileRoot, int numSeqs, vector<string>& seqNameVector){
3316         try {
3317         ofstream groupFile;
3318                 m->openOutputFile(groupFileName, groupFile);
3319                 
3320                 for(int i=0;i<numSeqs;i++){
3321                         if (m->control_pressed) { break; }
3322                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
3323                 }
3324                 groupFile.close();
3325                 outputNames.push_back(groupFileName);
3326         
3327         }
3328         catch(exception& e) {
3329                 m->errorOut(e, "ShhherCommand", "writeGroups");
3330                 exit(1);        
3331         }               
3332 }
3333
3334 /**************************************************************************************************/
3335
3336 void ShhherCommand::writeClusters(string otuCountsFileName, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
3337         try {
3338                 ofstream otuCountsFile;
3339                 m->openOutputFile(otuCountsFileName, otuCountsFile);
3340                 
3341                 string bases = flowOrder;
3342                 
3343                 for(int i=0;i<numOTUs;i++){
3344                         
3345                         if (m->control_pressed) {
3346                                 break;
3347                         }
3348                         //output the translated version of the centroid sequence for the otu
3349                         if(otuCounts[i] > 0){
3350                                 int index = centroids[i];
3351                                 
3352                                 otuCountsFile << "ideal\t";
3353                                 for(int j=8;j<numFlowCells;j++){
3354                                         char base = bases[j % 4];
3355                                         for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
3356                                                 otuCountsFile << base;
3357                                         }
3358                                 }
3359                                 otuCountsFile << endl;
3360                                 
3361                                 for(int j=0;j<nSeqsPerOTU[i];j++){
3362                                         int sequence = aaI[i][j];
3363                                         otuCountsFile << seqNameVector[sequence] << '\t';
3364                                         
3365                                         string newSeq = "";
3366                                         
3367                                         for(int k=0;k<lengths[sequence];k++){
3368                                                 char base = bases[k % 4];
3369                                                 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
3370                         
3371                                                 for(int s=0;s<freq;s++){
3372                                                         newSeq += base;
3373                                                         //otuCountsFile << base;
3374                                                 }
3375                                         }
3376                                         
3377                     if (newSeq.length() >= 4) {  otuCountsFile << newSeq.substr(4) << endl;  }
3378                     else {  otuCountsFile << "NNNN" << endl;  }
3379                                 }
3380                                 otuCountsFile << endl;
3381                         }
3382                 }
3383                 otuCountsFile.close();
3384                 outputNames.push_back(otuCountsFileName);
3385         
3386         }
3387         catch(exception& e) {
3388                 m->errorOut(e, "ShhherCommand", "writeClusters");
3389                 exit(1);        
3390         }               
3391 }
3392
3393 /**************************************************************************************************/
3394
3395 void ShhherCommand::getSingleLookUp(){
3396         try{
3397                 //      these are the -log probabilities that a signal corresponds to a particular homopolymer length
3398                 singleLookUp.assign(HOMOPS * NUMBINS, 0);
3399                 
3400                 int index = 0;
3401                 ifstream lookUpFile;
3402                 m->openInputFile(lookupFileName, lookUpFile);
3403                 
3404                 for(int i=0;i<HOMOPS;i++){
3405                         
3406                         if (m->control_pressed) { break; }
3407                         
3408                         float logFracFreq;
3409                         lookUpFile >> logFracFreq;
3410                         
3411                         for(int j=0;j<NUMBINS;j++)      {
3412                                 lookUpFile >> singleLookUp[index];
3413                                 index++;
3414                         }
3415                 }       
3416                 lookUpFile.close();
3417         }
3418         catch(exception& e) {
3419                 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
3420                 exit(1);
3421         }
3422 }
3423
3424 /**************************************************************************************************/
3425
3426 void ShhherCommand::getJointLookUp(){
3427         try{
3428                 
3429                 //      the most likely joint probability (-log) that two intenities have the same polymer length
3430                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
3431                 
3432                 for(int i=0;i<NUMBINS;i++){
3433                         
3434                         if (m->control_pressed) { break; }
3435                         
3436                         for(int j=0;j<NUMBINS;j++){             
3437                                 
3438                                 double minSum = 100000000;
3439                                 
3440                                 for(int k=0;k<HOMOPS;k++){
3441                                         double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
3442                                         
3443                                         if(sum < minSum)        {       minSum = sum;           }
3444                                 }       
3445                                 jointLookUp[i * NUMBINS + j] = minSum;
3446                         }
3447                 }
3448         }
3449         catch(exception& e) {
3450                 m->errorOut(e, "ShhherCommand", "getJointLookUp");
3451                 exit(1);
3452         }
3453 }
3454
3455 /**************************************************************************************************/
3456
3457 double ShhherCommand::getProbIntensity(int intIntensity){                          
3458         try{
3459
3460                 double minNegLogProb = 100000000; 
3461
3462                 
3463                 for(int i=0;i<HOMOPS;i++){//loop signal strength
3464                         
3465                         if (m->control_pressed) { break; }
3466                         
3467                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
3468                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
3469                 }
3470                 
3471                 return minNegLogProb;
3472         }
3473         catch(exception& e) {
3474                 m->errorOut(e, "ShhherCommand", "getProbIntensity");
3475                 exit(1);
3476         }
3477 }
3478
3479
3480
3481