]> git.donarmstrong.com Git - mothur.git/blob - shhhercommand.cpp
added debug output classify.seqs. worked on make.contigs command. added large paramet...
[mothur.git] / shhhercommand.cpp
1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "shhhercommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> ShhherCommand::setParameters(){  
14         try {
15                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pflow);
16                 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pfile);
17                 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plookup);
18                 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff);
19                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
20                 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter);
21         CommandParameter plarge("large", "Number", "", "-1", "", "", "",false,false); parameters.push_back(plarge);
22                 CommandParameter psigma("sigma", "Number", "", "60", "", "", "",false,false); parameters.push_back(psigma);
23                 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "",false,false); parameters.push_back(pmindelta);
24                 CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
25                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
26                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
27                 
28                 vector<string> myArray;
29                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
30                 return myArray;
31         }
32         catch(exception& e) {
33                 m->errorOut(e, "ShhherCommand", "setParameters");
34                 exit(1);
35         }
36 }
37 //**********************************************************************************************************************
38 string ShhherCommand::getHelpString(){  
39         try {
40                 string helpString = "";
41                 helpString += "The shhh.flows command reads a file containing flowgrams and creates a file of corrected sequences.\n";
42                 return helpString;
43         }
44         catch(exception& e) {
45                 m->errorOut(e, "ShhherCommand", "getHelpString");
46                 exit(1);
47         }
48 }
49 //**********************************************************************************************************************
50
51 ShhherCommand::ShhherCommand(){ 
52         try {
53                 abort = true; calledHelp = true;
54                 setParameters();
55                 
56                 //initialize outputTypes
57 //              vector<string> tempOutNames;
58 //              outputTypes["pn.dist"] = tempOutNames;
59
60         }
61         catch(exception& e) {
62                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
63                 exit(1);
64         }
65 }
66
67 //**********************************************************************************************************************
68
69 ShhherCommand::ShhherCommand(string option) {
70         try {
71         
72 #ifdef USE_MPI
73                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
74                 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
75         
76                 if(pid == 0){
77 #endif
78                 abort = false; calledHelp = false;   
79                 
80                 //allow user to run help
81                 if(option == "help") { help(); abort = true; calledHelp = true; }
82                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
83                 
84                 else {
85                         vector<string> myArray = setParameters();
86                         
87                         OptionParser parser(option);
88                         map<string,string> parameters = parser.getParameters();
89                         
90                         ValidParameters validParameter;
91                         map<string,string>::iterator it;
92                         
93                         //check to make sure all parameters are valid for command
94                         for (it = parameters.begin(); it != parameters.end(); it++) { 
95                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
96                         }
97                         
98                         //initialize outputTypes
99                         vector<string> tempOutNames;
100 //                      outputTypes["pn.dist"] = tempOutNames;
101                         //                      outputTypes["fasta"] = tempOutNames;
102                         
103                         //if the user changes the input directory command factory will send this info to us in the output parameter 
104                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
105                         if (inputDir == "not found"){   inputDir = "";          }
106                         else {
107                                 string path;
108                                 it = parameters.find("flow");
109                                 //user has given a template file
110                                 if(it != parameters.end()){ 
111                                         path = m->hasPath(it->second);
112                                         //if the user has not given a path then, add inputdir. else leave path alone.
113                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
114                                 }
115                                 
116                                 it = parameters.find("lookup");
117                                 //user has given a template file
118                                 if(it != parameters.end()){ 
119                                         path = m->hasPath(it->second);
120                                         //if the user has not given a path then, add inputdir. else leave path alone.
121                                         if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
122                                 }
123
124                                 it = parameters.find("file");
125                                 //user has given a template file
126                                 if(it != parameters.end()){ 
127                                         path = m->hasPath(it->second);
128                                         //if the user has not given a path then, add inputdir. else leave path alone.
129                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
130                                 }
131                         }
132             
133             //if the user changes the output directory command factory will send this info to us in the output parameter 
134                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
135             
136                         //check for required parameters
137                         flowFileName = validParameter.validFile(parameters, "flow", true);
138                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
139                         if (flowFileName == "not found" && flowFilesFileName == "not found") {
140                                 m->mothurOut("values for either flow or file must be provided for the shhh.flows command.");
141                                 m->mothurOutEndLine();
142                                 abort = true; 
143                         }
144                         else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
145                         
146                         if(flowFileName != "not found"){
147                                 compositeFASTAFileName = "";    
148                                 compositeNamesFileName = "";    
149                         }
150                         else{
151                                 ofstream temp;
152                 
153                 string thisoutputDir = m->hasPath(flowFilesFileName); //if user entered a file with a path then preserve it
154                 
155                                 //flow.files = 9 character offset
156                                 compositeFASTAFileName = thisoutputDir + m->getRootName(m->getSimpleName(flowFilesFileName)) + "shhh.fasta";
157                                 m->openOutputFile(compositeFASTAFileName, temp);
158                                 temp.close();
159                                 
160                                 compositeNamesFileName = thisoutputDir + m->getRootName(m->getSimpleName(flowFilesFileName)) + "shhh.names";
161                                 m->openOutputFile(compositeNamesFileName, temp);
162                                 temp.close();
163                         }
164             
165             if(flowFilesFileName != "not found"){
166                 string fName;
167                 
168                 ifstream flowFilesFile;
169                 m->openInputFile(flowFilesFileName, flowFilesFile);
170                 while(flowFilesFile){
171                     fName = m->getline(flowFilesFile);
172                     
173                     //test if file is valid
174                     ifstream in;
175                     int ableToOpen = m->openInputFile(fName, in, "noerror");
176                     in.close(); 
177                     if (ableToOpen == 1) {
178                         if (inputDir != "") { //default path is set
179                             string tryPath = inputDir + fName;
180                             m->mothurOut("Unable to open " + fName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
181                             ifstream in2;
182                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
183                             in2.close();
184                             fName = tryPath;
185                         }
186                     }
187                     
188                     if (ableToOpen == 1) {
189                         if (m->getDefaultPath() != "") { //default path is set
190                             string tryPath = m->getDefaultPath() + m->getSimpleName(fName);
191                             m->mothurOut("Unable to open " + fName + ". Trying default " + tryPath); m->mothurOutEndLine();
192                             ifstream in2;
193                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
194                             in2.close();
195                             fName = tryPath;
196                         }
197                     }
198                     
199                     //if you can't open it its not in current working directory or inputDir, try mothur excutable location
200                     if (ableToOpen == 1) {
201                         string exepath = m->argv;
202                         string tempPath = exepath;
203                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
204                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
205                         
206                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(fName);
207                         m->mothurOut("Unable to open " + fName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
208                         ifstream in2;
209                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
210                         in2.close();
211                         fName = tryPath;
212                     }
213                     
214                     if (ableToOpen == 1) {  m->mothurOut("Unable to open " + fName + ". Disregarding. "); m->mothurOutEndLine();  }
215                     else { flowFileVector.push_back(fName); }
216                     m->gobble(flowFilesFile);
217                 }
218                 flowFilesFile.close();
219                 if (flowFileVector.size() == 0) {  m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
220             }
221             else{
222                 if (outputDir == "") { outputDir = m->hasPath(flowFileName); }
223                 flowFileVector.push_back(flowFileName);
224             }
225                 
226                         //check for optional parameter and set defaults
227                         // ...at some point should added some additional type checking...
228                         string temp;
229                         temp = validParameter.validFile(parameters, "lookup", true);
230                         if (temp == "not found")        {       
231                                 lookupFileName = "LookUp_Titanium.pat"; 
232                                 
233                                 int ableToOpen;
234                                 ifstream in;
235                                 ableToOpen = m->openInputFile(lookupFileName, in, "noerror");
236                                 in.close();     
237                                 
238                                 //if you can't open it, try input location
239                                 if (ableToOpen == 1) {
240                                         if (inputDir != "") { //default path is set
241                                                 string tryPath = inputDir + lookupFileName;
242                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
243                                                 ifstream in2;
244                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
245                                                 in2.close();
246                                                 lookupFileName = tryPath;
247                                         }
248                                 }
249                                 
250                                 //if you can't open it, try default location
251                                 if (ableToOpen == 1) {
252                                         if (m->getDefaultPath() != "") { //default path is set
253                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName);
254                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine();
255                                                 ifstream in2;
256                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
257                                                 in2.close();
258                                                 lookupFileName = tryPath;
259                                         }
260                                 }
261                                 
262                                 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
263                                 if (ableToOpen == 1) {
264                                         string exepath = m->argv;
265                                         string tempPath = exepath;
266                                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
267                                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
268                                         
269                                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
270                                         m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
271                                         ifstream in2;
272                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
273                                         in2.close();
274                                         lookupFileName = tryPath;
275                                 }
276                                 
277                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
278                         }
279                         else if(temp == "not open")     {       
280                                 
281                                 lookupFileName = validParameter.validFile(parameters, "lookup", false);
282                                 
283                                 //if you can't open it its not inputDir, try mothur excutable location
284                                 string exepath = m->argv;
285                                 string tempPath = exepath;
286                                 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
287                                 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
288                                         
289                                 string tryPath = m->getFullPathName(exepath) + lookupFileName;
290                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
291                                 ifstream in2;
292                                 int ableToOpen = m->openInputFile(tryPath, in2, "noerror");
293                                 in2.close();
294                                 lookupFileName = tryPath;
295                                 
296                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
297                         }else                                           {       lookupFileName = temp;  }
298                         
299                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
300                         m->setProcessors(temp);
301                         m->mothurConvert(temp, processors);
302
303                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
304                         m->mothurConvert(temp, cutoff); 
305                         
306                         temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
307                         m->mothurConvert(temp, minDelta); 
308
309                         temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
310                         m->mothurConvert(temp, maxIters); 
311             
312             temp = validParameter.validFile(parameters, "large", false);        if (temp == "not found"){       temp = "0";             }
313                         m->mothurConvert(temp, largeSize); 
314             if (largeSize != 0) { large = true; }
315             else { large = false;  }
316             if (largeSize < 0) {  m->mothurOut("The value of the large cannot be negative.\n"); }
317             
318 #ifdef USE_MPI
319             if (large) { m->mothurOut("The large parameter is not available with the MPI-Enabled version.\n"); large=false; }                           
320 #endif
321             
322
323                         temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
324                         m->mothurConvert(temp, sigma); 
325                         
326                         flowOrder = validParameter.validFile(parameters, "order", false);
327                         if (flowOrder == "not found"){ flowOrder = "TACG";              }
328                         else if(flowOrder.length() != 4){
329                                 m->mothurOut("The value of the order option must be four bases long\n");
330                         }
331                         
332                 }
333 #ifdef USE_MPI
334                 }                               
335 #endif
336         }
337         catch(exception& e) {
338                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
339                 exit(1);
340         }
341 }
342 //**********************************************************************************************************************
343 #ifdef USE_MPI
344 int ShhherCommand::execute(){
345         try {
346                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
347                 
348                 int tag = 1976;
349                 MPI_Status status; 
350                         
351                 if(pid == 0){
352
353                         for(int i=1;i<ncpus;i++){
354                                 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
355                         }
356                         if(abort == 1){ return 0;       }
357
358                         processors = ncpus;
359                         
360                         m->mothurOut("\nGetting preliminary data...\n");
361                         getSingleLookUp();      if (m->control_pressed) { return 0; }
362                         getJointLookUp();       if (m->control_pressed) { return 0; }
363                         
364             vector<string> flowFileVector;
365                         if(flowFilesFileName != "not found"){
366                                 string fName;
367                 
368                                 ifstream flowFilesFile;
369                                 m->openInputFile(flowFilesFileName, flowFilesFile);
370                                 while(flowFilesFile){
371                                         fName = m->getline(flowFilesFile);
372                                         flowFileVector.push_back(fName);
373                                         m->gobble(flowFilesFile);
374                                 }
375                         }
376                         else{
377                                 flowFileVector.push_back(flowFileName);
378                         }
379             
380                         int numFiles = flowFileVector.size();
381
382                         for(int i=1;i<ncpus;i++){
383                                 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
384                         }
385                         
386                         for(int i=0;i<numFiles;i++){
387                                 
388                                 if (m->control_pressed) { break; }
389                                 
390                                 double begClock = clock();
391                                 unsigned long long begTime = time(NULL);
392
393                                 flowFileName = flowFileVector[i];
394                                 
395                                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
396                                 m->mothurOut("Reading flowgrams...\n");
397                                 getFlowData();
398                                 
399                                 if (m->control_pressed) { break; }
400
401                                 m->mothurOut("Identifying unique flowgrams...\n");
402                                 getUniques();
403                                 
404                                 if (m->control_pressed) { break; }
405
406                                 m->mothurOut("Calculating distances between flowgrams...\n");
407                                 char fileName[1024];
408                                 strcpy(fileName, flowFileName.c_str());
409
410                                 for(int i=1;i<ncpus;i++){
411                                         MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
412
413                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
414                                         MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
415                                         MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
416                                         MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
417                                         MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
418                                         MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
419                                         MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
420                                         MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
421                                         MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
422                                         MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
423                                 }                               
424                                                         
425                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
426                                 
427                                 if (m->control_pressed) { break; }
428                                 
429                                 int done;
430                                 for(int i=1;i<ncpus;i++){
431                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
432                                         
433                                         m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
434                                         m->mothurRemove((distFileName + ".temp." + toString(i)));
435                                 }
436
437                                 string namesFileName = createNamesFile();
438                                 
439                                 if (m->control_pressed) { break; }
440                                 
441                                 m->mothurOut("\nClustering flowgrams...\n");
442                                 string listFileName = cluster(distFileName, namesFileName);
443                                 
444                                 if (m->control_pressed) { break; }
445                                 
446                 
447                 
448                                 getOTUData(listFileName);
449
450                                 m->mothurRemove(distFileName);
451                                 m->mothurRemove(namesFileName);
452                                 m->mothurRemove(listFileName);
453                                 
454                                 if (m->control_pressed) { break; }
455                                 
456                                 initPyroCluster();
457
458                                 if (m->control_pressed) { break; }
459                                 
460             
461                                 for(int i=1;i<ncpus;i++){
462                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
463                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
464                                         MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
465                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
466                                 }
467                                 
468                                 if (m->control_pressed) { break; }
469                                 
470                                 double maxDelta = 0;
471                                 int iter = 0;
472                                 
473                                 int numOTUsOnCPU = numOTUs / ncpus;
474                                 int numSeqsOnCPU = numSeqs / ncpus;
475                                 m->mothurOut("\nDenoising flowgrams...\n");
476                                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
477                                 
478                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
479                                         
480                                         double cycClock = clock();
481                                         unsigned long long cycTime = time(NULL);
482                                         fill();
483                                         
484                                         if (m->control_pressed) { break; }
485
486                                         int total = singleTau.size();
487                                         for(int i=1;i<ncpus;i++){
488                                                 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
489                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
490                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
491                                                 
492                                                 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
493                                                 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
494                                                 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
495                                                 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
496                                                 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
497                                         }
498                                         
499                                         calcCentroidsDriver(0, numOTUsOnCPU);
500                                         
501                                         for(int i=1;i<ncpus;i++){
502                                                 int otuStart = i * numOTUs / ncpus;
503                                                 int otuStop = (i + 1) * numOTUs / ncpus;
504                                                 
505                                                 vector<int> tempCentroids(numOTUs, 0);
506                                                 vector<short> tempChange(numOTUs, 0);
507                                                 
508                                                 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
509                                                 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
510                                                 
511                                                 for(int j=otuStart;j<otuStop;j++){
512                                                         centroids[j] = tempCentroids[j];
513                                                         change[j] = tempChange[j];
514                                                 }
515                                         }
516                                                                         
517                                         maxDelta = getNewWeights(); if (m->control_pressed) { break; }
518                                         double nLL = getLikelihood(); if (m->control_pressed) { break; }
519                                         checkCentroids(); if (m->control_pressed) { break; }
520                                         
521                                         for(int i=1;i<ncpus;i++){
522                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
523                                                 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
524                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
525                                         }
526                                         
527                                         calcNewDistancesParent(0, numSeqsOnCPU);
528
529                                         total = singleTau.size();
530
531                                         for(int i=1;i<ncpus;i++){
532                                                 int childTotal;
533                                                 int seqStart = i * numSeqs / ncpus;
534                                                 int seqStop = (i + 1) * numSeqs / ncpus;
535                                                 
536                                                 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
537
538                                                 vector<int> childSeqIndex(childTotal, 0);
539                                                 vector<double> childSingleTau(childTotal, 0);
540                                                 vector<double> childDist(numSeqs * numOTUs, 0);
541                                                 vector<int> otuIndex(childTotal, 0);
542                                                 
543                                                 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
544                                                 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
545                                                 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
546                                                 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
547
548                                                 int oldTotal = total;
549                                                 total += childTotal;
550                                                 singleTau.resize(total, 0);
551                                                 seqIndex.resize(total, 0);
552                                                 seqNumber.resize(total, 0);
553                                                 
554                                                 int childIndex = 0;
555                                                 
556                                                 for(int j=oldTotal;j<total;j++){
557                                                         int otuI = otuIndex[childIndex];
558                                                         int seqI = childSeqIndex[childIndex];
559                                                         
560                                                         singleTau[j] = childSingleTau[childIndex];
561                                                         
562                                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
563                                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
564                                                         nSeqsPerOTU[otuI]++;
565                                                         childIndex++;
566                                                 }
567                                                 
568                                                 int index = seqStart * numOTUs;
569                                                 for(int j=seqStart;j<seqStop;j++){
570                                                         for(int k=0;k<numOTUs;k++){
571                                                                 dist[index] = childDist[index];
572                                                                 index++;
573                                                         }
574                                                 }                                       
575                                         }
576                                         
577                                         iter++;
578                                         
579                                         m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');                  
580
581                                         if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
582                                                 int live = 1;
583                                                 for(int i=1;i<ncpus;i++){
584                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
585                                                 }
586                                         }
587                                         else{
588                                                 int live = 0;
589                                                 for(int i=1;i<ncpus;i++){
590                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
591                                                 }
592                                         }
593                                         
594                                 }       
595                                 
596                                 if (m->control_pressed) { break; }
597                                 
598                                 m->mothurOut("\nFinalizing...\n");
599                                 fill();
600                                 
601                                 if (m->control_pressed) { break; }
602                                 
603                                 setOTUs();
604                                 
605                                 vector<int> otuCounts(numOTUs, 0);
606                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
607                                 calcCentroidsDriver(0, numOTUs);
608                                 
609                                 if (m->control_pressed) { break; }
610                                 
611                                 writeQualities(otuCounts);      if (m->control_pressed) { break; }
612                                 writeSequences(otuCounts);      if (m->control_pressed) { break; }
613                                 writeNames(otuCounts);          if (m->control_pressed) { break; }
614                                 writeClusters(otuCounts);       if (m->control_pressed) { break; }
615                                 writeGroups();                          if (m->control_pressed) { break; }
616                                 
617                                                                  
618                                 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');                 
619                         }
620                 }
621                 else{
622                         int abort = 1;
623
624                         MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
625                         if(abort){      return 0;       }
626
627                         int numFiles;
628                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
629
630                         for(int i=0;i<numFiles;i++){
631                                 
632                                 if (m->control_pressed) { break; }
633                                 
634                                 //Now into the pyrodist part
635                                 bool live = 1;
636
637                                 char fileName[1024];
638                                 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
639                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
640                                 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
641                                 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
642                                 
643                                 flowDataIntI.resize(numSeqs * numFlowCells);
644                                 flowDataPrI.resize(numSeqs * numFlowCells);
645                                 mapUniqueToSeq.resize(numSeqs);
646                                 mapSeqToUnique.resize(numSeqs);
647                                 lengths.resize(numSeqs);
648                                 jointLookUp.resize(NUMBINS * NUMBINS);
649
650                                 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
651                                 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
652                                 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
653                                 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
654                                 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
655                                 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
656                                 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
657
658                                 flowFileName = string(fileName);
659                                 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
660                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
661                                 
662                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
663                                 
664                                 if (m->control_pressed) { break; }
665                                 
666                                 int done = 1;
667                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
668
669                                 //Now into the pyronoise part
670                                 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
671                                 
672                                 singleLookUp.resize(HOMOPS * NUMBINS);
673                                 uniqueFlowgrams.resize(numUniques * numFlowCells);
674                                 weight.resize(numOTUs);
675                                 centroids.resize(numOTUs);
676                                 change.resize(numOTUs);
677                                 dist.assign(numOTUs * numSeqs, 0);
678                                 nSeqsPerOTU.resize(numOTUs);
679                                 cumNumSeqs.resize(numOTUs);
680
681                                 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
682                                 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
683                                 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
684                                 
685                                 int startOTU = pid * numOTUs / ncpus;
686                                 int endOTU = (pid + 1) * numOTUs / ncpus;
687
688                                 int startSeq = pid * numSeqs / ncpus;
689                                 int endSeq = (pid + 1) * numSeqs /ncpus;
690                                 
691                                 int total;
692
693                                 while(live){
694                                         
695                                         if (m->control_pressed) { break; }
696                                         
697                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
698                                         singleTau.assign(total, 0.0000);
699                                         seqNumber.assign(total, 0);
700                                         seqIndex.assign(total, 0);
701
702                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
703                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
704                                         MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
705                                         MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
706                                         MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
707                                         MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
708                                         MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
709
710                                         calcCentroidsDriver(startOTU, endOTU);
711
712                                         MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
713                                         MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
714
715                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
716                                         MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
717                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
718
719                                         vector<int> otuIndex(total, 0);
720                                         calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
721                                         total = otuIndex.size();
722                                         
723                                         MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
724                                         MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
725                                         MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
726                                         MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
727                                         MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
728                                 
729                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
730                                 }
731                         }
732                 }
733                 
734                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
735                 
736                 MPI_Barrier(MPI_COMM_WORLD);
737
738                 
739                 if(compositeFASTAFileName != ""){
740                         outputNames.push_back(compositeFASTAFileName);
741                         outputNames.push_back(compositeNamesFileName);
742                 }
743
744                 m->mothurOutEndLine();
745                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
746                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
747                 m->mothurOutEndLine();
748                 
749                 return 0;
750
751         }
752         catch(exception& e) {
753                 m->errorOut(e, "ShhherCommand", "execute");
754                 exit(1);
755         }
756 }
757 /**************************************************************************************************/
758 string ShhherCommand::createNamesFile(){
759         try{
760                 
761                 vector<string> duplicateNames(numUniques, "");
762                 for(int i=0;i<numSeqs;i++){
763                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
764                 }
765                 
766                 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
767                 
768                 ofstream nameFile;
769                 m->openOutputFile(nameFileName, nameFile);
770                 
771                 for(int i=0;i<numUniques;i++){
772                         
773                         if (m->control_pressed) { break; }
774                         
775             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
776                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
777                 }
778                 
779                 nameFile.close();
780                 return  nameFileName;
781         }
782         catch(exception& e) {
783                 m->errorOut(e, "ShhherCommand", "createNamesFile");
784                 exit(1);
785         }
786 }
787 /**************************************************************************************************/
788
789 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
790         try{            
791                 ostringstream outStream;
792                 outStream.setf(ios::fixed, ios::floatfield);
793                 outStream.setf(ios::dec, ios::basefield);
794                 outStream.setf(ios::showpoint);
795                 outStream.precision(6);
796                 
797                 int begTime = time(NULL);
798                 double begClock = clock();
799                 
800                 for(int i=startSeq;i<stopSeq;i++){
801                         
802                         if (m->control_pressed) { break; }
803                         
804                         for(int j=0;j<i;j++){
805                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
806                                 
807                                 if(flowDistance < 1e-6){
808                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
809                                 }
810                                 else if(flowDistance <= cutoff){
811                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
812                                 }
813                         }
814                         if(i % 100 == 0){
815                                 m->mothurOut(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
816                         }
817                 }
818                 
819                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
820                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
821                 
822                 if (m->control_pressed) { return fDistFileName; }
823                 
824                 m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
825
826                 ofstream distFile(fDistFileName.c_str());
827                 distFile << outStream.str();            
828                 distFile.close();
829                 
830                 return fDistFileName;
831         }
832         catch(exception& e) {
833                 m->errorOut(e, "ShhherCommand", "flowDistMPI");
834                 exit(1);
835         }
836 }
837 /**************************************************************************************************/
838  
839 void ShhherCommand::getOTUData(string listFileName){
840     try {
841         
842         ifstream listFile;
843         m->openInputFile(listFileName, listFile);
844         string label;
845         
846         listFile >> label >> numOTUs;
847         
848         otuData.assign(numSeqs, 0);
849         cumNumSeqs.assign(numOTUs, 0);
850         nSeqsPerOTU.assign(numOTUs, 0);
851         aaP.clear();aaP.resize(numOTUs);
852         
853         seqNumber.clear();
854         aaI.clear();
855         seqIndex.clear();
856         
857         string singleOTU = "";
858         
859         for(int i=0;i<numOTUs;i++){
860             
861             if (m->control_pressed) { break; }
862             
863             listFile >> singleOTU;
864             
865             istringstream otuString(singleOTU);
866             
867             while(otuString){
868                 
869                 string seqName = "";
870                 
871                 for(int j=0;j<singleOTU.length();j++){
872                     char letter = otuString.get();
873                     
874                     if(letter != ','){
875                         seqName += letter;
876                     }
877                     else{
878                         map<string,int>::iterator nmIt = nameMap.find(seqName);
879                         int index = nmIt->second;
880                         
881                         nameMap.erase(nmIt);
882                         
883                         otuData[index] = i;
884                         nSeqsPerOTU[i]++;
885                         aaP[i].push_back(index);
886                         seqName = "";
887                     }
888                 }
889                 
890                 map<string,int>::iterator nmIt = nameMap.find(seqName);
891                 
892                 int index = nmIt->second;
893                 nameMap.erase(nmIt);
894                 
895                 otuData[index] = i;
896                 nSeqsPerOTU[i]++;
897                 aaP[i].push_back(index);        
898                 
899                 otuString.get();
900             }
901             
902             sort(aaP[i].begin(), aaP[i].end());
903             for(int j=0;j<nSeqsPerOTU[i];j++){
904                 seqNumber.push_back(aaP[i][j]);
905             }
906             for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
907                 aaP[i].push_back(0);
908             }
909             
910             
911         }
912         
913         for(int i=1;i<numOTUs;i++){
914             cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
915         }
916         aaI = aaP;
917         seqIndex = seqNumber;
918         
919         listFile.close();       
920         
921     }
922     catch(exception& e) {
923         m->errorOut(e, "ShhherCommand", "getOTUData");
924         exit(1);        
925     }           
926 }
927
928 /**************************************************************************************************/
929
930 void ShhherCommand::initPyroCluster(){                          
931     try{
932         if (numOTUs < processors) { processors = 1; }
933         
934         dist.assign(numSeqs * numOTUs, 0);
935         change.assign(numOTUs, 1);
936         centroids.assign(numOTUs, -1);
937         weight.assign(numOTUs, 0);
938         singleTau.assign(numSeqs, 1.0);
939         
940         nSeqsBreaks.assign(processors+1, 0);
941         nOTUsBreaks.assign(processors+1, 0);
942         
943         nSeqsBreaks[0] = 0;
944         for(int i=0;i<processors;i++){
945             nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
946             nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
947         }
948         nSeqsBreaks[processors] = numSeqs;
949         nOTUsBreaks[processors] = numOTUs;
950     }
951     catch(exception& e) {
952         m->errorOut(e, "ShhherCommand", "initPyroCluster");
953         exit(1);        
954     }
955 }
956
957 /**************************************************************************************************/
958
959 void ShhherCommand::fill(){
960     try {
961         int index = 0;
962         for(int i=0;i<numOTUs;i++){
963             
964             if (m->control_pressed) { break; }
965             
966             cumNumSeqs[i] = index;
967             for(int j=0;j<nSeqsPerOTU[i];j++){
968                 seqNumber[index] = aaP[i][j];
969                 seqIndex[index] = aaI[i][j];
970                 
971                 index++;
972             }
973         }
974     }
975     catch(exception& e) {
976         m->errorOut(e, "ShhherCommand", "fill");
977         exit(1);        
978     }           
979 }
980
981 /**************************************************************************************************/
982  
983 void ShhherCommand::getFlowData(){
984     try{
985         ifstream flowFile;
986         m->openInputFile(flowFileName, flowFile);
987         
988         string seqName;
989         seqNameVector.clear();
990         lengths.clear();
991         flowDataIntI.clear();
992         nameMap.clear();
993         
994         
995         int currentNumFlowCells;
996         
997         float intensity;
998         
999         flowFile >> numFlowCells;
1000         int index = 0;//pcluster
1001         while(!flowFile.eof()){
1002             
1003             if (m->control_pressed) { break; }
1004             
1005             flowFile >> seqName >> currentNumFlowCells;
1006             lengths.push_back(currentNumFlowCells);
1007             
1008             seqNameVector.push_back(seqName);
1009             nameMap[seqName] = index++;//pcluster
1010             
1011             for(int i=0;i<numFlowCells;i++){
1012                 flowFile >> intensity;
1013                 if(intensity > 9.99)    {       intensity = 9.99;       }
1014                 int intI = int(100 * intensity + 0.0001);
1015                 flowDataIntI.push_back(intI);
1016             }
1017             m->gobble(flowFile);
1018         }
1019         flowFile.close();
1020         
1021         numSeqs = seqNameVector.size();         
1022         
1023         for(int i=0;i<numSeqs;i++){
1024             
1025             if (m->control_pressed) { break; }
1026             
1027             int iNumFlowCells = i * numFlowCells;
1028             for(int j=lengths[i];j<numFlowCells;j++){
1029                 flowDataIntI[iNumFlowCells + j] = 0;
1030             }
1031         }
1032         
1033     }
1034     catch(exception& e) {
1035         m->errorOut(e, "ShhherCommand", "getFlowData");
1036         exit(1);
1037     }
1038 }
1039 /**************************************************************************************************/
1040 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1041         
1042         try{
1043                 vector<double> newTau(numOTUs,0);
1044                 vector<double> norms(numSeqs, 0);
1045                 otuIndex.clear();
1046                 seqIndex.clear();
1047                 singleTau.clear();
1048                 
1049                 for(int i=startSeq;i<stopSeq;i++){
1050                         
1051                         if (m->control_pressed) { break; }
1052                         
1053                         double offset = 1e8;
1054                         int indexOffset = i * numOTUs;
1055                         
1056                         for(int j=0;j<numOTUs;j++){
1057                                 
1058                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1059                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1060                                 }
1061                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1062                                         offset = dist[indexOffset + j];
1063                                 }
1064                         }
1065                         
1066                         for(int j=0;j<numOTUs;j++){
1067                                 if(weight[j] > MIN_WEIGHT){
1068                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1069                                         norms[i] += newTau[j];
1070                                 }
1071                                 else{
1072                                         newTau[j] = 0.0;
1073                                 }
1074                         }
1075                         
1076                         for(int j=0;j<numOTUs;j++){
1077                 
1078                                 newTau[j] /= norms[i];
1079                                 
1080                                 if(newTau[j] > MIN_TAU){
1081                                         otuIndex.push_back(j);
1082                                         seqIndex.push_back(i);
1083                                         singleTau.push_back(newTau[j]);
1084                                 }
1085                         }
1086                         
1087                 }
1088         }
1089         catch(exception& e) {
1090                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1091                 exit(1);        
1092         }               
1093 }
1094
1095 /**************************************************************************************************/
1096
1097 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1098     
1099     try{
1100         
1101         int total = 0;
1102         vector<double> newTau(numOTUs,0);
1103         vector<double> norms(numSeqs, 0);
1104         nSeqsPerOTU.assign(numOTUs, 0);
1105         
1106         for(int i=startSeq;i<stopSeq;i++){
1107             
1108             if (m->control_pressed) { break; }
1109             
1110             int indexOffset = i * numOTUs;
1111             
1112             double offset = 1e8;
1113             
1114             for(int j=0;j<numOTUs;j++){
1115                 
1116                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1117                     dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1118                 }
1119                 
1120                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1121                     offset = dist[indexOffset + j];
1122                 }
1123             }
1124             
1125             for(int j=0;j<numOTUs;j++){
1126                 if(weight[j] > MIN_WEIGHT){
1127                     newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1128                     norms[i] += newTau[j];
1129                 }
1130                 else{
1131                     newTau[j] = 0.0;
1132                 }
1133             }
1134             
1135             for(int j=0;j<numOTUs;j++){
1136                 newTau[j] /= norms[i];
1137             }
1138             
1139             for(int j=0;j<numOTUs;j++){
1140                 if(newTau[j] > MIN_TAU){
1141                     
1142                     int oldTotal = total;
1143                     
1144                     total++;
1145                     
1146                     singleTau.resize(total, 0);
1147                     seqNumber.resize(total, 0);
1148                     seqIndex.resize(total, 0);
1149                     
1150                     singleTau[oldTotal] = newTau[j];
1151                     
1152                     aaP[j][nSeqsPerOTU[j]] = oldTotal;
1153                     aaI[j][nSeqsPerOTU[j]] = i;
1154                     nSeqsPerOTU[j]++;
1155                 }
1156             }
1157             
1158         }
1159         
1160     }
1161     catch(exception& e) {
1162         m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1163         exit(1);        
1164     }           
1165 }
1166
1167 /**************************************************************************************************/
1168
1169 void ShhherCommand::setOTUs(){
1170     
1171     try {
1172         vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1173         
1174         for(int i=0;i<numOTUs;i++){
1175             
1176             if (m->control_pressed) { break; }
1177             
1178             for(int j=0;j<nSeqsPerOTU[i];j++){
1179                 int index = cumNumSeqs[i] + j;
1180                 double tauValue = singleTau[seqNumber[index]];
1181                 int sIndex = seqIndex[index];
1182                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
1183             }
1184         }
1185         
1186         for(int i=0;i<numSeqs;i++){
1187             double maxTau = -1.0000;
1188             int maxOTU = -1;
1189             for(int j=0;j<numOTUs;j++){
1190                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1191                     maxTau = bigTauMatrix[i * numOTUs + j];
1192                     maxOTU = j;
1193                 }
1194             }
1195             
1196             otuData[i] = maxOTU;
1197         }
1198         
1199         nSeqsPerOTU.assign(numOTUs, 0);         
1200         
1201         for(int i=0;i<numSeqs;i++){
1202             int index = otuData[i];
1203             
1204             singleTau[i] = 1.0000;
1205             dist[i] = 0.0000;
1206             
1207             aaP[index][nSeqsPerOTU[index]] = i;
1208             aaI[index][nSeqsPerOTU[index]] = i;
1209             
1210             nSeqsPerOTU[index]++;
1211         }
1212         fill(); 
1213     }
1214     catch(exception& e) {
1215         m->errorOut(e, "ShhherCommand", "setOTUs");
1216         exit(1);        
1217     }           
1218 }
1219
1220 /**************************************************************************************************/
1221  
1222 void ShhherCommand::getUniques(){
1223     try{
1224         
1225         
1226         numUniques = 0;
1227         uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
1228         uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
1229         uniqueLengths.assign(numSeqs, 0);
1230         mapSeqToUnique.assign(numSeqs, -1);
1231         mapUniqueToSeq.assign(numSeqs, -1);
1232         
1233         vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
1234         
1235         for(int i=0;i<numSeqs;i++){
1236             
1237             if (m->control_pressed) { break; }
1238             
1239             int index = 0;
1240             
1241             vector<short> current(numFlowCells);
1242             for(int j=0;j<numFlowCells;j++){
1243                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
1244             }
1245             
1246             for(int j=0;j<numUniques;j++){
1247                 int offset = j * numFlowCells;
1248                 bool toEnd = 1;
1249                 
1250                 int shorterLength;
1251                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
1252                 else                                                            {       shorterLength = uniqueLengths[j];       }
1253                 
1254                 for(int k=0;k<shorterLength;k++){
1255                     if(current[k] != uniqueFlowgrams[offset + k]){
1256                         toEnd = 0;
1257                         break;
1258                     }
1259                 }
1260                 
1261                 if(toEnd){
1262                     mapSeqToUnique[i] = j;
1263                     uniqueCount[j]++;
1264                     index = j;
1265                     if(lengths[i] > uniqueLengths[j])   {       uniqueLengths[j] = lengths[i];  }
1266                     break;
1267                 }
1268                 index++;
1269             }
1270             
1271             if(index == numUniques){
1272                 uniqueLengths[numUniques] = lengths[i];
1273                 uniqueCount[numUniques] = 1;
1274                 mapSeqToUnique[i] = numUniques;//anMap
1275                 mapUniqueToSeq[numUniques] = i;//anF
1276                 
1277                 for(int k=0;k<numFlowCells;k++){
1278                     uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
1279                     uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
1280                 }
1281                 
1282                 numUniques++;
1283             }
1284         }
1285         uniqueFlowDataIntI.resize(numFlowCells * numUniques);
1286         uniqueLengths.resize(numUniques);       
1287         
1288         flowDataPrI.resize(numSeqs * numFlowCells, 0);
1289         for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
1290     }
1291     catch(exception& e) {
1292         m->errorOut(e, "ShhherCommand", "getUniques");
1293         exit(1);
1294     }
1295 }
1296
1297 /**************************************************************************************************/
1298
1299 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
1300     try{
1301         int minLength = lengths[mapSeqToUnique[seqA]];
1302         if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
1303         
1304         int ANumFlowCells = seqA * numFlowCells;
1305         int BNumFlowCells = seqB * numFlowCells;
1306         
1307         float dist = 0;
1308         
1309         for(int i=0;i<minLength;i++){
1310             
1311             if (m->control_pressed) { break; }
1312             
1313             int flowAIntI = flowDataIntI[ANumFlowCells + i];
1314             float flowAPrI = flowDataPrI[ANumFlowCells + i];
1315             
1316             int flowBIntI = flowDataIntI[BNumFlowCells + i];
1317             float flowBPrI = flowDataPrI[BNumFlowCells + i];
1318             dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
1319         }
1320         
1321         dist /= (float) minLength;
1322         return dist;
1323     }
1324     catch(exception& e) {
1325         m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
1326         exit(1);
1327     }
1328 }
1329
1330 //**********************************************************************************************************************/
1331
1332 string ShhherCommand::cluster(string distFileName, string namesFileName){
1333     try {
1334         
1335         ReadMatrix* read = new ReadColumnMatrix(distFileName);  
1336         read->setCutoff(cutoff);
1337         
1338         NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1339         clusterNameMap->readMap();
1340         read->read(clusterNameMap);
1341         
1342         ListVector* list = read->getListVector();
1343         SparseMatrix* matrix = read->getMatrix();
1344         
1345         delete read; 
1346         delete clusterNameMap; 
1347         
1348         RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1349         
1350         Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
1351         string tag = cluster->getTag();
1352         
1353         double clusterCutoff = cutoff;
1354         while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1355             
1356             if (m->control_pressed) { break; }
1357             
1358             cluster->update(clusterCutoff);
1359         }
1360         
1361         list->setLabel(toString(cutoff));
1362         
1363         string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
1364         ofstream listFile;
1365         m->openOutputFile(listFileName, listFile);
1366         list->print(listFile);
1367         listFile.close();
1368         
1369         delete matrix;  delete cluster; delete rabund; delete list;
1370         
1371         return listFileName;
1372     }
1373     catch(exception& e) {
1374         m->errorOut(e, "ShhherCommand", "cluster");
1375         exit(1);        
1376     }           
1377 }
1378
1379 /**************************************************************************************************/
1380
1381 void ShhherCommand::calcCentroidsDriver(int start, int finish){                          
1382     
1383     //this function gets the most likely homopolymer length at a flow position for a group of sequences
1384     //within an otu
1385     
1386     try{
1387         
1388         for(int i=start;i<finish;i++){
1389             
1390             if (m->control_pressed) { break; }
1391             
1392             double count = 0;
1393             int position = 0;
1394             int minFlowGram = 100000000;
1395             double minFlowValue = 1e8;
1396             change[i] = 0; //FALSE
1397             
1398             for(int j=0;j<nSeqsPerOTU[i];j++){
1399                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1400             }
1401             
1402             if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1403                 vector<double> adF(nSeqsPerOTU[i]);
1404                 vector<int> anL(nSeqsPerOTU[i]);
1405                 
1406                 for(int j=0;j<nSeqsPerOTU[i];j++){
1407                     int index = cumNumSeqs[i] + j;
1408                     int nI = seqIndex[index];
1409                     int nIU = mapSeqToUnique[nI];
1410                     
1411                     int k;
1412                     for(k=0;k<position;k++){
1413                         if(nIU == anL[k]){
1414                             break;
1415                         }
1416                     }
1417                     if(k == position){
1418                         anL[position] = nIU;
1419                         adF[position] = 0.0000;
1420                         position++;
1421                     }                                           
1422                 }
1423                 
1424                 for(int j=0;j<nSeqsPerOTU[i];j++){
1425                     int index = cumNumSeqs[i] + j;
1426                     int nI = seqIndex[index];
1427                     
1428                     double tauValue = singleTau[seqNumber[index]];
1429                     
1430                     for(int k=0;k<position;k++){
1431                         double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1432                         adF[k] += dist * tauValue;
1433                     }
1434                 }
1435                 
1436                 for(int j=0;j<position;j++){
1437                     if(adF[j] < minFlowValue){
1438                         minFlowGram = j;
1439                         minFlowValue = adF[j];
1440                     }
1441                 }
1442                 
1443                 if(centroids[i] != anL[minFlowGram]){
1444                     change[i] = 1;
1445                     centroids[i] = anL[minFlowGram];
1446                 }
1447             }
1448             else if(centroids[i] != -1){
1449                 change[i] = 1;
1450                 centroids[i] = -1;                      
1451             }
1452         }
1453     }
1454     catch(exception& e) {
1455         m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1456         exit(1);        
1457     }           
1458 }
1459
1460 /**************************************************************************************************/
1461
1462 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1463     try{
1464         
1465         int flowAValue = cent * numFlowCells;
1466         int flowBValue = flow * numFlowCells;
1467         
1468         double dist = 0;
1469         
1470         for(int i=0;i<length;i++){
1471             dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1472             flowAValue++;
1473             flowBValue++;
1474         }
1475         
1476         return dist / (double)length;
1477     }
1478     catch(exception& e) {
1479         m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1480         exit(1);        
1481     }           
1482 }
1483
1484 /**************************************************************************************************/
1485
1486 double ShhherCommand::getNewWeights(){
1487     try{
1488         
1489         double maxChange = 0;
1490         
1491         for(int i=0;i<numOTUs;i++){
1492             
1493             if (m->control_pressed) { break; }
1494             
1495             double difference = weight[i];
1496             weight[i] = 0;
1497             
1498             for(int j=0;j<nSeqsPerOTU[i];j++){
1499                 int index = cumNumSeqs[i] + j;
1500                 double tauValue = singleTau[seqNumber[index]];
1501                 weight[i] += tauValue;
1502             }
1503             
1504             difference = fabs(weight[i] - difference);
1505             if(difference > maxChange){ maxChange = difference; }
1506         }
1507         return maxChange;
1508     }
1509     catch(exception& e) {
1510         m->errorOut(e, "ShhherCommand", "getNewWeights");
1511         exit(1);        
1512     }           
1513 }
1514  
1515  /**************************************************************************************************/
1516  
1517 double ShhherCommand::getLikelihood(){
1518     
1519     try{
1520         
1521         vector<long double> P(numSeqs, 0);
1522         int effNumOTUs = 0;
1523         
1524         for(int i=0;i<numOTUs;i++){
1525             if(weight[i] > MIN_WEIGHT){
1526                 effNumOTUs++;
1527             }
1528         }
1529         
1530         string hold;
1531         for(int i=0;i<numOTUs;i++){
1532             
1533             if (m->control_pressed) { break; }
1534             
1535             for(int j=0;j<nSeqsPerOTU[i];j++){
1536                 int index = cumNumSeqs[i] + j;
1537                 int nI = seqIndex[index];
1538                 double singleDist = dist[seqNumber[index]];
1539                 
1540                 P[nI] += weight[i] * exp(-singleDist * sigma);
1541             }
1542         }
1543         double nLL = 0.00;
1544         for(int i=0;i<numSeqs;i++){
1545             if(P[i] == 0){      P[i] = DBL_EPSILON;     }
1546             
1547             nLL += -log(P[i]);
1548         }
1549         
1550         nLL = nLL -(double)numSeqs * log(sigma);
1551         
1552         return nLL; 
1553     }
1554     catch(exception& e) {
1555         m->errorOut(e, "ShhherCommand", "getNewWeights");
1556         exit(1);        
1557     }           
1558 }
1559
1560 /**************************************************************************************************/
1561
1562 void ShhherCommand::checkCentroids(){
1563     try{
1564         vector<int> unique(numOTUs, 1);
1565         
1566         for(int i=0;i<numOTUs;i++){
1567             if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1568                 unique[i] = -1;
1569             }
1570         }
1571         
1572         for(int i=0;i<numOTUs;i++){
1573             
1574             if (m->control_pressed) { break; }
1575             
1576             if(unique[i] == 1){
1577                 for(int j=i+1;j<numOTUs;j++){
1578                     if(unique[j] == 1){
1579                         
1580                         if(centroids[j] == centroids[i]){
1581                             unique[j] = 0;
1582                             centroids[j] = -1;
1583                             
1584                             weight[i] += weight[j];
1585                             weight[j] = 0.0;
1586                         }
1587                     }
1588                 }
1589             }
1590         }
1591     }
1592     catch(exception& e) {
1593         m->errorOut(e, "ShhherCommand", "checkCentroids");
1594         exit(1);        
1595     }           
1596 }
1597  /**************************************************************************************************/
1598
1599
1600  
1601 void ShhherCommand::writeQualities(vector<int> otuCounts){
1602     
1603     try {
1604         string thisOutputDir = outputDir;
1605         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1606         string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.qual";
1607         
1608         ofstream qualityFile;
1609         m->openOutputFile(qualityFileName, qualityFile);
1610         
1611         qualityFile.setf(ios::fixed, ios::floatfield);
1612         qualityFile.setf(ios::showpoint);
1613         qualityFile << setprecision(6);
1614         
1615         vector<vector<int> > qualities(numOTUs);
1616         vector<double> pr(HOMOPS, 0);
1617         
1618         
1619         for(int i=0;i<numOTUs;i++){
1620             
1621             if (m->control_pressed) { break; }
1622             
1623             int index = 0;
1624             int base = 0;
1625             
1626             if(nSeqsPerOTU[i] > 0){
1627                 qualities[i].assign(1024, -1);
1628                 
1629                 while(index < numFlowCells){
1630                     double maxPrValue = 1e8;
1631                     short maxPrIndex = -1;
1632                     double count = 0.0000;
1633                     
1634                     pr.assign(HOMOPS, 0);
1635                     
1636                     for(int j=0;j<nSeqsPerOTU[i];j++){
1637                         int lIndex = cumNumSeqs[i] + j;
1638                         double tauValue = singleTau[seqNumber[lIndex]];
1639                         int sequenceIndex = aaI[i][j];
1640                         short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1641                         
1642                         count += tauValue;
1643                         
1644                         for(int s=0;s<HOMOPS;s++){
1645                             pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1646                         }
1647                     }
1648                     
1649                     maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1650                     maxPrValue = pr[maxPrIndex];
1651                     
1652                     if(count > MIN_COUNT){
1653                         double U = 0.0000;
1654                         double norm = 0.0000;
1655                         
1656                         for(int s=0;s<HOMOPS;s++){
1657                             norm += exp(-(pr[s] - maxPrValue));
1658                         }
1659                         
1660                         for(int s=1;s<=maxPrIndex;s++){
1661                             int value = 0;
1662                             double temp = 0.0000;
1663                             
1664                             U += exp(-(pr[s-1]-maxPrValue))/norm;
1665                             
1666                             if(U>0.00){
1667                                 temp = log10(U);
1668                             }
1669                             else{
1670                                 temp = -10.1;
1671                             }
1672                             temp = floor(-10 * temp);
1673                             value = (int)floor(temp);
1674                             if(value > 100){    value = 100;    }
1675                             
1676                             qualities[i][base] = (int)value;
1677                             base++;
1678                         }
1679                     }
1680                     
1681                     index++;
1682                 }
1683             }
1684             
1685             
1686             if(otuCounts[i] > 0){
1687                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1688                 
1689                 int j=4;        //need to get past the first four bases
1690                 while(qualities[i][j] != -1){
1691                     qualityFile << qualities[i][j] << ' ';
1692                     j++;
1693                 }
1694                 qualityFile << endl;
1695             }
1696         }
1697         qualityFile.close();
1698         outputNames.push_back(qualityFileName);
1699         
1700     }
1701     catch(exception& e) {
1702         m->errorOut(e, "ShhherCommand", "writeQualities");
1703         exit(1);        
1704     }           
1705 }
1706
1707 /**************************************************************************************************/
1708
1709 void ShhherCommand::writeSequences(vector<int> otuCounts){
1710     try {
1711         string thisOutputDir = outputDir;
1712         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1713         string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.fasta";
1714         ofstream fastaFile;
1715         m->openOutputFile(fastaFileName, fastaFile);
1716         
1717         vector<string> names(numOTUs, "");
1718         
1719         for(int i=0;i<numOTUs;i++){
1720             
1721             if (m->control_pressed) { break; }
1722             
1723             int index = centroids[i];
1724             
1725             if(otuCounts[i] > 0){
1726                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1727                 
1728                 string newSeq = "";
1729                 
1730                 for(int j=0;j<numFlowCells;j++){
1731                     
1732                     char base = flowOrder[j % 4];
1733                     for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1734                         newSeq += base;
1735                     }
1736                 }
1737                 
1738                 fastaFile << newSeq.substr(4) << endl;
1739             }
1740         }
1741         fastaFile.close();
1742         
1743         outputNames.push_back(fastaFileName);
1744         
1745         if(compositeFASTAFileName != ""){
1746             m->appendFiles(fastaFileName, compositeFASTAFileName);
1747         }
1748     }
1749     catch(exception& e) {
1750         m->errorOut(e, "ShhherCommand", "writeSequences");
1751         exit(1);        
1752     }           
1753 }
1754
1755 /**************************************************************************************************/
1756
1757 void ShhherCommand::writeNames(vector<int> otuCounts){
1758     try {
1759         string thisOutputDir = outputDir;
1760         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1761         string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.names";
1762         ofstream nameFile;
1763         m->openOutputFile(nameFileName, nameFile);
1764         
1765         for(int i=0;i<numOTUs;i++){
1766             
1767             if (m->control_pressed) { break; }
1768             
1769             if(otuCounts[i] > 0){
1770                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1771                 
1772                 for(int j=1;j<nSeqsPerOTU[i];j++){
1773                     nameFile << ',' << seqNameVector[aaI[i][j]];
1774                 }
1775                 
1776                 nameFile << endl;
1777             }
1778         }
1779         nameFile.close();
1780         outputNames.push_back(nameFileName);
1781         
1782         
1783         if(compositeNamesFileName != ""){
1784             m->appendFiles(nameFileName, compositeNamesFileName);
1785         }               
1786     }
1787     catch(exception& e) {
1788         m->errorOut(e, "ShhherCommand", "writeNames");
1789         exit(1);        
1790     }           
1791 }
1792
1793 /**************************************************************************************************/
1794
1795 void ShhherCommand::writeGroups(){
1796     try {
1797         string thisOutputDir = outputDir;
1798         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1799         string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1800         string groupFileName = fileRoot + "shhh.groups";
1801         ofstream groupFile;
1802         m->openOutputFile(groupFileName, groupFile);
1803         
1804         for(int i=0;i<numSeqs;i++){
1805             if (m->control_pressed) { break; }
1806             groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
1807         }
1808         groupFile.close();
1809         outputNames.push_back(groupFileName);
1810         
1811     }
1812     catch(exception& e) {
1813         m->errorOut(e, "ShhherCommand", "writeGroups");
1814         exit(1);        
1815     }           
1816 }
1817
1818 /**************************************************************************************************/
1819
1820 void ShhherCommand::writeClusters(vector<int> otuCounts){
1821     try {
1822         string thisOutputDir = outputDir;
1823         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1824         string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.counts";
1825         ofstream otuCountsFile;
1826         m->openOutputFile(otuCountsFileName, otuCountsFile);
1827         
1828         string bases = flowOrder;
1829         
1830         for(int i=0;i<numOTUs;i++){
1831             
1832             if (m->control_pressed) {
1833                 break;
1834             }
1835             //output the translated version of the centroid sequence for the otu
1836             if(otuCounts[i] > 0){
1837                 int index = centroids[i];
1838                 
1839                 otuCountsFile << "ideal\t";
1840                 for(int j=8;j<numFlowCells;j++){
1841                     char base = bases[j % 4];
1842                     for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1843                         otuCountsFile << base;
1844                     }
1845                 }
1846                 otuCountsFile << endl;
1847                 
1848                 for(int j=0;j<nSeqsPerOTU[i];j++){
1849                     int sequence = aaI[i][j];
1850                     otuCountsFile << seqNameVector[sequence] << '\t';
1851                     
1852                     string newSeq = "";
1853                     
1854                     for(int k=0;k<lengths[sequence];k++){
1855                         char base = bases[k % 4];
1856                         int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1857                         
1858                         for(int s=0;s<freq;s++){
1859                             newSeq += base;
1860                             //otuCountsFile << base;
1861                         }
1862                     }
1863                     otuCountsFile << newSeq.substr(4) << endl;
1864                 }
1865                 otuCountsFile << endl;
1866             }
1867         }
1868         otuCountsFile.close();
1869         outputNames.push_back(otuCountsFileName);
1870         
1871     }
1872     catch(exception& e) {
1873         m->errorOut(e, "ShhherCommand", "writeClusters");
1874         exit(1);        
1875     }           
1876 }
1877
1878 #else
1879 //**********************************************************************************************************************
1880
1881 int ShhherCommand::execute(){
1882         try {
1883                 if (abort == true) { return 0; }
1884                 
1885                 getSingleLookUp();      if (m->control_pressed) { return 0; }
1886                 getJointLookUp();       if (m->control_pressed) { return 0; }
1887                 
1888         int numFiles = flowFileVector.size();
1889                 
1890         if (numFiles < processors) { processors = numFiles; }
1891         
1892 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1893         if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size()); }
1894         else { createProcesses(flowFileVector); } //each processor processes one file
1895 #else
1896         driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size());
1897 #endif
1898         
1899                 if(compositeFASTAFileName != ""){
1900                         outputNames.push_back(compositeFASTAFileName);
1901                         outputNames.push_back(compositeNamesFileName);
1902                 }
1903
1904                 m->mothurOutEndLine();
1905                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
1906                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
1907                 m->mothurOutEndLine();
1908                 
1909                 return 0;
1910         }
1911         catch(exception& e) {
1912                 m->errorOut(e, "ShhherCommand", "execute");
1913                 exit(1);
1914         }
1915 }
1916 #endif
1917 /**************************************************************************************************/
1918
1919 int ShhherCommand::createProcesses(vector<string> filenames){
1920     try {
1921         vector<int> processIDS;
1922                 int process = 1;
1923                 int num = 0;
1924                 
1925                 //sanity check
1926                 if (filenames.size() < processors) { processors = filenames.size(); }
1927                 
1928                 //divide the groups between the processors
1929                 vector<linePair> lines;
1930                 int numFilesPerProcessor = filenames.size() / processors;
1931                 for (int i = 0; i < processors; i++) {
1932                         int startIndex =  i * numFilesPerProcessor;
1933                         int endIndex = (i+1) * numFilesPerProcessor;
1934                         if(i == (processors - 1)){      endIndex = filenames.size();    }
1935                         lines.push_back(linePair(startIndex, endIndex));
1936                 }
1937                 
1938         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)          
1939                 
1940                 //loop through and create all the processes you want
1941                 while (process != processors) {
1942                         int pid = fork();
1943                         
1944                         if (pid > 0) {
1945                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1946                                 process++;
1947                         }else if (pid == 0){
1948                                 num = driver(filenames, compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName  + toString(getpid()) + ".temp", lines[process].start, lines[process].end);
1949                                 exit(0);
1950                         }else { 
1951                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
1952                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1953                                 exit(0);
1954                         }
1955                 }
1956                 
1957                 //do my part
1958                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[0].start, lines[0].end);
1959                 
1960                 //force parent to wait until all the processes are done
1961                 for (int i=0;i<processIDS.size();i++) { 
1962                         int temp = processIDS[i];
1963                         wait(&temp);
1964                 }
1965         
1966         #else
1967         
1968         //////////////////////////////////////////////////////////////////////////////////////////////////////
1969         
1970         /////////////////////// NOT WORKING, ACCESS VIOLATION ON READ OF FLOWGRAMS IN THREAD /////////////////
1971         
1972         //////////////////////////////////////////////////////////////////////////////////////////////////////
1973                 //Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct. 
1974                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
1975                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1976                 
1977                 vector<shhhFlowsData*> pDataArray; 
1978                 DWORD   dwThreadIdArray[processors-1];
1979                 HANDLE  hThreadArray[processors-1]; 
1980                 
1981                 //Create processor worker threads.
1982                 for( int i=0; i<processors-1; i++ ){
1983                         // Allocate memory for thread data.
1984                         string extension = "";
1985                         if (i != 0) { extension = toString(i) + ".temp"; }
1986                         
1987             shhhFlowsData* tempFlow = new shhhFlowsData(filenames, (compositeFASTAFileName + extension), (compositeNamesFileName + extension), outputDir, flowOrder, jointLookUp, singleLookUp, m, lines[i].start, lines[i].end, cutoff, sigma, minDelta, maxIters, i);
1988                         pDataArray.push_back(tempFlow);
1989                         processIDS.push_back(i);
1990             
1991                         hThreadArray[i] = CreateThread(NULL, 0, ShhhFlowsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
1992                 }
1993                 
1994                 //using the main process as a worker saves time and memory
1995                 //do my part
1996                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[processors-1].start, lines[processors-1].end);
1997                 
1998                 //Wait until all threads have terminated.
1999                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
2000                 
2001                 //Close all thread handles and free memory allocations.
2002                 for(int i=0; i < pDataArray.size(); i++){
2003                         for(int j=0; j < pDataArray[i]->outputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); }
2004                         CloseHandle(hThreadArray[i]);
2005                         delete pDataArray[i];
2006                 }
2007                 
2008         #endif
2009         
2010         for (int i=0;i<processIDS.size();i++) { 
2011             if (compositeFASTAFileName != "") {
2012                 m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
2013                 m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
2014                 m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp"));
2015                 m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp"));
2016             }
2017         }
2018         
2019         return 0;
2020         
2021     }
2022         catch(exception& e) {
2023                 m->errorOut(e, "ShhherCommand", "createProcesses");
2024                 exit(1);
2025         }
2026 }
2027 /**************************************************************************************************/
2028
2029 vector<string> ShhherCommand::parseFlowFiles(string filename){
2030     try {
2031         vector<string> files;
2032         int count = 0;
2033         
2034         ifstream in;
2035         m->openInputFile(filename, in);
2036         
2037         int thisNumFLows = 0;
2038         in >> thisNumFLows; m->gobble(in);
2039         
2040         while (!in.eof()) {
2041             if (m->control_pressed) { break; }
2042             
2043             ofstream out;
2044             string outputFileName = filename + toString(count) + ".temp";
2045             m->openOutputFile(outputFileName, out);
2046             out << thisNumFLows << endl;
2047             files.push_back(outputFileName);
2048             
2049             for (int i = 0; i < largeSize; i++) {
2050                 if (in.eof()) { break; }
2051                 string line = m->getline(in);
2052                 out << line << endl;
2053             }
2054             out.close();
2055             count++;
2056         }
2057         in.close();
2058         
2059         if (m->control_pressed) { for (int i = 0; i < files.size(); i++) { m->mothurRemove(files[i]); }  files.clear(); }
2060         
2061         return files;
2062     }
2063         catch(exception& e) {
2064                 m->errorOut(e, "ShhherCommand", "parseFlowFiles");
2065                 exit(1);
2066         }
2067 }
2068 /**************************************************************************************************/
2069
2070 int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName, int start, int end){
2071     try {
2072         
2073         for(int i=start;i<end;i++){
2074                         
2075                         if (m->control_pressed) { break; }
2076                         
2077             vector<string> theseFlowFileNames; theseFlowFileNames.push_back(filenames[i]);
2078             if (large) {  theseFlowFileNames = parseFlowFiles(filenames[i]);  }
2079             
2080             if (m->control_pressed) { break; }
2081             
2082             double begClock = clock();
2083             unsigned long long begTime;
2084             
2085             for (int g = 0; g < theseFlowFileNames.size(); g++) {
2086                 
2087                 string flowFileName = theseFlowFileNames[g];
2088                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
2089                 m->mothurOut("Reading flowgrams...\n");
2090                 
2091                 vector<string> seqNameVector;
2092                 vector<int> lengths;
2093                 vector<short> flowDataIntI;
2094                 vector<double> flowDataPrI;
2095                 map<string, int> nameMap;
2096                 vector<short> uniqueFlowgrams;
2097                 vector<int> uniqueCount;
2098                 vector<int> mapSeqToUnique;
2099                 vector<int> mapUniqueToSeq;
2100                 vector<int> uniqueLengths;
2101                 int numFlowCells;
2102                 
2103                 int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
2104                 
2105                 if (m->control_pressed) { break; }
2106                 
2107                 m->mothurOut("Identifying unique flowgrams...\n");
2108                 int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
2109                 
2110                 if (m->control_pressed) { break; }
2111                 
2112                 m->mothurOut("Calculating distances between flowgrams...\n");
2113                 string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
2114                 begTime = time(NULL);
2115                
2116                 
2117                 flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI); 
2118                 
2119                 m->mothurOutEndLine();
2120                 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
2121                 
2122                 
2123                 string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
2124                 createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
2125                 
2126                 if (m->control_pressed) { break; }
2127                 
2128                 m->mothurOut("\nClustering flowgrams...\n");
2129                 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
2130                 cluster(listFileName, distFileName, namesFileName);
2131                 
2132                 if (m->control_pressed) { break; }
2133                 
2134                 vector<int> otuData;
2135                 vector<int> cumNumSeqs;
2136                 vector<int> nSeqsPerOTU;
2137                 vector<vector<int> > aaP;       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2138                 vector<vector<int> > aaI;       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2139                 vector<int> seqNumber;          //tMaster->anP:         the sequence id number sorted by OTU
2140                 vector<int> seqIndex;           //tMaster->anI;         the index that corresponds to seqNumber
2141                 
2142                 
2143                 int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
2144                 
2145                 if (m->control_pressed) { break; }
2146                 
2147                 m->mothurRemove(distFileName);
2148                 m->mothurRemove(namesFileName);
2149                 m->mothurRemove(listFileName);
2150                 
2151                 vector<double> dist;            //adDist - distance of sequences to centroids
2152                 vector<short> change;           //did the centroid sequence change? 0 = no; 1 = yes
2153                 vector<int> centroids;          //the representative flowgram for each cluster m
2154                 vector<double> weight;
2155                 vector<double> singleTau;       //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2156                 vector<int> nSeqsBreaks;
2157                 vector<int> nOTUsBreaks;
2158                 
2159                 dist.assign(numSeqs * numOTUs, 0);
2160                 change.assign(numOTUs, 1);
2161                 centroids.assign(numOTUs, -1);
2162                 weight.assign(numOTUs, 0);
2163                 singleTau.assign(numSeqs, 1.0);
2164                 
2165                 nSeqsBreaks.assign(2, 0);
2166                 nOTUsBreaks.assign(2, 0);
2167                 
2168                 nSeqsBreaks[0] = 0;
2169                 nSeqsBreaks[1] = numSeqs;
2170                 nOTUsBreaks[1] = numOTUs;
2171                 
2172                 if (m->control_pressed) { break; }
2173                 
2174                 double maxDelta = 0;
2175                 int iter = 0;
2176                 
2177                 begClock = clock();
2178                 begTime = time(NULL);
2179                 
2180                 m->mothurOut("\nDenoising flowgrams...\n");
2181                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
2182                 
2183                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
2184                     
2185                     if (m->control_pressed) { break; }
2186                     
2187                     double cycClock = clock();
2188                     unsigned long long cycTime = time(NULL);
2189                     fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2190                     
2191                     if (m->control_pressed) { break; }
2192                     
2193                     calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2194                     
2195                     if (m->control_pressed) { break; }
2196                     
2197                     maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);  
2198                     
2199                     if (m->control_pressed) { break; }
2200                     
2201                     double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight); 
2202                     
2203                     if (m->control_pressed) { break; }
2204                     
2205                     checkCentroids(numOTUs, centroids, weight);
2206                     
2207                     if (m->control_pressed) { break; }
2208                     
2209                     calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU,  dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
2210                     
2211                     if (m->control_pressed) { break; }
2212                     
2213                     iter++;
2214                     
2215                     m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
2216                     
2217                 }       
2218                 
2219                 if (m->control_pressed) { break; }
2220                 
2221                 m->mothurOut("\nFinalizing...\n");
2222                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2223                 
2224                 if (m->control_pressed) { break; }
2225                 
2226                 setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
2227                 
2228                 if (m->control_pressed) { break; }
2229                 
2230                 vector<int> otuCounts(numOTUs, 0);
2231                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
2232                 
2233                 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber); 
2234                 
2235                 if (m->control_pressed) { break; }
2236                 
2237                 if ((large) && (g == 0)) {  flowFileName = filenames[i]; theseFlowFileNames[0] = filenames[i]; }
2238                 string thisOutputDir = outputDir;
2239                 if (outputDir == "") {  thisOutputDir = m->hasPath(flowFileName);  }
2240                 string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.qual";
2241                 string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.fasta";
2242                 string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.names";
2243                 string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.counts";
2244                 string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
2245                 string groupFileName = fileRoot + "shhh.groups";
2246
2247                 
2248                 writeQualities(numOTUs, numFlowCells, qualityFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
2249                 writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, fastaFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
2250                 writeNames(thisCompositeNamesFileName, numOTUs, nameFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);                              if (m->control_pressed) { break; }
2251                 writeClusters(otuCountsFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);                  if (m->control_pressed) { break; }
2252                 writeGroups(groupFileName, fileRoot, numSeqs, seqNameVector);                                           if (m->control_pressed) { break; }
2253                 
2254                 if (large) {
2255                     if (g > 0) {
2256                         m->appendFiles(qualityFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.qual"));
2257                         m->mothurRemove(qualityFileName);
2258                         m->appendFiles(fastaFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.fasta"));
2259                         m->mothurRemove(fastaFileName);
2260                         m->appendFiles(nameFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.names"));
2261                         m->mothurRemove(nameFileName);
2262                         m->appendFiles(otuCountsFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.counts"));
2263                         m->mothurRemove(otuCountsFileName);
2264                         m->appendFiles(groupFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.groups"));
2265                         m->mothurRemove(groupFileName);
2266                         m->mothurRemove(theseFlowFileNames[g]);
2267                     }
2268                 }
2269                         }
2270             
2271                         m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
2272                 }
2273                 
2274         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
2275         
2276         return 0;
2277         
2278     }catch(exception& e) {
2279             m->errorOut(e, "ShhherCommand", "driver");
2280             exit(1);
2281     }
2282 }
2283
2284 /**************************************************************************************************/
2285 int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
2286         try{
2287        
2288                 ifstream flowFile;
2289        
2290                 m->openInputFile(filename, flowFile);
2291                 
2292                 string seqName;
2293                 int currentNumFlowCells;
2294                 float intensity;
2295         thisSeqNameVector.clear();
2296                 thisLengths.clear();
2297                 thisFlowDataIntI.clear();
2298                 thisNameMap.clear();
2299                 
2300                 flowFile >> numFlowCells;
2301                 int index = 0;//pcluster
2302                 while(!flowFile.eof()){
2303                         
2304                         if (m->control_pressed) { break; }
2305                         
2306                         flowFile >> seqName >> currentNumFlowCells;
2307                         thisLengths.push_back(currentNumFlowCells);
2308            
2309                         thisSeqNameVector.push_back(seqName);
2310                         thisNameMap[seqName] = index++;//pcluster
2311
2312                         for(int i=0;i<numFlowCells;i++){
2313                                 flowFile >> intensity;
2314                                 if(intensity > 9.99)    {       intensity = 9.99;       }
2315                                 int intI = int(100 * intensity + 0.0001);
2316                                 thisFlowDataIntI.push_back(intI);
2317                         }
2318                         m->gobble(flowFile);
2319                 }
2320                 flowFile.close();
2321                 
2322                 int numSeqs = thisSeqNameVector.size();         
2323                 
2324                 for(int i=0;i<numSeqs;i++){
2325                         
2326                         if (m->control_pressed) { break; }
2327                         
2328                         int iNumFlowCells = i * numFlowCells;
2329                         for(int j=thisLengths[i];j<numFlowCells;j++){
2330                                 thisFlowDataIntI[iNumFlowCells + j] = 0;
2331                         }
2332                 }
2333         
2334         return numSeqs;
2335                 
2336         }
2337         catch(exception& e) {
2338                 m->errorOut(e, "ShhherCommand", "getFlowData");
2339                 exit(1);
2340         }
2341 }
2342 /**************************************************************************************************/
2343
2344 int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2345         try{            
2346         
2347                 ostringstream outStream;
2348                 outStream.setf(ios::fixed, ios::floatfield);
2349                 outStream.setf(ios::dec, ios::basefield);
2350                 outStream.setf(ios::showpoint);
2351                 outStream.precision(6);
2352                 
2353                 int begTime = time(NULL);
2354                 double begClock = clock();
2355         
2356                 for(int i=0;i<stopSeq;i++){
2357                         
2358                         if (m->control_pressed) { break; }
2359                         
2360                         for(int j=0;j<i;j++){
2361                                 float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2362                 
2363                                 if(flowDistance < 1e-6){
2364                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
2365                                 }
2366                                 else if(flowDistance <= cutoff){
2367                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
2368                                 }
2369                         }
2370                         if(i % 100 == 0){
2371                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
2372                                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2373                                 m->mothurOutEndLine();
2374                         }
2375                 }
2376                 
2377                 ofstream distFile(distFileName.c_str());
2378                 distFile << outStream.str();            
2379                 distFile.close();
2380                 
2381                 if (m->control_pressed) {}
2382                 else {
2383                         m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
2384                         m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2385                         m->mothurOutEndLine();
2386                 }
2387         
2388         return 0;
2389         }
2390         catch(exception& e) {
2391                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
2392                 exit(1);
2393         }
2394 }
2395 /**************************************************************************************************/
2396
2397 float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2398         try{
2399                 int minLength = lengths[mapSeqToUnique[seqA]];
2400                 if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
2401                 
2402                 int ANumFlowCells = seqA * numFlowCells;
2403                 int BNumFlowCells = seqB * numFlowCells;
2404                 
2405                 float dist = 0;
2406                 
2407                 for(int i=0;i<minLength;i++){
2408                         
2409                         if (m->control_pressed) { break; }
2410                         
2411                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
2412                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
2413                         
2414                         int flowBIntI = flowDataIntI[BNumFlowCells + i];
2415                         float flowBPrI = flowDataPrI[BNumFlowCells + i];
2416                         dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
2417                 }
2418                 
2419                 dist /= (float) minLength;
2420                 return dist;
2421         }
2422         catch(exception& e) {
2423                 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
2424                 exit(1);
2425         }
2426 }
2427
2428 /**************************************************************************************************/
2429
2430 int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2431         try{
2432                 int numUniques = 0;
2433                 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
2434                 uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
2435                 uniqueLengths.assign(numSeqs, 0);
2436                 mapSeqToUnique.assign(numSeqs, -1);
2437                 mapUniqueToSeq.assign(numSeqs, -1);
2438                 
2439                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
2440                 
2441                 for(int i=0;i<numSeqs;i++){
2442                         
2443                         if (m->control_pressed) { break; }
2444                         
2445                         int index = 0;
2446                         
2447                         vector<short> current(numFlowCells);
2448                         for(int j=0;j<numFlowCells;j++){
2449                                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
2450                         }
2451             
2452                         for(int j=0;j<numUniques;j++){
2453                                 int offset = j * numFlowCells;
2454                                 bool toEnd = 1;
2455                                 
2456                                 int shorterLength;
2457                                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
2458                                 else                                                            {       shorterLength = uniqueLengths[j];       }
2459                 
2460                                 for(int k=0;k<shorterLength;k++){
2461                                         if(current[k] != uniqueFlowgrams[offset + k]){
2462                                                 toEnd = 0;
2463                                                 break;
2464                                         }
2465                                 }
2466                                 
2467                                 if(toEnd){
2468                                         mapSeqToUnique[i] = j;
2469                                         uniqueCount[j]++;
2470                                         index = j;
2471                                         if(lengths[i] > uniqueLengths[j])       {       uniqueLengths[j] = lengths[i];  }
2472                                         break;
2473                                 }
2474                                 index++;
2475                         }
2476                         
2477                         if(index == numUniques){
2478                                 uniqueLengths[numUniques] = lengths[i];
2479                                 uniqueCount[numUniques] = 1;
2480                                 mapSeqToUnique[i] = numUniques;//anMap
2481                                 mapUniqueToSeq[numUniques] = i;//anF
2482                                 
2483                                 for(int k=0;k<numFlowCells;k++){
2484                                         uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
2485                                         uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
2486                                 }
2487                                 
2488                                 numUniques++;
2489                         }
2490                 }
2491                 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
2492                 uniqueLengths.resize(numUniques);       
2493                 
2494                 flowDataPrI.resize(numSeqs * numFlowCells, 0);
2495                 for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
2496         
2497         return numUniques;
2498         }
2499         catch(exception& e) {
2500                 m->errorOut(e, "ShhherCommand", "getUniques");
2501                 exit(1);
2502         }
2503 }
2504 /**************************************************************************************************/
2505 int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
2506         try{
2507                 
2508                 vector<string> duplicateNames(numUniques, "");
2509                 for(int i=0;i<numSeqs;i++){
2510                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
2511                 }
2512                 
2513                 ofstream nameFile;
2514                 m->openOutputFile(filename, nameFile);
2515                 
2516                 for(int i=0;i<numUniques;i++){
2517                         
2518                         if (m->control_pressed) { break; }
2519                         
2520             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2521                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2522                 }
2523                 
2524                 nameFile.close();
2525         
2526                 return 0;
2527         }
2528         catch(exception& e) {
2529                 m->errorOut(e, "ShhherCommand", "createNamesFile");
2530                 exit(1);
2531         }
2532 }
2533 //**********************************************************************************************************************
2534
2535 int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
2536         try {
2537                 
2538                 ReadMatrix* read = new ReadColumnMatrix(distFileName);  
2539                 read->setCutoff(cutoff);
2540                 
2541                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
2542                 clusterNameMap->readMap();
2543                 read->read(clusterNameMap);
2544         
2545                 ListVector* list = read->getListVector();
2546                 SparseMatrix* matrix = read->getMatrix();
2547                 
2548                 delete read; 
2549                 delete clusterNameMap; 
2550         
2551                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
2552                 
2553                 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
2554                 string tag = cluster->getTag();
2555                 
2556                 double clusterCutoff = cutoff;
2557                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
2558                         
2559                         if (m->control_pressed) { break; }
2560                         
2561                         cluster->update(clusterCutoff);
2562                 }
2563                 
2564                 list->setLabel(toString(cutoff));
2565                 
2566                 ofstream listFile;
2567                 m->openOutputFile(filename, listFile);
2568                 list->print(listFile);
2569                 listFile.close();
2570                 
2571                 delete matrix;  delete cluster; delete rabund; delete list;
2572         
2573                 return 0;
2574         }
2575         catch(exception& e) {
2576                 m->errorOut(e, "ShhherCommand", "cluster");
2577                 exit(1);        
2578         }               
2579 }
2580 /**************************************************************************************************/
2581
2582 int ShhherCommand::getOTUData(int numSeqs, string fileName,  vector<int>& otuData,
2583                                vector<int>& cumNumSeqs,
2584                                vector<int>& nSeqsPerOTU,
2585                                vector<vector<int> >& aaP,       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2586                                vector<vector<int> >& aaI,       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2587                                vector<int>& seqNumber,          //tMaster->anP:         the sequence id number sorted by OTU
2588                                vector<int>& seqIndex,
2589                                map<string, int>& nameMap){
2590         try {
2591         
2592                 ifstream listFile;
2593                 m->openInputFile(fileName, listFile);
2594                 string label;
2595         int numOTUs;
2596                 
2597                 listFile >> label >> numOTUs;
2598         
2599                 otuData.assign(numSeqs, 0);
2600                 cumNumSeqs.assign(numOTUs, 0);
2601                 nSeqsPerOTU.assign(numOTUs, 0);
2602                 aaP.clear();aaP.resize(numOTUs);
2603                 
2604                 seqNumber.clear();
2605                 aaI.clear();
2606                 seqIndex.clear();
2607                 
2608                 string singleOTU = "";
2609                 
2610                 for(int i=0;i<numOTUs;i++){
2611                         
2612                         if (m->control_pressed) { break; }
2613             
2614                         listFile >> singleOTU;
2615                         
2616                         istringstream otuString(singleOTU);
2617             
2618                         while(otuString){
2619                                 
2620                                 string seqName = "";
2621                                 
2622                                 for(int j=0;j<singleOTU.length();j++){
2623                                         char letter = otuString.get();
2624                                         
2625                                         if(letter != ','){
2626                                                 seqName += letter;
2627                                         }
2628                                         else{
2629                                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2630                                                 int index = nmIt->second;
2631                                                 
2632                                                 nameMap.erase(nmIt);
2633                                                 
2634                                                 otuData[index] = i;
2635                                                 nSeqsPerOTU[i]++;
2636                                                 aaP[i].push_back(index);
2637                                                 seqName = "";
2638                                         }
2639                                 }
2640                                 
2641                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2642                 
2643                                 int index = nmIt->second;
2644                                 nameMap.erase(nmIt);
2645                 
2646                                 otuData[index] = i;
2647                                 nSeqsPerOTU[i]++;
2648                                 aaP[i].push_back(index);        
2649                                 
2650                                 otuString.get();
2651                         }
2652                         
2653                         sort(aaP[i].begin(), aaP[i].end());
2654                         for(int j=0;j<nSeqsPerOTU[i];j++){
2655                                 seqNumber.push_back(aaP[i][j]);
2656                         }
2657                         for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
2658                                 aaP[i].push_back(0);
2659                         }
2660                         
2661                         
2662                 }
2663                 
2664                 for(int i=1;i<numOTUs;i++){
2665                         cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
2666                 }
2667                 aaI = aaP;
2668                 seqIndex = seqNumber;
2669                 
2670                 listFile.close();       
2671         
2672         return numOTUs;
2673                 
2674         }
2675         catch(exception& e) {
2676                 m->errorOut(e, "ShhherCommand", "getOTUData");
2677                 exit(1);        
2678         }               
2679 }
2680 /**************************************************************************************************/
2681
2682 int ShhherCommand::calcCentroidsDriver(int numOTUs, 
2683                                           vector<int>& cumNumSeqs,
2684                                           vector<int>& nSeqsPerOTU,
2685                                           vector<int>& seqIndex,
2686                                           vector<short>& change,                //did the centroid sequence change? 0 = no; 1 = yes
2687                                           vector<int>& centroids,               //the representative flowgram for each cluster m
2688                                           vector<double>& singleTau,    //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2689                                           vector<int>& mapSeqToUnique,
2690                                           vector<short>& uniqueFlowgrams,
2691                                           vector<short>& flowDataIntI,
2692                                           vector<int>& lengths,
2693                                           int numFlowCells,
2694                                           vector<int>& seqNumber){                          
2695         
2696         //this function gets the most likely homopolymer length at a flow position for a group of sequences
2697         //within an otu
2698         
2699         try{
2700                 
2701                 for(int i=0;i<numOTUs;i++){
2702                         
2703                         if (m->control_pressed) { break; }
2704                         
2705                         double count = 0;
2706                         int position = 0;
2707                         int minFlowGram = 100000000;
2708                         double minFlowValue = 1e8;
2709                         change[i] = 0; //FALSE
2710                         
2711                         for(int j=0;j<nSeqsPerOTU[i];j++){
2712                                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
2713                         }
2714             
2715                         if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
2716                                 vector<double> adF(nSeqsPerOTU[i]);
2717                                 vector<int> anL(nSeqsPerOTU[i]);
2718                                 
2719                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2720                                         int index = cumNumSeqs[i] + j;
2721                                         int nI = seqIndex[index];
2722                                         int nIU = mapSeqToUnique[nI];
2723                                         
2724                                         int k;
2725                                         for(k=0;k<position;k++){
2726                                                 if(nIU == anL[k]){
2727                                                         break;
2728                                                 }
2729                                         }
2730                                         if(k == position){
2731                                                 anL[position] = nIU;
2732                                                 adF[position] = 0.0000;
2733                                                 position++;
2734                                         }                                               
2735                                 }
2736                                 
2737                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2738                                         int index = cumNumSeqs[i] + j;
2739                                         int nI = seqIndex[index];
2740                                         
2741                                         double tauValue = singleTau[seqNumber[index]];
2742                                         
2743                                         for(int k=0;k<position;k++){
2744                                                 double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
2745                                                 adF[k] += dist * tauValue;
2746                                         }
2747                                 }
2748                                 
2749                                 for(int j=0;j<position;j++){
2750                                         if(adF[j] < minFlowValue){
2751                                                 minFlowGram = j;
2752                                                 minFlowValue = adF[j];
2753                                         }
2754                                 }
2755                                 
2756                                 if(centroids[i] != anL[minFlowGram]){
2757                                         change[i] = 1;
2758                                         centroids[i] = anL[minFlowGram];
2759                                 }
2760                         }
2761                         else if(centroids[i] != -1){
2762                                 change[i] = 1;
2763                                 centroids[i] = -1;                      
2764                         }
2765                 }
2766         
2767         return 0;
2768         }
2769         catch(exception& e) {
2770                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
2771                 exit(1);        
2772         }               
2773 }
2774 /**************************************************************************************************/
2775
2776 double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
2777                                         vector<short>& flowDataIntI, int numFlowCells){
2778         try{
2779                 
2780                 int flowAValue = cent * numFlowCells;
2781                 int flowBValue = flow * numFlowCells;
2782                 
2783                 double dist = 0;
2784         
2785                 for(int i=0;i<length;i++){
2786                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
2787                         flowAValue++;
2788                         flowBValue++;
2789                 }
2790                 
2791                 return dist / (double)length;
2792         }
2793         catch(exception& e) {
2794                 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
2795                 exit(1);        
2796         }               
2797 }
2798 /**************************************************************************************************/
2799
2800 double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
2801         try{
2802                 
2803                 double maxChange = 0;
2804                 
2805                 for(int i=0;i<numOTUs;i++){
2806                         
2807                         if (m->control_pressed) { break; }
2808                         
2809                         double difference = weight[i];
2810                         weight[i] = 0;
2811                         
2812                         for(int j=0;j<nSeqsPerOTU[i];j++){
2813                                 int index = cumNumSeqs[i] + j;
2814                                 double tauValue = singleTau[seqNumber[index]];
2815                                 weight[i] += tauValue;
2816                         }
2817                         
2818                         difference = fabs(weight[i] - difference);
2819                         if(difference > maxChange){     maxChange = difference; }
2820                 }
2821                 return maxChange;
2822         }
2823         catch(exception& e) {
2824                 m->errorOut(e, "ShhherCommand", "getNewWeights");
2825                 exit(1);        
2826         }               
2827 }
2828
2829 /**************************************************************************************************/
2830
2831 double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
2832         
2833         try{
2834                 
2835                 vector<long double> P(numSeqs, 0);
2836                 int effNumOTUs = 0;
2837                 
2838                 for(int i=0;i<numOTUs;i++){
2839                         if(weight[i] > MIN_WEIGHT){
2840                                 effNumOTUs++;
2841                         }
2842                 }
2843                 
2844                 string hold;
2845                 for(int i=0;i<numOTUs;i++){
2846                         
2847                         if (m->control_pressed) { break; }
2848                         
2849                         for(int j=0;j<nSeqsPerOTU[i];j++){
2850                                 int index = cumNumSeqs[i] + j;
2851                                 int nI = seqIndex[index];
2852                                 double singleDist = dist[seqNumber[index]];
2853                                 
2854                                 P[nI] += weight[i] * exp(-singleDist * sigma);
2855                         }
2856                 }
2857                 double nLL = 0.00;
2858                 for(int i=0;i<numSeqs;i++){
2859                         if(P[i] == 0){  P[i] = DBL_EPSILON;     }
2860             
2861                         nLL += -log(P[i]);
2862                 }
2863                 
2864                 nLL = nLL -(double)numSeqs * log(sigma);
2865         
2866                 return nLL; 
2867         }
2868         catch(exception& e) {
2869                 m->errorOut(e, "ShhherCommand", "getNewWeights");
2870                 exit(1);        
2871         }               
2872 }
2873
2874 /**************************************************************************************************/
2875
2876 int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
2877         try{
2878                 vector<int> unique(numOTUs, 1);
2879                 
2880                 for(int i=0;i<numOTUs;i++){
2881                         if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
2882                                 unique[i] = -1;
2883                         }
2884                 }
2885                 
2886                 for(int i=0;i<numOTUs;i++){
2887                         
2888                         if (m->control_pressed) { break; }
2889                         
2890                         if(unique[i] == 1){
2891                                 for(int j=i+1;j<numOTUs;j++){
2892                                         if(unique[j] == 1){
2893                                                 
2894                                                 if(centroids[j] == centroids[i]){
2895                                                         unique[j] = 0;
2896                                                         centroids[j] = -1;
2897                                                         
2898                                                         weight[i] += weight[j];
2899                                                         weight[j] = 0.0;
2900                                                 }
2901                                         }
2902                                 }
2903                         }
2904                 }
2905         
2906         return 0;
2907         }
2908         catch(exception& e) {
2909                 m->errorOut(e, "ShhherCommand", "checkCentroids");
2910                 exit(1);        
2911         }               
2912 }
2913 /**************************************************************************************************/
2914
2915 void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<double>& dist, 
2916                                      vector<double>& weight, vector<short>& change, vector<int>& centroids,
2917                                      vector<vector<int> >& aaP, vector<double>& singleTau, vector<vector<int> >& aaI,   
2918                                      vector<int>& seqNumber, vector<int>& seqIndex,
2919                                      vector<short>& uniqueFlowgrams,
2920                                      vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
2921         
2922         try{
2923                 
2924                 int total = 0;
2925                 vector<double> newTau(numOTUs,0);
2926                 vector<double> norms(numSeqs, 0);
2927                 nSeqsPerOTU.assign(numOTUs, 0);
2928         
2929                 for(int i=0;i<numSeqs;i++){
2930                         
2931                         if (m->control_pressed) { break; }
2932                         
2933                         int indexOffset = i * numOTUs;
2934             
2935                         double offset = 1e8;
2936                         
2937                         for(int j=0;j<numOTUs;j++){
2938                 
2939                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
2940                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
2941                                 }
2942                 
2943                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
2944                                         offset = dist[indexOffset + j];
2945                                 }
2946                         }
2947             
2948                         for(int j=0;j<numOTUs;j++){
2949                                 if(weight[j] > MIN_WEIGHT){
2950                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
2951                                         norms[i] += newTau[j];
2952                                 }
2953                                 else{
2954                                         newTau[j] = 0.0;
2955                                 }
2956                         }
2957             
2958                         for(int j=0;j<numOTUs;j++){
2959                                 newTau[j] /= norms[i];
2960                         }
2961             
2962                         for(int j=0;j<numOTUs;j++){
2963                                 if(newTau[j] > MIN_TAU){
2964                                         
2965                                         int oldTotal = total;
2966                                         
2967                                         total++;
2968                                         
2969                                         singleTau.resize(total, 0);
2970                                         seqNumber.resize(total, 0);
2971                                         seqIndex.resize(total, 0);
2972                                         
2973                                         singleTau[oldTotal] = newTau[j];
2974                                         
2975                                         aaP[j][nSeqsPerOTU[j]] = oldTotal;
2976                                         aaI[j][nSeqsPerOTU[j]] = i;
2977                                         nSeqsPerOTU[j]++;
2978                                 }
2979                         }
2980             
2981                 }
2982         
2983         }
2984         catch(exception& e) {
2985                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
2986                 exit(1);        
2987         }               
2988 }
2989 /**************************************************************************************************/
2990
2991 int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
2992         try {
2993                 int index = 0;
2994                 for(int i=0;i<numOTUs;i++){
2995                         
2996                         if (m->control_pressed) { return 0; }
2997                         
2998                         cumNumSeqs[i] = index;
2999                         for(int j=0;j<nSeqsPerOTU[i];j++){
3000                                 seqNumber[index] = aaP[i][j];
3001                                 seqIndex[index] = aaI[i][j];
3002                                 
3003                                 index++;
3004                         }
3005                 }
3006         
3007         return 0; 
3008         }
3009         catch(exception& e) {
3010                 m->errorOut(e, "ShhherCommand", "fill");
3011                 exit(1);        
3012         }               
3013 }
3014 /**************************************************************************************************/
3015
3016 void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
3017                             vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3018         
3019         try {
3020                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
3021                 
3022                 for(int i=0;i<numOTUs;i++){
3023                         
3024                         if (m->control_pressed) { break; }
3025                         
3026                         for(int j=0;j<nSeqsPerOTU[i];j++){
3027                                 int index = cumNumSeqs[i] + j;
3028                                 double tauValue = singleTau[seqNumber[index]];
3029                                 int sIndex = seqIndex[index];
3030                                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
3031                         }
3032                 }
3033                 
3034                 for(int i=0;i<numSeqs;i++){
3035                         double maxTau = -1.0000;
3036                         int maxOTU = -1;
3037                         for(int j=0;j<numOTUs;j++){
3038                                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
3039                                         maxTau = bigTauMatrix[i * numOTUs + j];
3040                                         maxOTU = j;
3041                                 }
3042                         }
3043                         
3044                         otuData[i] = maxOTU;
3045                 }
3046                 
3047                 nSeqsPerOTU.assign(numOTUs, 0);         
3048                 
3049                 for(int i=0;i<numSeqs;i++){
3050                         int index = otuData[i];
3051                         
3052                         singleTau[i] = 1.0000;
3053                         dist[i] = 0.0000;
3054                         
3055                         aaP[index][nSeqsPerOTU[index]] = i;
3056                         aaI[index][nSeqsPerOTU[index]] = i;
3057                         
3058                         nSeqsPerOTU[index]++;
3059                 }
3060         
3061                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);  
3062         }
3063         catch(exception& e) {
3064                 m->errorOut(e, "ShhherCommand", "setOTUs");
3065                 exit(1);        
3066         }               
3067 }
3068 /**************************************************************************************************/
3069
3070 void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string qualityFileName, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
3071                                    vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
3072                                    vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
3073         
3074         try {
3075         
3076                 ofstream qualityFile;
3077                 m->openOutputFile(qualityFileName, qualityFile);
3078         
3079                 qualityFile.setf(ios::fixed, ios::floatfield);
3080                 qualityFile.setf(ios::showpoint);
3081                 qualityFile << setprecision(6);
3082                 
3083                 vector<vector<int> > qualities(numOTUs);
3084                 vector<double> pr(HOMOPS, 0);
3085                 
3086                 
3087                 for(int i=0;i<numOTUs;i++){
3088                         
3089                         if (m->control_pressed) { break; }
3090                         
3091                         int index = 0;
3092                         int base = 0;
3093                         
3094                         if(nSeqsPerOTU[i] > 0){
3095                                 qualities[i].assign(1024, -1);
3096                                 
3097                                 while(index < numFlowCells){
3098                                         double maxPrValue = 1e8;
3099                                         short maxPrIndex = -1;
3100                                         double count = 0.0000;
3101                                         
3102                                         pr.assign(HOMOPS, 0);
3103                                         
3104                                         for(int j=0;j<nSeqsPerOTU[i];j++){
3105                                                 int lIndex = cumNumSeqs[i] + j;
3106                                                 double tauValue = singleTau[seqNumber[lIndex]];
3107                                                 int sequenceIndex = aaI[i][j];
3108                                                 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
3109                                                 
3110                                                 count += tauValue;
3111                                                 
3112                                                 for(int s=0;s<HOMOPS;s++){
3113                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
3114                                                 }
3115                                         }
3116                                         
3117                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
3118                                         maxPrValue = pr[maxPrIndex];
3119                                         
3120                                         if(count > MIN_COUNT){
3121                                                 double U = 0.0000;
3122                                                 double norm = 0.0000;
3123                                                 
3124                                                 for(int s=0;s<HOMOPS;s++){
3125                                                         norm += exp(-(pr[s] - maxPrValue));
3126                                                 }
3127                                                 
3128                                                 for(int s=1;s<=maxPrIndex;s++){
3129                                                         int value = 0;
3130                                                         double temp = 0.0000;
3131                                                         
3132                                                         U += exp(-(pr[s-1]-maxPrValue))/norm;
3133                                                         
3134                                                         if(U>0.00){
3135                                                                 temp = log10(U);
3136                                                         }
3137                                                         else{
3138                                                                 temp = -10.1;
3139                                                         }
3140                                                         temp = floor(-10 * temp);
3141                                                         value = (int)floor(temp);
3142                                                         if(value > 100){        value = 100;    }
3143                                                         
3144                                                         qualities[i][base] = (int)value;
3145                                                         base++;
3146                                                 }
3147                                         }
3148                                         
3149                                         index++;
3150                                 }
3151                         }
3152                         
3153                         
3154                         if(otuCounts[i] > 0){
3155                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
3156                         
3157                                 int j=4;        //need to get past the first four bases
3158                                 while(qualities[i][j] != -1){
3159                     qualityFile << qualities[i][j] << ' ';
3160                     if (j > qualities[i].size()) { break; }
3161                     j++;
3162                                 }
3163                                 qualityFile << endl;
3164                         }
3165                 }
3166                 qualityFile.close();
3167                 outputNames.push_back(qualityFileName);
3168         
3169         }
3170         catch(exception& e) {
3171                 m->errorOut(e, "ShhherCommand", "writeQualities");
3172                 exit(1);        
3173         }               
3174 }
3175
3176 /**************************************************************************************************/
3177
3178 void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string fastaFileName, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
3179         try {
3180                 
3181                 ofstream fastaFile;
3182                 m->openOutputFile(fastaFileName, fastaFile);
3183                 
3184                 vector<string> names(numOTUs, "");
3185                 
3186                 for(int i=0;i<numOTUs;i++){
3187                         
3188                         if (m->control_pressed) { break; }
3189                         
3190                         int index = centroids[i];
3191                         
3192                         if(otuCounts[i] > 0){
3193                                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
3194                                 
3195                                 string newSeq = "";
3196                                 
3197                                 for(int j=0;j<numFlowCells;j++){
3198                                         
3199                                         char base = flowOrder[j % 4];
3200                                         for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
3201                                                 newSeq += base;
3202                                         }
3203                                 }
3204                                 
3205                                 if (newSeq.length() >= 4) {  fastaFile << newSeq.substr(4) << endl;  }
3206                 else {  fastaFile << "NNNN" << endl;  }
3207                         }
3208                 }
3209                 fastaFile.close();
3210         
3211                 outputNames.push_back(fastaFileName);
3212         
3213                 if(thisCompositeFASTAFileName != ""){
3214                         m->appendFiles(fastaFileName, thisCompositeFASTAFileName);
3215                 }
3216         }
3217         catch(exception& e) {
3218                 m->errorOut(e, "ShhherCommand", "writeSequences");
3219                 exit(1);        
3220         }               
3221 }
3222
3223 /**************************************************************************************************/
3224
3225 void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string nameFileName, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
3226         try {
3227                 
3228                 ofstream nameFile;
3229                 m->openOutputFile(nameFileName, nameFile);
3230                 
3231                 for(int i=0;i<numOTUs;i++){
3232                         
3233                         if (m->control_pressed) { break; }
3234                         
3235                         if(otuCounts[i] > 0){
3236                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
3237                                 
3238                                 for(int j=1;j<nSeqsPerOTU[i];j++){
3239                                         nameFile << ',' << seqNameVector[aaI[i][j]];
3240                                 }
3241                                 
3242                                 nameFile << endl;
3243                         }
3244                 }
3245                 nameFile.close();
3246                 outputNames.push_back(nameFileName);
3247                 
3248                 
3249                 if(thisCompositeNamesFileName != ""){
3250                         m->appendFiles(nameFileName, thisCompositeNamesFileName);
3251                 }               
3252         }
3253         catch(exception& e) {
3254                 m->errorOut(e, "ShhherCommand", "writeNames");
3255                 exit(1);        
3256         }               
3257 }
3258
3259 /**************************************************************************************************/
3260
3261 void ShhherCommand::writeGroups(string groupFileName, string fileRoot, int numSeqs, vector<string>& seqNameVector){
3262         try {
3263         ofstream groupFile;
3264                 m->openOutputFile(groupFileName, groupFile);
3265                 
3266                 for(int i=0;i<numSeqs;i++){
3267                         if (m->control_pressed) { break; }
3268                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
3269                 }
3270                 groupFile.close();
3271                 outputNames.push_back(groupFileName);
3272         
3273         }
3274         catch(exception& e) {
3275                 m->errorOut(e, "ShhherCommand", "writeGroups");
3276                 exit(1);        
3277         }               
3278 }
3279
3280 /**************************************************************************************************/
3281
3282 void ShhherCommand::writeClusters(string otuCountsFileName, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
3283         try {
3284                 ofstream otuCountsFile;
3285                 m->openOutputFile(otuCountsFileName, otuCountsFile);
3286                 
3287                 string bases = flowOrder;
3288                 
3289                 for(int i=0;i<numOTUs;i++){
3290                         
3291                         if (m->control_pressed) {
3292                                 break;
3293                         }
3294                         //output the translated version of the centroid sequence for the otu
3295                         if(otuCounts[i] > 0){
3296                                 int index = centroids[i];
3297                                 
3298                                 otuCountsFile << "ideal\t";
3299                                 for(int j=8;j<numFlowCells;j++){
3300                                         char base = bases[j % 4];
3301                                         for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
3302                                                 otuCountsFile << base;
3303                                         }
3304                                 }
3305                                 otuCountsFile << endl;
3306                                 
3307                                 for(int j=0;j<nSeqsPerOTU[i];j++){
3308                                         int sequence = aaI[i][j];
3309                                         otuCountsFile << seqNameVector[sequence] << '\t';
3310                                         
3311                                         string newSeq = "";
3312                                         
3313                                         for(int k=0;k<lengths[sequence];k++){
3314                                                 char base = bases[k % 4];
3315                                                 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
3316                         
3317                                                 for(int s=0;s<freq;s++){
3318                                                         newSeq += base;
3319                                                         //otuCountsFile << base;
3320                                                 }
3321                                         }
3322                                         otuCountsFile << newSeq.substr(4) << endl;
3323                                 }
3324                                 otuCountsFile << endl;
3325                         }
3326                 }
3327                 otuCountsFile.close();
3328                 outputNames.push_back(otuCountsFileName);
3329         
3330         }
3331         catch(exception& e) {
3332                 m->errorOut(e, "ShhherCommand", "writeClusters");
3333                 exit(1);        
3334         }               
3335 }
3336
3337 /**************************************************************************************************/
3338
3339 void ShhherCommand::getSingleLookUp(){
3340         try{
3341                 //      these are the -log probabilities that a signal corresponds to a particular homopolymer length
3342                 singleLookUp.assign(HOMOPS * NUMBINS, 0);
3343                 
3344                 int index = 0;
3345                 ifstream lookUpFile;
3346                 m->openInputFile(lookupFileName, lookUpFile);
3347                 
3348                 for(int i=0;i<HOMOPS;i++){
3349                         
3350                         if (m->control_pressed) { break; }
3351                         
3352                         float logFracFreq;
3353                         lookUpFile >> logFracFreq;
3354                         
3355                         for(int j=0;j<NUMBINS;j++)      {
3356                                 lookUpFile >> singleLookUp[index];
3357                                 index++;
3358                         }
3359                 }       
3360                 lookUpFile.close();
3361         }
3362         catch(exception& e) {
3363                 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
3364                 exit(1);
3365         }
3366 }
3367
3368 /**************************************************************************************************/
3369
3370 void ShhherCommand::getJointLookUp(){
3371         try{
3372                 
3373                 //      the most likely joint probability (-log) that two intenities have the same polymer length
3374                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
3375                 
3376                 for(int i=0;i<NUMBINS;i++){
3377                         
3378                         if (m->control_pressed) { break; }
3379                         
3380                         for(int j=0;j<NUMBINS;j++){             
3381                                 
3382                                 double minSum = 100000000;
3383                                 
3384                                 for(int k=0;k<HOMOPS;k++){
3385                                         double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
3386                                         
3387                                         if(sum < minSum)        {       minSum = sum;           }
3388                                 }       
3389                                 jointLookUp[i * NUMBINS + j] = minSum;
3390                         }
3391                 }
3392         }
3393         catch(exception& e) {
3394                 m->errorOut(e, "ShhherCommand", "getJointLookUp");
3395                 exit(1);
3396         }
3397 }
3398
3399 /**************************************************************************************************/
3400
3401 double ShhherCommand::getProbIntensity(int intIntensity){                          
3402         try{
3403
3404                 double minNegLogProb = 100000000; 
3405
3406                 
3407                 for(int i=0;i<HOMOPS;i++){//loop signal strength
3408                         
3409                         if (m->control_pressed) { break; }
3410                         
3411                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
3412                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
3413                 }
3414                 
3415                 return minNegLogProb;
3416         }
3417         catch(exception& e) {
3418                 m->errorOut(e, "ShhherCommand", "getProbIntensity");
3419                 exit(1);
3420         }
3421 }
3422
3423
3424
3425