]> git.donarmstrong.com Git - mothur.git/blob - shhhercommand.cpp
fixed shhh.flows outputdir bug
[mothur.git] / shhhercommand.cpp
1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "shhhercommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> ShhherCommand::setParameters(){  
14         try {
15                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pflow);
16                 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pfile);
17                 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plookup);
18                 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff);
19                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
20                 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter);
21                 CommandParameter psigma("sigma", "Number", "", "60", "", "", "",false,false); parameters.push_back(psigma);
22                 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "",false,false); parameters.push_back(pmindelta);
23                 CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
24                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
25                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
26                 
27                 vector<string> myArray;
28                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
29                 return myArray;
30         }
31         catch(exception& e) {
32                 m->errorOut(e, "ShhherCommand", "setParameters");
33                 exit(1);
34         }
35 }
36 //**********************************************************************************************************************
37 string ShhherCommand::getHelpString(){  
38         try {
39                 string helpString = "";
40                 helpString += "The shhh.flows command reads a file containing flowgrams and creates a file of corrected sequences.\n";
41                 return helpString;
42         }
43         catch(exception& e) {
44                 m->errorOut(e, "ShhherCommand", "getHelpString");
45                 exit(1);
46         }
47 }
48 //**********************************************************************************************************************
49
50 ShhherCommand::ShhherCommand(){ 
51         try {
52                 abort = true; calledHelp = true;
53                 setParameters();
54                 
55                 //initialize outputTypes
56 //              vector<string> tempOutNames;
57 //              outputTypes["pn.dist"] = tempOutNames;
58
59         }
60         catch(exception& e) {
61                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
62                 exit(1);
63         }
64 }
65
66 //**********************************************************************************************************************
67
68 ShhherCommand::ShhherCommand(string option) {
69         try {
70         
71 #ifdef USE_MPI
72                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
73                 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
74         
75                 if(pid == 0){
76 #endif
77                 abort = false; calledHelp = false;   
78                 
79                 //allow user to run help
80                 if(option == "help") { help(); abort = true; calledHelp = true; }
81                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
82                 
83                 else {
84                         vector<string> myArray = setParameters();
85                         
86                         OptionParser parser(option);
87                         map<string,string> parameters = parser.getParameters();
88                         
89                         ValidParameters validParameter;
90                         map<string,string>::iterator it;
91                         
92                         //check to make sure all parameters are valid for command
93                         for (it = parameters.begin(); it != parameters.end(); it++) { 
94                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
95                         }
96                         
97                         //initialize outputTypes
98                         vector<string> tempOutNames;
99 //                      outputTypes["pn.dist"] = tempOutNames;
100                         //                      outputTypes["fasta"] = tempOutNames;
101                         
102                         //if the user changes the input directory command factory will send this info to us in the output parameter 
103                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
104                         if (inputDir == "not found"){   inputDir = "";          }
105                         else {
106                                 string path;
107                                 it = parameters.find("flow");
108                                 //user has given a template file
109                                 if(it != parameters.end()){ 
110                                         path = m->hasPath(it->second);
111                                         //if the user has not given a path then, add inputdir. else leave path alone.
112                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
113                                 }
114                                 
115                                 it = parameters.find("lookup");
116                                 //user has given a template file
117                                 if(it != parameters.end()){ 
118                                         path = m->hasPath(it->second);
119                                         //if the user has not given a path then, add inputdir. else leave path alone.
120                                         if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
121                                 }
122
123                                 it = parameters.find("file");
124                                 //user has given a template file
125                                 if(it != parameters.end()){ 
126                                         path = m->hasPath(it->second);
127                                         //if the user has not given a path then, add inputdir. else leave path alone.
128                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
129                                 }
130                         }
131                         
132                         
133                         //check for required parameters
134                         flowFileName = validParameter.validFile(parameters, "flow", true);
135                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
136                         if (flowFileName == "not found" && flowFilesFileName == "not found") {
137                                 m->mothurOut("values for either flow or file must be provided for the shhh.seqs command.");
138                                 m->mothurOutEndLine();
139                                 abort = true; 
140                         }
141                         else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
142                         
143                         if(flowFileName != "not found"){
144                                 compositeFASTAFileName = "";    
145                                 compositeNamesFileName = "";    
146                         }
147                         else{
148                                 ofstream temp;
149
150                                 //flow.files = 9 character offset
151                                 compositeFASTAFileName = flowFilesFileName.substr(0, flowFilesFileName.length()-10) + "shhh.fasta";
152                                 m->openOutputFile(compositeFASTAFileName, temp);
153                                 temp.close();
154                                 
155                                 compositeNamesFileName = flowFilesFileName.substr(0, flowFilesFileName.length()-10) + "shhh.names";
156                                 m->openOutputFile(compositeNamesFileName, temp);
157                                 temp.close();
158                         }
159             
160             if(flowFilesFileName != "not found"){
161                 string fName;
162                 
163                 ifstream flowFilesFile;
164                 m->openInputFile(flowFilesFileName, flowFilesFile);
165                 while(flowFilesFile){
166                     fName = m->getline(flowFilesFile);
167                     
168                     //test if file is valid
169                     ifstream in;
170                     int ableToOpen = m->openInputFile(fName, in, "noerror");
171                     in.close(); 
172                     if (ableToOpen == 1) {
173                         if (inputDir != "") { //default path is set
174                             string tryPath = inputDir + fName;
175                             m->mothurOut("Unable to open " + fName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
176                             ifstream in2;
177                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
178                             in2.close();
179                             fName = tryPath;
180                         }
181                     }
182                     
183                     if (ableToOpen == 1) {
184                         if (m->getDefaultPath() != "") { //default path is set
185                             string tryPath = m->getDefaultPath() + m->getSimpleName(fName);
186                             m->mothurOut("Unable to open " + fName + ". Trying default " + tryPath); m->mothurOutEndLine();
187                             ifstream in2;
188                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
189                             in2.close();
190                             fName = tryPath;
191                         }
192                     }
193                     
194                     //if you can't open it its not in current working directory or inputDir, try mothur excutable location
195                     if (ableToOpen == 1) {
196                         string exepath = m->argv;
197                         string tempPath = exepath;
198                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
199                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
200                         
201                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(fName);
202                         m->mothurOut("Unable to open " + fName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
203                         ifstream in2;
204                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
205                         in2.close();
206                         fName = tryPath;
207                     }
208                     
209                     if (ableToOpen == 1) {  m->mothurOut("Unable to open " + fName + ". Disregarding. "); m->mothurOutEndLine();  }
210                     else { flowFileVector.push_back(fName); }
211                     m->gobble(flowFilesFile);
212                 }
213                 flowFilesFile.close();
214                 if (flowFileVector.size() == 0) {  m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
215             }
216             else{
217                 flowFileVector.push_back(flowFileName);
218             }
219
220                         
221                         //if the user changes the output directory command factory will send this info to us in the output parameter 
222                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
223                                 outputDir = ""; 
224                                 outputDir += m->hasPath(flowFileName); //if user entered a file with a path then preserve it    
225                         }
226                         
227                         
228                         //check for optional parameter and set defaults
229                         // ...at some point should added some additional type checking...
230                         string temp;
231                         temp = validParameter.validFile(parameters, "lookup", true);
232                         if (temp == "not found")        {       
233                                 lookupFileName = "LookUp_Titanium.pat"; 
234                                 
235                                 int ableToOpen;
236                                 ifstream in;
237                                 ableToOpen = m->openInputFile(lookupFileName, in, "noerror");
238                                 in.close();     
239                                 
240                                 //if you can't open it, try input location
241                                 if (ableToOpen == 1) {
242                                         if (inputDir != "") { //default path is set
243                                                 string tryPath = inputDir + lookupFileName;
244                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
245                                                 ifstream in2;
246                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
247                                                 in2.close();
248                                                 lookupFileName = tryPath;
249                                         }
250                                 }
251                                 
252                                 //if you can't open it, try default location
253                                 if (ableToOpen == 1) {
254                                         if (m->getDefaultPath() != "") { //default path is set
255                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName);
256                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine();
257                                                 ifstream in2;
258                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
259                                                 in2.close();
260                                                 lookupFileName = tryPath;
261                                         }
262                                 }
263                                 
264                                 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
265                                 if (ableToOpen == 1) {
266                                         string exepath = m->argv;
267                                         string tempPath = exepath;
268                                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
269                                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
270                                         
271                                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
272                                         m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
273                                         ifstream in2;
274                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
275                                         in2.close();
276                                         lookupFileName = tryPath;
277                                 }
278                                 
279                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
280                         }
281                         else if(temp == "not open")     {       
282                                 
283                                 lookupFileName = validParameter.validFile(parameters, "lookup", false);
284                                 
285                                 //if you can't open it its not inputDir, try mothur excutable location
286                                 string exepath = m->argv;
287                                 string tempPath = exepath;
288                                 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
289                                 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
290                                         
291                                 string tryPath = m->getFullPathName(exepath) + lookupFileName;
292                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
293                                 ifstream in2;
294                                 int ableToOpen = m->openInputFile(tryPath, in2, "noerror");
295                                 in2.close();
296                                 lookupFileName = tryPath;
297                                 
298                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
299                         }else                                           {       lookupFileName = temp;  }
300                         
301                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
302                         m->setProcessors(temp);
303                         m->mothurConvert(temp, processors);
304
305                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
306                         m->mothurConvert(temp, cutoff); 
307                         
308                         temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
309                         m->mothurConvert(temp, minDelta); 
310
311                         temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
312                         m->mothurConvert(temp, maxIters); 
313
314                         temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
315                         m->mothurConvert(temp, sigma); 
316                         
317                         flowOrder = validParameter.validFile(parameters, "order", false);
318                         if (flowOrder == "not found"){ flowOrder = "TACG";              }
319                         else if(flowOrder.length() != 4){
320                                 m->mothurOut("The value of the order option must be four bases long\n");
321                         }
322                         
323                 }
324 #ifdef USE_MPI
325                 }                               
326 #endif
327         }
328         catch(exception& e) {
329                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
330                 exit(1);
331         }
332 }
333 //**********************************************************************************************************************
334 #ifdef USE_MPI
335 int ShhherCommand::execute(){
336         try {
337                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
338                 
339                 int tag = 1976;
340                 MPI_Status status; 
341                         
342                 if(pid == 0){
343
344                         for(int i=1;i<ncpus;i++){
345                                 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
346                         }
347                         if(abort == 1){ return 0;       }
348
349                         processors = ncpus;
350                         
351                         m->mothurOut("\nGetting preliminary data...\n");
352                         getSingleLookUp();      if (m->control_pressed) { return 0; }
353                         getJointLookUp();       if (m->control_pressed) { return 0; }
354                         
355             vector<string> flowFileVector;
356                         if(flowFilesFileName != "not found"){
357                                 string fName;
358                 
359                                 ifstream flowFilesFile;
360                                 m->openInputFile(flowFilesFileName, flowFilesFile);
361                                 while(flowFilesFile){
362                                         fName = m->getline(flowFilesFile);
363                                         flowFileVector.push_back(fName);
364                                         m->gobble(flowFilesFile);
365                                 }
366                         }
367                         else{
368                                 flowFileVector.push_back(flowFileName);
369                         }
370             
371                         int numFiles = flowFileVector.size();
372
373                         for(int i=1;i<ncpus;i++){
374                                 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
375                         }
376                         
377                         for(int i=0;i<numFiles;i++){
378                                 
379                                 if (m->control_pressed) { break; }
380                                 
381                                 double begClock = clock();
382                                 unsigned long long begTime = time(NULL);
383
384                                 flowFileName = flowFileVector[i];
385                                 
386                                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
387                                 m->mothurOut("Reading flowgrams...\n");
388                                 getFlowData();
389                                 
390                                 if (m->control_pressed) { break; }
391
392                                 m->mothurOut("Identifying unique flowgrams...\n");
393                                 getUniques();
394                                 
395                                 if (m->control_pressed) { break; }
396
397                                 m->mothurOut("Calculating distances between flowgrams...\n");
398                                 char fileName[1024];
399                                 strcpy(fileName, flowFileName.c_str());
400
401                                 for(int i=1;i<ncpus;i++){
402                                         MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
403
404                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
405                                         MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
406                                         MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
407                                         MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
408                                         MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
409                                         MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
410                                         MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
411                                         MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
412                                         MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
413                                         MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
414                                 }                               
415                                                         
416                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
417                                 
418                                 if (m->control_pressed) { break; }
419                                 
420                                 int done;
421                                 for(int i=1;i<ncpus;i++){
422                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
423                                         
424                                         m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
425                                         m->mothurRemove((distFileName + ".temp." + toString(i)));
426                                 }
427
428                                 string namesFileName = createNamesFile();
429                                 
430                                 if (m->control_pressed) { break; }
431                                 
432                                 m->mothurOut("\nClustering flowgrams...\n");
433                                 string listFileName = cluster(distFileName, namesFileName);
434                                 
435                                 if (m->control_pressed) { break; }
436                                 
437                 
438                 
439                                 getOTUData(listFileName);
440
441                                 m->mothurRemove(distFileName);
442                                 m->mothurRemove(namesFileName);
443                                 m->mothurRemove(listFileName);
444                                 
445                                 if (m->control_pressed) { break; }
446                                 
447                                 initPyroCluster();
448
449                                 if (m->control_pressed) { break; }
450                                 
451             
452                                 for(int i=1;i<ncpus;i++){
453                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
454                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
455                                         MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
456                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
457                                 }
458                                 
459                                 if (m->control_pressed) { break; }
460                                 
461                                 double maxDelta = 0;
462                                 int iter = 0;
463                                 
464                                 int numOTUsOnCPU = numOTUs / ncpus;
465                                 int numSeqsOnCPU = numSeqs / ncpus;
466                                 m->mothurOut("\nDenoising flowgrams...\n");
467                                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
468                                 
469                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
470                                         
471                                         double cycClock = clock();
472                                         unsigned long long cycTime = time(NULL);
473                                         fill();
474                                         
475                                         if (m->control_pressed) { break; }
476
477                                         int total = singleTau.size();
478                                         for(int i=1;i<ncpus;i++){
479                                                 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
480                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
481                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
482                                                 
483                                                 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
484                                                 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
485                                                 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
486                                                 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
487                                                 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
488                                         }
489                                         
490                                         calcCentroidsDriver(0, numOTUsOnCPU);
491                                         
492                                         for(int i=1;i<ncpus;i++){
493                                                 int otuStart = i * numOTUs / ncpus;
494                                                 int otuStop = (i + 1) * numOTUs / ncpus;
495                                                 
496                                                 vector<int> tempCentroids(numOTUs, 0);
497                                                 vector<short> tempChange(numOTUs, 0);
498                                                 
499                                                 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
500                                                 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
501                                                 
502                                                 for(int j=otuStart;j<otuStop;j++){
503                                                         centroids[j] = tempCentroids[j];
504                                                         change[j] = tempChange[j];
505                                                 }
506                                         }
507                                                                         
508                                         maxDelta = getNewWeights(); if (m->control_pressed) { break; }
509                                         double nLL = getLikelihood(); if (m->control_pressed) { break; }
510                                         checkCentroids(); if (m->control_pressed) { break; }
511                                         
512                                         for(int i=1;i<ncpus;i++){
513                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
514                                                 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
515                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
516                                         }
517                                         
518                                         calcNewDistancesParent(0, numSeqsOnCPU);
519
520                                         total = singleTau.size();
521
522                                         for(int i=1;i<ncpus;i++){
523                                                 int childTotal;
524                                                 int seqStart = i * numSeqs / ncpus;
525                                                 int seqStop = (i + 1) * numSeqs / ncpus;
526                                                 
527                                                 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
528
529                                                 vector<int> childSeqIndex(childTotal, 0);
530                                                 vector<double> childSingleTau(childTotal, 0);
531                                                 vector<double> childDist(numSeqs * numOTUs, 0);
532                                                 vector<int> otuIndex(childTotal, 0);
533                                                 
534                                                 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
535                                                 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
536                                                 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
537                                                 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
538
539                                                 int oldTotal = total;
540                                                 total += childTotal;
541                                                 singleTau.resize(total, 0);
542                                                 seqIndex.resize(total, 0);
543                                                 seqNumber.resize(total, 0);
544                                                 
545                                                 int childIndex = 0;
546                                                 
547                                                 for(int j=oldTotal;j<total;j++){
548                                                         int otuI = otuIndex[childIndex];
549                                                         int seqI = childSeqIndex[childIndex];
550                                                         
551                                                         singleTau[j] = childSingleTau[childIndex];
552                                                         
553                                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
554                                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
555                                                         nSeqsPerOTU[otuI]++;
556                                                         childIndex++;
557                                                 }
558                                                 
559                                                 int index = seqStart * numOTUs;
560                                                 for(int j=seqStart;j<seqStop;j++){
561                                                         for(int k=0;k<numOTUs;k++){
562                                                                 dist[index] = childDist[index];
563                                                                 index++;
564                                                         }
565                                                 }                                       
566                                         }
567                                         
568                                         iter++;
569                                         
570                                         m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');                  
571
572                                         if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
573                                                 int live = 1;
574                                                 for(int i=1;i<ncpus;i++){
575                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
576                                                 }
577                                         }
578                                         else{
579                                                 int live = 0;
580                                                 for(int i=1;i<ncpus;i++){
581                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
582                                                 }
583                                         }
584                                         
585                                 }       
586                                 
587                                 if (m->control_pressed) { break; }
588                                 
589                                 m->mothurOut("\nFinalizing...\n");
590                                 fill();
591                                 
592                                 if (m->control_pressed) { break; }
593                                 
594                                 setOTUs();
595                                 
596                                 vector<int> otuCounts(numOTUs, 0);
597                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
598                                 calcCentroidsDriver(0, numOTUs);
599                                 
600                                 if (m->control_pressed) { break; }
601                                 
602                                 writeQualities(otuCounts);      if (m->control_pressed) { break; }
603                                 writeSequences(otuCounts);      if (m->control_pressed) { break; }
604                                 writeNames(otuCounts);          if (m->control_pressed) { break; }
605                                 writeClusters(otuCounts);       if (m->control_pressed) { break; }
606                                 writeGroups();                          if (m->control_pressed) { break; }
607                                 
608                                                                  
609                                 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');                 
610                         }
611                 }
612                 else{
613                         int abort = 1;
614
615                         MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
616                         if(abort){      return 0;       }
617
618                         int numFiles;
619                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
620
621                         for(int i=0;i<numFiles;i++){
622                                 
623                                 if (m->control_pressed) { break; }
624                                 
625                                 //Now into the pyrodist part
626                                 bool live = 1;
627
628                                 char fileName[1024];
629                                 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
630                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
631                                 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
632                                 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
633                                 
634                                 flowDataIntI.resize(numSeqs * numFlowCells);
635                                 flowDataPrI.resize(numSeqs * numFlowCells);
636                                 mapUniqueToSeq.resize(numSeqs);
637                                 mapSeqToUnique.resize(numSeqs);
638                                 lengths.resize(numSeqs);
639                                 jointLookUp.resize(NUMBINS * NUMBINS);
640
641                                 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
642                                 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
643                                 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
644                                 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
645                                 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
646                                 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
647                                 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
648
649                                 flowFileName = string(fileName);
650                                 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
651                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
652                                 
653                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
654                                 
655                                 if (m->control_pressed) { break; }
656                                 
657                                 int done = 1;
658                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
659
660                                 //Now into the pyronoise part
661                                 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
662                                 
663                                 singleLookUp.resize(HOMOPS * NUMBINS);
664                                 uniqueFlowgrams.resize(numUniques * numFlowCells);
665                                 weight.resize(numOTUs);
666                                 centroids.resize(numOTUs);
667                                 change.resize(numOTUs);
668                                 dist.assign(numOTUs * numSeqs, 0);
669                                 nSeqsPerOTU.resize(numOTUs);
670                                 cumNumSeqs.resize(numOTUs);
671
672                                 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
673                                 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
674                                 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
675                                 
676                                 int startOTU = pid * numOTUs / ncpus;
677                                 int endOTU = (pid + 1) * numOTUs / ncpus;
678
679                                 int startSeq = pid * numSeqs / ncpus;
680                                 int endSeq = (pid + 1) * numSeqs /ncpus;
681                                 
682                                 int total;
683
684                                 while(live){
685                                         
686                                         if (m->control_pressed) { break; }
687                                         
688                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
689                                         singleTau.assign(total, 0.0000);
690                                         seqNumber.assign(total, 0);
691                                         seqIndex.assign(total, 0);
692
693                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
694                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
695                                         MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
696                                         MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
697                                         MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
698                                         MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
699                                         MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
700
701                                         calcCentroidsDriver(startOTU, endOTU);
702
703                                         MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
704                                         MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
705
706                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
707                                         MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
708                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
709
710                                         vector<int> otuIndex(total, 0);
711                                         calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
712                                         total = otuIndex.size();
713                                         
714                                         MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
715                                         MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
716                                         MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
717                                         MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
718                                         MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
719                                 
720                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
721                                 }
722                         }
723                 }
724                 
725                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
726                 
727                 MPI_Barrier(MPI_COMM_WORLD);
728
729                 
730                 if(compositeFASTAFileName != ""){
731                         outputNames.push_back(compositeFASTAFileName);
732                         outputNames.push_back(compositeNamesFileName);
733                 }
734
735                 m->mothurOutEndLine();
736                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
737                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
738                 m->mothurOutEndLine();
739                 
740                 return 0;
741
742         }
743         catch(exception& e) {
744                 m->errorOut(e, "ShhherCommand", "execute");
745                 exit(1);
746         }
747 }
748 /**************************************************************************************************/
749 string ShhherCommand::createNamesFile(){
750         try{
751                 
752                 vector<string> duplicateNames(numUniques, "");
753                 for(int i=0;i<numSeqs;i++){
754                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
755                 }
756                 
757                 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
758                 
759                 ofstream nameFile;
760                 m->openOutputFile(nameFileName, nameFile);
761                 
762                 for(int i=0;i<numUniques;i++){
763                         
764                         if (m->control_pressed) { break; }
765                         
766             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
767                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
768                 }
769                 
770                 nameFile.close();
771                 return  nameFileName;
772         }
773         catch(exception& e) {
774                 m->errorOut(e, "ShhherCommand", "createNamesFile");
775                 exit(1);
776         }
777 }
778 /**************************************************************************************************/
779
780 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
781         try{            
782                 ostringstream outStream;
783                 outStream.setf(ios::fixed, ios::floatfield);
784                 outStream.setf(ios::dec, ios::basefield);
785                 outStream.setf(ios::showpoint);
786                 outStream.precision(6);
787                 
788                 int begTime = time(NULL);
789                 double begClock = clock();
790                 
791                 for(int i=startSeq;i<stopSeq;i++){
792                         
793                         if (m->control_pressed) { break; }
794                         
795                         for(int j=0;j<i;j++){
796                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
797                                 
798                                 if(flowDistance < 1e-6){
799                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
800                                 }
801                                 else if(flowDistance <= cutoff){
802                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
803                                 }
804                         }
805                         if(i % 100 == 0){
806                                 m->mothurOut(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
807                         }
808                 }
809                 
810                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
811                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
812                 
813                 if (m->control_pressed) { return fDistFileName; }
814                 
815                 m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
816
817                 ofstream distFile(fDistFileName.c_str());
818                 distFile << outStream.str();            
819                 distFile.close();
820                 
821                 return fDistFileName;
822         }
823         catch(exception& e) {
824                 m->errorOut(e, "ShhherCommand", "flowDistMPI");
825                 exit(1);
826         }
827 }
828 /**************************************************************************************************/
829  
830 void ShhherCommand::getOTUData(string listFileName){
831     try {
832         
833         ifstream listFile;
834         m->openInputFile(listFileName, listFile);
835         string label;
836         
837         listFile >> label >> numOTUs;
838         
839         otuData.assign(numSeqs, 0);
840         cumNumSeqs.assign(numOTUs, 0);
841         nSeqsPerOTU.assign(numOTUs, 0);
842         aaP.clear();aaP.resize(numOTUs);
843         
844         seqNumber.clear();
845         aaI.clear();
846         seqIndex.clear();
847         
848         string singleOTU = "";
849         
850         for(int i=0;i<numOTUs;i++){
851             
852             if (m->control_pressed) { break; }
853             
854             listFile >> singleOTU;
855             
856             istringstream otuString(singleOTU);
857             
858             while(otuString){
859                 
860                 string seqName = "";
861                 
862                 for(int j=0;j<singleOTU.length();j++){
863                     char letter = otuString.get();
864                     
865                     if(letter != ','){
866                         seqName += letter;
867                     }
868                     else{
869                         map<string,int>::iterator nmIt = nameMap.find(seqName);
870                         int index = nmIt->second;
871                         
872                         nameMap.erase(nmIt);
873                         
874                         otuData[index] = i;
875                         nSeqsPerOTU[i]++;
876                         aaP[i].push_back(index);
877                         seqName = "";
878                     }
879                 }
880                 
881                 map<string,int>::iterator nmIt = nameMap.find(seqName);
882                 
883                 int index = nmIt->second;
884                 nameMap.erase(nmIt);
885                 
886                 otuData[index] = i;
887                 nSeqsPerOTU[i]++;
888                 aaP[i].push_back(index);        
889                 
890                 otuString.get();
891             }
892             
893             sort(aaP[i].begin(), aaP[i].end());
894             for(int j=0;j<nSeqsPerOTU[i];j++){
895                 seqNumber.push_back(aaP[i][j]);
896             }
897             for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
898                 aaP[i].push_back(0);
899             }
900             
901             
902         }
903         
904         for(int i=1;i<numOTUs;i++){
905             cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
906         }
907         aaI = aaP;
908         seqIndex = seqNumber;
909         
910         listFile.close();       
911         
912     }
913     catch(exception& e) {
914         m->errorOut(e, "ShhherCommand", "getOTUData");
915         exit(1);        
916     }           
917 }
918
919 /**************************************************************************************************/
920
921 void ShhherCommand::initPyroCluster(){                          
922     try{
923         if (numOTUs < processors) { processors = 1; }
924         
925         dist.assign(numSeqs * numOTUs, 0);
926         change.assign(numOTUs, 1);
927         centroids.assign(numOTUs, -1);
928         weight.assign(numOTUs, 0);
929         singleTau.assign(numSeqs, 1.0);
930         
931         nSeqsBreaks.assign(processors+1, 0);
932         nOTUsBreaks.assign(processors+1, 0);
933         
934         nSeqsBreaks[0] = 0;
935         for(int i=0;i<processors;i++){
936             nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
937             nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
938         }
939         nSeqsBreaks[processors] = numSeqs;
940         nOTUsBreaks[processors] = numOTUs;
941     }
942     catch(exception& e) {
943         m->errorOut(e, "ShhherCommand", "initPyroCluster");
944         exit(1);        
945     }
946 }
947
948 /**************************************************************************************************/
949
950 void ShhherCommand::fill(){
951     try {
952         int index = 0;
953         for(int i=0;i<numOTUs;i++){
954             
955             if (m->control_pressed) { break; }
956             
957             cumNumSeqs[i] = index;
958             for(int j=0;j<nSeqsPerOTU[i];j++){
959                 seqNumber[index] = aaP[i][j];
960                 seqIndex[index] = aaI[i][j];
961                 
962                 index++;
963             }
964         }
965     }
966     catch(exception& e) {
967         m->errorOut(e, "ShhherCommand", "fill");
968         exit(1);        
969     }           
970 }
971
972 /**************************************************************************************************/
973  
974 void ShhherCommand::getFlowData(){
975     try{
976         ifstream flowFile;
977         m->openInputFile(flowFileName, flowFile);
978         
979         string seqName;
980         seqNameVector.clear();
981         lengths.clear();
982         flowDataIntI.clear();
983         nameMap.clear();
984         
985         
986         int currentNumFlowCells;
987         
988         float intensity;
989         
990         flowFile >> numFlowCells;
991         int index = 0;//pcluster
992         while(!flowFile.eof()){
993             
994             if (m->control_pressed) { break; }
995             
996             flowFile >> seqName >> currentNumFlowCells;
997             lengths.push_back(currentNumFlowCells);
998             
999             seqNameVector.push_back(seqName);
1000             nameMap[seqName] = index++;//pcluster
1001             
1002             for(int i=0;i<numFlowCells;i++){
1003                 flowFile >> intensity;
1004                 if(intensity > 9.99)    {       intensity = 9.99;       }
1005                 int intI = int(100 * intensity + 0.0001);
1006                 flowDataIntI.push_back(intI);
1007             }
1008             m->gobble(flowFile);
1009         }
1010         flowFile.close();
1011         
1012         numSeqs = seqNameVector.size();         
1013         
1014         for(int i=0;i<numSeqs;i++){
1015             
1016             if (m->control_pressed) { break; }
1017             
1018             int iNumFlowCells = i * numFlowCells;
1019             for(int j=lengths[i];j<numFlowCells;j++){
1020                 flowDataIntI[iNumFlowCells + j] = 0;
1021             }
1022         }
1023         
1024     }
1025     catch(exception& e) {
1026         m->errorOut(e, "ShhherCommand", "getFlowData");
1027         exit(1);
1028     }
1029 }
1030 /**************************************************************************************************/
1031 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1032         
1033         try{
1034                 vector<double> newTau(numOTUs,0);
1035                 vector<double> norms(numSeqs, 0);
1036                 otuIndex.clear();
1037                 seqIndex.clear();
1038                 singleTau.clear();
1039                 
1040                 for(int i=startSeq;i<stopSeq;i++){
1041                         
1042                         if (m->control_pressed) { break; }
1043                         
1044                         double offset = 1e8;
1045                         int indexOffset = i * numOTUs;
1046                         
1047                         for(int j=0;j<numOTUs;j++){
1048                                 
1049                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1050                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1051                                 }
1052                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1053                                         offset = dist[indexOffset + j];
1054                                 }
1055                         }
1056                         
1057                         for(int j=0;j<numOTUs;j++){
1058                                 if(weight[j] > MIN_WEIGHT){
1059                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1060                                         norms[i] += newTau[j];
1061                                 }
1062                                 else{
1063                                         newTau[j] = 0.0;
1064                                 }
1065                         }
1066                         
1067                         for(int j=0;j<numOTUs;j++){
1068                 
1069                                 newTau[j] /= norms[i];
1070                                 
1071                                 if(newTau[j] > MIN_TAU){
1072                                         otuIndex.push_back(j);
1073                                         seqIndex.push_back(i);
1074                                         singleTau.push_back(newTau[j]);
1075                                 }
1076                         }
1077                         
1078                 }
1079         }
1080         catch(exception& e) {
1081                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1082                 exit(1);        
1083         }               
1084 }
1085
1086 /**************************************************************************************************/
1087
1088 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1089     
1090     try{
1091         
1092         int total = 0;
1093         vector<double> newTau(numOTUs,0);
1094         vector<double> norms(numSeqs, 0);
1095         nSeqsPerOTU.assign(numOTUs, 0);
1096         
1097         for(int i=startSeq;i<stopSeq;i++){
1098             
1099             if (m->control_pressed) { break; }
1100             
1101             int indexOffset = i * numOTUs;
1102             
1103             double offset = 1e8;
1104             
1105             for(int j=0;j<numOTUs;j++){
1106                 
1107                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1108                     dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1109                 }
1110                 
1111                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1112                     offset = dist[indexOffset + j];
1113                 }
1114             }
1115             
1116             for(int j=0;j<numOTUs;j++){
1117                 if(weight[j] > MIN_WEIGHT){
1118                     newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1119                     norms[i] += newTau[j];
1120                 }
1121                 else{
1122                     newTau[j] = 0.0;
1123                 }
1124             }
1125             
1126             for(int j=0;j<numOTUs;j++){
1127                 newTau[j] /= norms[i];
1128             }
1129             
1130             for(int j=0;j<numOTUs;j++){
1131                 if(newTau[j] > MIN_TAU){
1132                     
1133                     int oldTotal = total;
1134                     
1135                     total++;
1136                     
1137                     singleTau.resize(total, 0);
1138                     seqNumber.resize(total, 0);
1139                     seqIndex.resize(total, 0);
1140                     
1141                     singleTau[oldTotal] = newTau[j];
1142                     
1143                     aaP[j][nSeqsPerOTU[j]] = oldTotal;
1144                     aaI[j][nSeqsPerOTU[j]] = i;
1145                     nSeqsPerOTU[j]++;
1146                 }
1147             }
1148             
1149         }
1150         
1151     }
1152     catch(exception& e) {
1153         m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1154         exit(1);        
1155     }           
1156 }
1157
1158 /**************************************************************************************************/
1159
1160 void ShhherCommand::setOTUs(){
1161     
1162     try {
1163         vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1164         
1165         for(int i=0;i<numOTUs;i++){
1166             
1167             if (m->control_pressed) { break; }
1168             
1169             for(int j=0;j<nSeqsPerOTU[i];j++){
1170                 int index = cumNumSeqs[i] + j;
1171                 double tauValue = singleTau[seqNumber[index]];
1172                 int sIndex = seqIndex[index];
1173                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
1174             }
1175         }
1176         
1177         for(int i=0;i<numSeqs;i++){
1178             double maxTau = -1.0000;
1179             int maxOTU = -1;
1180             for(int j=0;j<numOTUs;j++){
1181                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1182                     maxTau = bigTauMatrix[i * numOTUs + j];
1183                     maxOTU = j;
1184                 }
1185             }
1186             
1187             otuData[i] = maxOTU;
1188         }
1189         
1190         nSeqsPerOTU.assign(numOTUs, 0);         
1191         
1192         for(int i=0;i<numSeqs;i++){
1193             int index = otuData[i];
1194             
1195             singleTau[i] = 1.0000;
1196             dist[i] = 0.0000;
1197             
1198             aaP[index][nSeqsPerOTU[index]] = i;
1199             aaI[index][nSeqsPerOTU[index]] = i;
1200             
1201             nSeqsPerOTU[index]++;
1202         }
1203         fill(); 
1204     }
1205     catch(exception& e) {
1206         m->errorOut(e, "ShhherCommand", "setOTUs");
1207         exit(1);        
1208     }           
1209 }
1210
1211 /**************************************************************************************************/
1212  
1213 void ShhherCommand::getUniques(){
1214     try{
1215         
1216         
1217         numUniques = 0;
1218         uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
1219         uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
1220         uniqueLengths.assign(numSeqs, 0);
1221         mapSeqToUnique.assign(numSeqs, -1);
1222         mapUniqueToSeq.assign(numSeqs, -1);
1223         
1224         vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
1225         
1226         for(int i=0;i<numSeqs;i++){
1227             
1228             if (m->control_pressed) { break; }
1229             
1230             int index = 0;
1231             
1232             vector<short> current(numFlowCells);
1233             for(int j=0;j<numFlowCells;j++){
1234                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
1235             }
1236             
1237             for(int j=0;j<numUniques;j++){
1238                 int offset = j * numFlowCells;
1239                 bool toEnd = 1;
1240                 
1241                 int shorterLength;
1242                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
1243                 else                                                            {       shorterLength = uniqueLengths[j];       }
1244                 
1245                 for(int k=0;k<shorterLength;k++){
1246                     if(current[k] != uniqueFlowgrams[offset + k]){
1247                         toEnd = 0;
1248                         break;
1249                     }
1250                 }
1251                 
1252                 if(toEnd){
1253                     mapSeqToUnique[i] = j;
1254                     uniqueCount[j]++;
1255                     index = j;
1256                     if(lengths[i] > uniqueLengths[j])   {       uniqueLengths[j] = lengths[i];  }
1257                     break;
1258                 }
1259                 index++;
1260             }
1261             
1262             if(index == numUniques){
1263                 uniqueLengths[numUniques] = lengths[i];
1264                 uniqueCount[numUniques] = 1;
1265                 mapSeqToUnique[i] = numUniques;//anMap
1266                 mapUniqueToSeq[numUniques] = i;//anF
1267                 
1268                 for(int k=0;k<numFlowCells;k++){
1269                     uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
1270                     uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
1271                 }
1272                 
1273                 numUniques++;
1274             }
1275         }
1276         uniqueFlowDataIntI.resize(numFlowCells * numUniques);
1277         uniqueLengths.resize(numUniques);       
1278         
1279         flowDataPrI.resize(numSeqs * numFlowCells, 0);
1280         for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
1281     }
1282     catch(exception& e) {
1283         m->errorOut(e, "ShhherCommand", "getUniques");
1284         exit(1);
1285     }
1286 }
1287
1288 /**************************************************************************************************/
1289
1290 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
1291     try{
1292         int minLength = lengths[mapSeqToUnique[seqA]];
1293         if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
1294         
1295         int ANumFlowCells = seqA * numFlowCells;
1296         int BNumFlowCells = seqB * numFlowCells;
1297         
1298         float dist = 0;
1299         
1300         for(int i=0;i<minLength;i++){
1301             
1302             if (m->control_pressed) { break; }
1303             
1304             int flowAIntI = flowDataIntI[ANumFlowCells + i];
1305             float flowAPrI = flowDataPrI[ANumFlowCells + i];
1306             
1307             int flowBIntI = flowDataIntI[BNumFlowCells + i];
1308             float flowBPrI = flowDataPrI[BNumFlowCells + i];
1309             dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
1310         }
1311         
1312         dist /= (float) minLength;
1313         return dist;
1314     }
1315     catch(exception& e) {
1316         m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
1317         exit(1);
1318     }
1319 }
1320
1321 //**********************************************************************************************************************/
1322
1323 string ShhherCommand::cluster(string distFileName, string namesFileName){
1324     try {
1325         
1326         ReadMatrix* read = new ReadColumnMatrix(distFileName);  
1327         read->setCutoff(cutoff);
1328         
1329         NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1330         clusterNameMap->readMap();
1331         read->read(clusterNameMap);
1332         
1333         ListVector* list = read->getListVector();
1334         SparseMatrix* matrix = read->getMatrix();
1335         
1336         delete read; 
1337         delete clusterNameMap; 
1338         
1339         RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1340         
1341         Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
1342         string tag = cluster->getTag();
1343         
1344         double clusterCutoff = cutoff;
1345         while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1346             
1347             if (m->control_pressed) { break; }
1348             
1349             cluster->update(clusterCutoff);
1350         }
1351         
1352         list->setLabel(toString(cutoff));
1353         
1354         string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
1355         ofstream listFile;
1356         m->openOutputFile(listFileName, listFile);
1357         list->print(listFile);
1358         listFile.close();
1359         
1360         delete matrix;  delete cluster; delete rabund; delete list;
1361         
1362         return listFileName;
1363     }
1364     catch(exception& e) {
1365         m->errorOut(e, "ShhherCommand", "cluster");
1366         exit(1);        
1367     }           
1368 }
1369
1370 /**************************************************************************************************/
1371
1372 void ShhherCommand::calcCentroidsDriver(int start, int finish){                          
1373     
1374     //this function gets the most likely homopolymer length at a flow position for a group of sequences
1375     //within an otu
1376     
1377     try{
1378         
1379         for(int i=start;i<finish;i++){
1380             
1381             if (m->control_pressed) { break; }
1382             
1383             double count = 0;
1384             int position = 0;
1385             int minFlowGram = 100000000;
1386             double minFlowValue = 1e8;
1387             change[i] = 0; //FALSE
1388             
1389             for(int j=0;j<nSeqsPerOTU[i];j++){
1390                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1391             }
1392             
1393             if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1394                 vector<double> adF(nSeqsPerOTU[i]);
1395                 vector<int> anL(nSeqsPerOTU[i]);
1396                 
1397                 for(int j=0;j<nSeqsPerOTU[i];j++){
1398                     int index = cumNumSeqs[i] + j;
1399                     int nI = seqIndex[index];
1400                     int nIU = mapSeqToUnique[nI];
1401                     
1402                     int k;
1403                     for(k=0;k<position;k++){
1404                         if(nIU == anL[k]){
1405                             break;
1406                         }
1407                     }
1408                     if(k == position){
1409                         anL[position] = nIU;
1410                         adF[position] = 0.0000;
1411                         position++;
1412                     }                                           
1413                 }
1414                 
1415                 for(int j=0;j<nSeqsPerOTU[i];j++){
1416                     int index = cumNumSeqs[i] + j;
1417                     int nI = seqIndex[index];
1418                     
1419                     double tauValue = singleTau[seqNumber[index]];
1420                     
1421                     for(int k=0;k<position;k++){
1422                         double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1423                         adF[k] += dist * tauValue;
1424                     }
1425                 }
1426                 
1427                 for(int j=0;j<position;j++){
1428                     if(adF[j] < minFlowValue){
1429                         minFlowGram = j;
1430                         minFlowValue = adF[j];
1431                     }
1432                 }
1433                 
1434                 if(centroids[i] != anL[minFlowGram]){
1435                     change[i] = 1;
1436                     centroids[i] = anL[minFlowGram];
1437                 }
1438             }
1439             else if(centroids[i] != -1){
1440                 change[i] = 1;
1441                 centroids[i] = -1;                      
1442             }
1443         }
1444     }
1445     catch(exception& e) {
1446         m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1447         exit(1);        
1448     }           
1449 }
1450
1451 /**************************************************************************************************/
1452
1453 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1454     try{
1455         
1456         int flowAValue = cent * numFlowCells;
1457         int flowBValue = flow * numFlowCells;
1458         
1459         double dist = 0;
1460         
1461         for(int i=0;i<length;i++){
1462             dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1463             flowAValue++;
1464             flowBValue++;
1465         }
1466         
1467         return dist / (double)length;
1468     }
1469     catch(exception& e) {
1470         m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1471         exit(1);        
1472     }           
1473 }
1474
1475 /**************************************************************************************************/
1476
1477 double ShhherCommand::getNewWeights(){
1478     try{
1479         
1480         double maxChange = 0;
1481         
1482         for(int i=0;i<numOTUs;i++){
1483             
1484             if (m->control_pressed) { break; }
1485             
1486             double difference = weight[i];
1487             weight[i] = 0;
1488             
1489             for(int j=0;j<nSeqsPerOTU[i];j++){
1490                 int index = cumNumSeqs[i] + j;
1491                 double tauValue = singleTau[seqNumber[index]];
1492                 weight[i] += tauValue;
1493             }
1494             
1495             difference = fabs(weight[i] - difference);
1496             if(difference > maxChange){ maxChange = difference; }
1497         }
1498         return maxChange;
1499     }
1500     catch(exception& e) {
1501         m->errorOut(e, "ShhherCommand", "getNewWeights");
1502         exit(1);        
1503     }           
1504 }
1505  
1506  /**************************************************************************************************/
1507  
1508 double ShhherCommand::getLikelihood(){
1509     
1510     try{
1511         
1512         vector<long double> P(numSeqs, 0);
1513         int effNumOTUs = 0;
1514         
1515         for(int i=0;i<numOTUs;i++){
1516             if(weight[i] > MIN_WEIGHT){
1517                 effNumOTUs++;
1518             }
1519         }
1520         
1521         string hold;
1522         for(int i=0;i<numOTUs;i++){
1523             
1524             if (m->control_pressed) { break; }
1525             
1526             for(int j=0;j<nSeqsPerOTU[i];j++){
1527                 int index = cumNumSeqs[i] + j;
1528                 int nI = seqIndex[index];
1529                 double singleDist = dist[seqNumber[index]];
1530                 
1531                 P[nI] += weight[i] * exp(-singleDist * sigma);
1532             }
1533         }
1534         double nLL = 0.00;
1535         for(int i=0;i<numSeqs;i++){
1536             if(P[i] == 0){      P[i] = DBL_EPSILON;     }
1537             
1538             nLL += -log(P[i]);
1539         }
1540         
1541         nLL = nLL -(double)numSeqs * log(sigma);
1542         
1543         return nLL; 
1544     }
1545     catch(exception& e) {
1546         m->errorOut(e, "ShhherCommand", "getNewWeights");
1547         exit(1);        
1548     }           
1549 }
1550
1551 /**************************************************************************************************/
1552
1553 void ShhherCommand::checkCentroids(){
1554     try{
1555         vector<int> unique(numOTUs, 1);
1556         
1557         for(int i=0;i<numOTUs;i++){
1558             if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1559                 unique[i] = -1;
1560             }
1561         }
1562         
1563         for(int i=0;i<numOTUs;i++){
1564             
1565             if (m->control_pressed) { break; }
1566             
1567             if(unique[i] == 1){
1568                 for(int j=i+1;j<numOTUs;j++){
1569                     if(unique[j] == 1){
1570                         
1571                         if(centroids[j] == centroids[i]){
1572                             unique[j] = 0;
1573                             centroids[j] = -1;
1574                             
1575                             weight[i] += weight[j];
1576                             weight[j] = 0.0;
1577                         }
1578                     }
1579                 }
1580             }
1581         }
1582     }
1583     catch(exception& e) {
1584         m->errorOut(e, "ShhherCommand", "checkCentroids");
1585         exit(1);        
1586     }           
1587 }
1588  /**************************************************************************************************/
1589
1590
1591  
1592 void ShhherCommand::writeQualities(vector<int> otuCounts){
1593     
1594     try {
1595         string thisOutputDir = outputDir;
1596         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1597         string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.qual";
1598         
1599         ofstream qualityFile;
1600         m->openOutputFile(qualityFileName, qualityFile);
1601         
1602         qualityFile.setf(ios::fixed, ios::floatfield);
1603         qualityFile.setf(ios::showpoint);
1604         qualityFile << setprecision(6);
1605         
1606         vector<vector<int> > qualities(numOTUs);
1607         vector<double> pr(HOMOPS, 0);
1608         
1609         
1610         for(int i=0;i<numOTUs;i++){
1611             
1612             if (m->control_pressed) { break; }
1613             
1614             int index = 0;
1615             int base = 0;
1616             
1617             if(nSeqsPerOTU[i] > 0){
1618                 qualities[i].assign(1024, -1);
1619                 
1620                 while(index < numFlowCells){
1621                     double maxPrValue = 1e8;
1622                     short maxPrIndex = -1;
1623                     double count = 0.0000;
1624                     
1625                     pr.assign(HOMOPS, 0);
1626                     
1627                     for(int j=0;j<nSeqsPerOTU[i];j++){
1628                         int lIndex = cumNumSeqs[i] + j;
1629                         double tauValue = singleTau[seqNumber[lIndex]];
1630                         int sequenceIndex = aaI[i][j];
1631                         short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1632                         
1633                         count += tauValue;
1634                         
1635                         for(int s=0;s<HOMOPS;s++){
1636                             pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1637                         }
1638                     }
1639                     
1640                     maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1641                     maxPrValue = pr[maxPrIndex];
1642                     
1643                     if(count > MIN_COUNT){
1644                         double U = 0.0000;
1645                         double norm = 0.0000;
1646                         
1647                         for(int s=0;s<HOMOPS;s++){
1648                             norm += exp(-(pr[s] - maxPrValue));
1649                         }
1650                         
1651                         for(int s=1;s<=maxPrIndex;s++){
1652                             int value = 0;
1653                             double temp = 0.0000;
1654                             
1655                             U += exp(-(pr[s-1]-maxPrValue))/norm;
1656                             
1657                             if(U>0.00){
1658                                 temp = log10(U);
1659                             }
1660                             else{
1661                                 temp = -10.1;
1662                             }
1663                             temp = floor(-10 * temp);
1664                             value = (int)floor(temp);
1665                             if(value > 100){    value = 100;    }
1666                             
1667                             qualities[i][base] = (int)value;
1668                             base++;
1669                         }
1670                     }
1671                     
1672                     index++;
1673                 }
1674             }
1675             
1676             
1677             if(otuCounts[i] > 0){
1678                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1679                 
1680                 int j=4;        //need to get past the first four bases
1681                 while(qualities[i][j] != -1){
1682                     qualityFile << qualities[i][j] << ' ';
1683                     j++;
1684                 }
1685                 qualityFile << endl;
1686             }
1687         }
1688         qualityFile.close();
1689         outputNames.push_back(qualityFileName);
1690         
1691     }
1692     catch(exception& e) {
1693         m->errorOut(e, "ShhherCommand", "writeQualities");
1694         exit(1);        
1695     }           
1696 }
1697
1698 /**************************************************************************************************/
1699
1700 void ShhherCommand::writeSequences(vector<int> otuCounts){
1701     try {
1702         string thisOutputDir = outputDir;
1703         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1704         string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.fasta";
1705         ofstream fastaFile;
1706         m->openOutputFile(fastaFileName, fastaFile);
1707         
1708         vector<string> names(numOTUs, "");
1709         
1710         for(int i=0;i<numOTUs;i++){
1711             
1712             if (m->control_pressed) { break; }
1713             
1714             int index = centroids[i];
1715             
1716             if(otuCounts[i] > 0){
1717                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1718                 
1719                 string newSeq = "";
1720                 
1721                 for(int j=0;j<numFlowCells;j++){
1722                     
1723                     char base = flowOrder[j % 4];
1724                     for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1725                         newSeq += base;
1726                     }
1727                 }
1728                 
1729                 fastaFile << newSeq.substr(4) << endl;
1730             }
1731         }
1732         fastaFile.close();
1733         
1734         outputNames.push_back(fastaFileName);
1735         
1736         if(compositeFASTAFileName != ""){
1737             m->appendFiles(fastaFileName, thisOutputDir + compositeFASTAFileName);
1738         }
1739     }
1740     catch(exception& e) {
1741         m->errorOut(e, "ShhherCommand", "writeSequences");
1742         exit(1);        
1743     }           
1744 }
1745
1746 /**************************************************************************************************/
1747
1748 void ShhherCommand::writeNames(vector<int> otuCounts){
1749     try {
1750         string thisOutputDir = outputDir;
1751         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1752         string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.names";
1753         ofstream nameFile;
1754         m->openOutputFile(nameFileName, nameFile);
1755         
1756         for(int i=0;i<numOTUs;i++){
1757             
1758             if (m->control_pressed) { break; }
1759             
1760             if(otuCounts[i] > 0){
1761                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1762                 
1763                 for(int j=1;j<nSeqsPerOTU[i];j++){
1764                     nameFile << ',' << seqNameVector[aaI[i][j]];
1765                 }
1766                 
1767                 nameFile << endl;
1768             }
1769         }
1770         nameFile.close();
1771         outputNames.push_back(nameFileName);
1772         
1773         
1774         if(compositeNamesFileName != ""){
1775             m->appendFiles(nameFileName, thisOutputDir + compositeNamesFileName);
1776         }               
1777     }
1778     catch(exception& e) {
1779         m->errorOut(e, "ShhherCommand", "writeNames");
1780         exit(1);        
1781     }           
1782 }
1783
1784 /**************************************************************************************************/
1785
1786 void ShhherCommand::writeGroups(){
1787     try {
1788         string thisOutputDir = outputDir;
1789         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1790         string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1791         string groupFileName = fileRoot + "shhh.groups";
1792         ofstream groupFile;
1793         m->openOutputFile(groupFileName, groupFile);
1794         
1795         for(int i=0;i<numSeqs;i++){
1796             if (m->control_pressed) { break; }
1797             groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
1798         }
1799         groupFile.close();
1800         outputNames.push_back(groupFileName);
1801         
1802     }
1803     catch(exception& e) {
1804         m->errorOut(e, "ShhherCommand", "writeGroups");
1805         exit(1);        
1806     }           
1807 }
1808
1809 /**************************************************************************************************/
1810
1811 void ShhherCommand::writeClusters(vector<int> otuCounts){
1812     try {
1813         string thisOutputDir = outputDir;
1814         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1815         string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.counts";
1816         ofstream otuCountsFile;
1817         m->openOutputFile(otuCountsFileName, otuCountsFile);
1818         
1819         string bases = flowOrder;
1820         
1821         for(int i=0;i<numOTUs;i++){
1822             
1823             if (m->control_pressed) {
1824                 break;
1825             }
1826             //output the translated version of the centroid sequence for the otu
1827             if(otuCounts[i] > 0){
1828                 int index = centroids[i];
1829                 
1830                 otuCountsFile << "ideal\t";
1831                 for(int j=8;j<numFlowCells;j++){
1832                     char base = bases[j % 4];
1833                     for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1834                         otuCountsFile << base;
1835                     }
1836                 }
1837                 otuCountsFile << endl;
1838                 
1839                 for(int j=0;j<nSeqsPerOTU[i];j++){
1840                     int sequence = aaI[i][j];
1841                     otuCountsFile << seqNameVector[sequence] << '\t';
1842                     
1843                     string newSeq = "";
1844                     
1845                     for(int k=0;k<lengths[sequence];k++){
1846                         char base = bases[k % 4];
1847                         int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1848                         
1849                         for(int s=0;s<freq;s++){
1850                             newSeq += base;
1851                             //otuCountsFile << base;
1852                         }
1853                     }
1854                     otuCountsFile << newSeq.substr(4) << endl;
1855                 }
1856                 otuCountsFile << endl;
1857             }
1858         }
1859         otuCountsFile.close();
1860         outputNames.push_back(otuCountsFileName);
1861         
1862     }
1863     catch(exception& e) {
1864         m->errorOut(e, "ShhherCommand", "writeClusters");
1865         exit(1);        
1866     }           
1867 }
1868
1869 #else
1870 //**********************************************************************************************************************
1871
1872 int ShhherCommand::execute(){
1873         try {
1874                 if (abort == true) { return 0; }
1875                 
1876                 getSingleLookUp();      if (m->control_pressed) { return 0; }
1877                 getJointLookUp();       if (m->control_pressed) { return 0; }
1878                 
1879         int numFiles = flowFileVector.size();
1880                 
1881         if (numFiles < processors) { processors = numFiles; }
1882         
1883 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1884         if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size()); }
1885         else { createProcesses(flowFileVector); } //each processor processes one file
1886 #else
1887         driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size());
1888 #endif
1889         
1890                 if(compositeFASTAFileName != ""){
1891                         outputNames.push_back(compositeFASTAFileName);
1892                         outputNames.push_back(compositeNamesFileName);
1893                 }
1894
1895                 m->mothurOutEndLine();
1896                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
1897                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
1898                 m->mothurOutEndLine();
1899                 
1900                 return 0;
1901         }
1902         catch(exception& e) {
1903                 m->errorOut(e, "ShhherCommand", "execute");
1904                 exit(1);
1905         }
1906 }
1907 #endif
1908 /**************************************************************************************************/
1909
1910 int ShhherCommand::createProcesses(vector<string> filenames){
1911     try {
1912         vector<int> processIDS;
1913                 int process = 1;
1914                 int num = 0;
1915                 
1916                 //sanity check
1917                 if (filenames.size() < processors) { processors = filenames.size(); }
1918                 
1919                 //divide the groups between the processors
1920                 vector<linePair> lines;
1921                 int numFilesPerProcessor = filenames.size() / processors;
1922                 for (int i = 0; i < processors; i++) {
1923                         int startIndex =  i * numFilesPerProcessor;
1924                         int endIndex = (i+1) * numFilesPerProcessor;
1925                         if(i == (processors - 1)){      endIndex = filenames.size();    }
1926                         lines.push_back(linePair(startIndex, endIndex));
1927                 }
1928                 
1929         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)          
1930                 
1931                 //loop through and create all the processes you want
1932                 while (process != processors) {
1933                         int pid = fork();
1934                         
1935                         if (pid > 0) {
1936                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1937                                 process++;
1938                         }else if (pid == 0){
1939                                 num = driver(filenames, compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName  + toString(getpid()) + ".temp", lines[process].start, lines[process].end);
1940                                 exit(0);
1941                         }else { 
1942                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
1943                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1944                                 exit(0);
1945                         }
1946                 }
1947                 
1948                 //do my part
1949                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[0].start, lines[0].end);
1950                 
1951                 //force parent to wait until all the processes are done
1952                 for (int i=0;i<processIDS.size();i++) { 
1953                         int temp = processIDS[i];
1954                         wait(&temp);
1955                 }
1956         
1957         #else
1958         
1959         //////////////////////////////////////////////////////////////////////////////////////////////////////
1960         
1961         /////////////////////// NOT WORKING, ACCESS VIOLATION ON READ OF FLOWGRAMS IN THREAD /////////////////
1962         
1963         //////////////////////////////////////////////////////////////////////////////////////////////////////
1964                 //Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct. 
1965                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
1966                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1967                 
1968                 vector<shhhFlowsData*> pDataArray; 
1969                 DWORD   dwThreadIdArray[processors-1];
1970                 HANDLE  hThreadArray[processors-1]; 
1971                 
1972                 //Create processor worker threads.
1973                 for( int i=0; i<processors-1; i++ ){
1974                         // Allocate memory for thread data.
1975                         string extension = "";
1976                         if (i != 0) { extension = toString(i) + ".temp"; }
1977                         
1978             shhhFlowsData* tempFlow = new shhhFlowsData(filenames, (compositeFASTAFileName + extension), (compositeNamesFileName + extension), outputDir, flowOrder, jointLookUp, singleLookUp, m, lines[i].start, lines[i].end, cutoff, sigma, minDelta, maxIters, i);
1979                         pDataArray.push_back(tempFlow);
1980                         processIDS.push_back(i);
1981             
1982                         hThreadArray[i] = CreateThread(NULL, 0, ShhhFlowsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
1983                 }
1984                 
1985                 //using the main process as a worker saves time and memory
1986                 //do my part
1987                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[processors-1].start, lines[processors-1].end);
1988                 
1989                 //Wait until all threads have terminated.
1990                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1991                 
1992                 //Close all thread handles and free memory allocations.
1993                 for(int i=0; i < pDataArray.size(); i++){
1994                         for(int j=0; j < pDataArray[i]->outputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); }
1995                         CloseHandle(hThreadArray[i]);
1996                         delete pDataArray[i];
1997                 }
1998                 
1999         #endif
2000         
2001         for (int i=0;i<processIDS.size();i++) { 
2002             if (compositeFASTAFileName != "") {
2003                 m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
2004                 m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
2005                 m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp"));
2006                 m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp"));
2007             }
2008         }
2009         
2010         return 0;
2011         
2012     }
2013         catch(exception& e) {
2014                 m->errorOut(e, "ShhherCommand", "createProcesses");
2015                 exit(1);
2016         }
2017 }
2018 /**************************************************************************************************/
2019
2020 int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName, int start, int end){
2021     try {
2022         
2023         for(int i=start;i<end;i++){
2024                         
2025                         if (m->control_pressed) { break; }
2026                         
2027                         string flowFileName = filenames[i];
2028             
2029                         m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
2030                         m->mothurOut("Reading flowgrams...\n");
2031                         
2032             vector<string> seqNameVector;
2033             vector<int> lengths;
2034             vector<short> flowDataIntI;
2035             vector<double> flowDataPrI;
2036             map<string, int> nameMap;
2037             vector<short> uniqueFlowgrams;
2038             vector<int> uniqueCount;
2039             vector<int> mapSeqToUnique;
2040             vector<int> mapUniqueToSeq;
2041             vector<int> uniqueLengths;
2042             int numFlowCells;
2043             
2044             int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
2045                         
2046                         if (m->control_pressed) { break; }
2047                         
2048                         m->mothurOut("Identifying unique flowgrams...\n");
2049                         int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
2050                         
2051                         if (m->control_pressed) { break; }
2052                         
2053                         m->mothurOut("Calculating distances between flowgrams...\n");
2054             string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
2055             unsigned long long begTime = time(NULL);
2056             double begClock = clock();
2057         
2058             flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);     
2059             
2060             m->mothurOutEndLine();
2061             m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
2062
2063             
2064                         string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
2065                         createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
2066                         
2067                         if (m->control_pressed) { break; }
2068                         
2069                         m->mothurOut("\nClustering flowgrams...\n");
2070             string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
2071                         cluster(listFileName, distFileName, namesFileName);
2072                         
2073                         if (m->control_pressed) { break; }
2074             
2075             vector<int> otuData;
2076             vector<int> cumNumSeqs;
2077             vector<int> nSeqsPerOTU;
2078             vector<vector<int> > aaP;   //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2079             vector<vector<int> > aaI;   //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2080             vector<int> seqNumber;              //tMaster->anP:         the sequence id number sorted by OTU
2081             vector<int> seqIndex;               //tMaster->anI;         the index that corresponds to seqNumber
2082
2083                         
2084                         int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
2085                         
2086                         if (m->control_pressed) { break; }
2087                         
2088                         m->mothurRemove(distFileName);
2089                         m->mothurRemove(namesFileName);
2090                         m->mothurRemove(listFileName);
2091                         
2092             vector<double> dist;                //adDist - distance of sequences to centroids
2093             vector<short> change;               //did the centroid sequence change? 0 = no; 1 = yes
2094             vector<int> centroids;              //the representative flowgram for each cluster m
2095             vector<double> weight;
2096             vector<double> singleTau;   //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2097             vector<int> nSeqsBreaks;
2098             vector<int> nOTUsBreaks;
2099             
2100                         dist.assign(numSeqs * numOTUs, 0);
2101             change.assign(numOTUs, 1);
2102             centroids.assign(numOTUs, -1);
2103             weight.assign(numOTUs, 0);
2104             singleTau.assign(numSeqs, 1.0);
2105             
2106             nSeqsBreaks.assign(2, 0);
2107             nOTUsBreaks.assign(2, 0);
2108             
2109             nSeqsBreaks[0] = 0;
2110             nSeqsBreaks[1] = numSeqs;
2111             nOTUsBreaks[1] = numOTUs;
2112                         
2113                         if (m->control_pressed) { break; }
2114                         
2115                         double maxDelta = 0;
2116                         int iter = 0;
2117                         
2118                         begClock = clock();
2119                         begTime = time(NULL);
2120             
2121                         m->mothurOut("\nDenoising flowgrams...\n");
2122                         m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
2123                         
2124                         while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
2125                                 
2126                                 if (m->control_pressed) { break; }
2127                                 
2128                                 double cycClock = clock();
2129                                 unsigned long long cycTime = time(NULL);
2130                                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2131                                 
2132                                 if (m->control_pressed) { break; }
2133                 
2134                                 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2135                                 
2136                                 if (m->control_pressed) { break; }
2137                 
2138                                 maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);  
2139                 
2140                 if (m->control_pressed) { break; }
2141                 
2142                                 double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight); 
2143                 
2144                 if (m->control_pressed) { break; }
2145                 
2146                                 checkCentroids(numOTUs, centroids, weight);
2147                                 
2148                                 if (m->control_pressed) { break; }
2149                                 
2150                                 calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU,  dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
2151                                 
2152                                 if (m->control_pressed) { break; }
2153                                 
2154                                 iter++;
2155                                 
2156                                 m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
2157                 
2158                         }       
2159                         
2160                         if (m->control_pressed) { break; }
2161                         
2162                         m->mothurOut("\nFinalizing...\n");
2163                         fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2164                         
2165                         if (m->control_pressed) { break; }
2166                         
2167                         setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
2168                         
2169                         if (m->control_pressed) { break; }
2170                         
2171                         vector<int> otuCounts(numOTUs, 0);
2172                         for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
2173                         
2174                         calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber); 
2175             
2176             if (m->control_pressed) { break; }
2177             
2178                         writeQualities(numOTUs, numFlowCells, flowFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
2179                         writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, flowFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
2180                         writeNames(thisCompositeNamesFileName, numOTUs, flowFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);                              if (m->control_pressed) { break; }
2181                         writeClusters(flowFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);                       if (m->control_pressed) { break; }
2182                         writeGroups(flowFileName, numSeqs, seqNameVector);                                              if (m->control_pressed) { break; }
2183                         
2184                         m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
2185                 }
2186                 
2187         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
2188         
2189         return 0;
2190         
2191     }catch(exception& e) {
2192             m->errorOut(e, "ShhherCommand", "driver");
2193             exit(1);
2194     }
2195 }
2196
2197 /**************************************************************************************************/
2198 int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
2199         try{
2200        
2201                 ifstream flowFile;
2202        
2203                 m->openInputFile(filename, flowFile);
2204                 
2205                 string seqName;
2206                 int currentNumFlowCells;
2207                 float intensity;
2208         thisSeqNameVector.clear();
2209                 thisLengths.clear();
2210                 thisFlowDataIntI.clear();
2211                 thisNameMap.clear();
2212                 
2213                 flowFile >> numFlowCells;
2214                 int index = 0;//pcluster
2215                 while(!flowFile.eof()){
2216                         
2217                         if (m->control_pressed) { break; }
2218                         
2219                         flowFile >> seqName >> currentNumFlowCells;
2220                         thisLengths.push_back(currentNumFlowCells);
2221            
2222                         thisSeqNameVector.push_back(seqName);
2223                         thisNameMap[seqName] = index++;//pcluster
2224
2225                         for(int i=0;i<numFlowCells;i++){
2226                                 flowFile >> intensity;
2227                                 if(intensity > 9.99)    {       intensity = 9.99;       }
2228                                 int intI = int(100 * intensity + 0.0001);
2229                                 thisFlowDataIntI.push_back(intI);
2230                         }
2231                         m->gobble(flowFile);
2232                 }
2233                 flowFile.close();
2234                 
2235                 int numSeqs = thisSeqNameVector.size();         
2236                 
2237                 for(int i=0;i<numSeqs;i++){
2238                         
2239                         if (m->control_pressed) { break; }
2240                         
2241                         int iNumFlowCells = i * numFlowCells;
2242                         for(int j=thisLengths[i];j<numFlowCells;j++){
2243                                 thisFlowDataIntI[iNumFlowCells + j] = 0;
2244                         }
2245                 }
2246         
2247         return numSeqs;
2248                 
2249         }
2250         catch(exception& e) {
2251                 m->errorOut(e, "ShhherCommand", "getFlowData");
2252                 exit(1);
2253         }
2254 }
2255 /**************************************************************************************************/
2256
2257 int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2258         try{            
2259         
2260                 ostringstream outStream;
2261                 outStream.setf(ios::fixed, ios::floatfield);
2262                 outStream.setf(ios::dec, ios::basefield);
2263                 outStream.setf(ios::showpoint);
2264                 outStream.precision(6);
2265                 
2266                 int begTime = time(NULL);
2267                 double begClock = clock();
2268         
2269                 for(int i=0;i<stopSeq;i++){
2270                         
2271                         if (m->control_pressed) { break; }
2272                         
2273                         for(int j=0;j<i;j++){
2274                                 float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2275                 
2276                                 if(flowDistance < 1e-6){
2277                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
2278                                 }
2279                                 else if(flowDistance <= cutoff){
2280                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
2281                                 }
2282                         }
2283                         if(i % 100 == 0){
2284                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
2285                                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2286                                 m->mothurOutEndLine();
2287                         }
2288                 }
2289                 
2290                 ofstream distFile(distFileName.c_str());
2291                 distFile << outStream.str();            
2292                 distFile.close();
2293                 
2294                 if (m->control_pressed) {}
2295                 else {
2296                         m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
2297                         m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2298                         m->mothurOutEndLine();
2299                 }
2300         
2301         return 0;
2302         }
2303         catch(exception& e) {
2304                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
2305                 exit(1);
2306         }
2307 }
2308 /**************************************************************************************************/
2309
2310 float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2311         try{
2312                 int minLength = lengths[mapSeqToUnique[seqA]];
2313                 if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
2314                 
2315                 int ANumFlowCells = seqA * numFlowCells;
2316                 int BNumFlowCells = seqB * numFlowCells;
2317                 
2318                 float dist = 0;
2319                 
2320                 for(int i=0;i<minLength;i++){
2321                         
2322                         if (m->control_pressed) { break; }
2323                         
2324                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
2325                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
2326                         
2327                         int flowBIntI = flowDataIntI[BNumFlowCells + i];
2328                         float flowBPrI = flowDataPrI[BNumFlowCells + i];
2329                         dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
2330                 }
2331                 
2332                 dist /= (float) minLength;
2333                 return dist;
2334         }
2335         catch(exception& e) {
2336                 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
2337                 exit(1);
2338         }
2339 }
2340
2341 /**************************************************************************************************/
2342
2343 int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2344         try{
2345                 int numUniques = 0;
2346                 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
2347                 uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
2348                 uniqueLengths.assign(numSeqs, 0);
2349                 mapSeqToUnique.assign(numSeqs, -1);
2350                 mapUniqueToSeq.assign(numSeqs, -1);
2351                 
2352                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
2353                 
2354                 for(int i=0;i<numSeqs;i++){
2355                         
2356                         if (m->control_pressed) { break; }
2357                         
2358                         int index = 0;
2359                         
2360                         vector<short> current(numFlowCells);
2361                         for(int j=0;j<numFlowCells;j++){
2362                                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
2363                         }
2364             
2365                         for(int j=0;j<numUniques;j++){
2366                                 int offset = j * numFlowCells;
2367                                 bool toEnd = 1;
2368                                 
2369                                 int shorterLength;
2370                                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
2371                                 else                                                            {       shorterLength = uniqueLengths[j];       }
2372                 
2373                                 for(int k=0;k<shorterLength;k++){
2374                                         if(current[k] != uniqueFlowgrams[offset + k]){
2375                                                 toEnd = 0;
2376                                                 break;
2377                                         }
2378                                 }
2379                                 
2380                                 if(toEnd){
2381                                         mapSeqToUnique[i] = j;
2382                                         uniqueCount[j]++;
2383                                         index = j;
2384                                         if(lengths[i] > uniqueLengths[j])       {       uniqueLengths[j] = lengths[i];  }
2385                                         break;
2386                                 }
2387                                 index++;
2388                         }
2389                         
2390                         if(index == numUniques){
2391                                 uniqueLengths[numUniques] = lengths[i];
2392                                 uniqueCount[numUniques] = 1;
2393                                 mapSeqToUnique[i] = numUniques;//anMap
2394                                 mapUniqueToSeq[numUniques] = i;//anF
2395                                 
2396                                 for(int k=0;k<numFlowCells;k++){
2397                                         uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
2398                                         uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
2399                                 }
2400                                 
2401                                 numUniques++;
2402                         }
2403                 }
2404                 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
2405                 uniqueLengths.resize(numUniques);       
2406                 uniqueFlowgrams.resize(numFlowCells * numUniques);
2407         
2408                 flowDataPrI.resize(numSeqs * numFlowCells, 0);
2409                 for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
2410         
2411         return numUniques;
2412         }
2413         catch(exception& e) {
2414                 m->errorOut(e, "ShhherCommand", "getUniques");
2415                 exit(1);
2416         }
2417 }
2418 /**************************************************************************************************/
2419 int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
2420         try{
2421                 
2422                 vector<string> duplicateNames(numUniques, "");
2423                 for(int i=0;i<numSeqs;i++){
2424                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
2425                 }
2426                 
2427                 ofstream nameFile;
2428                 m->openOutputFile(filename, nameFile);
2429                 
2430                 for(int i=0;i<numUniques;i++){
2431                         
2432                         if (m->control_pressed) { break; }
2433                         
2434             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2435                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2436                 }
2437                 
2438                 nameFile.close();
2439         
2440                 return 0;
2441         }
2442         catch(exception& e) {
2443                 m->errorOut(e, "ShhherCommand", "createNamesFile");
2444                 exit(1);
2445         }
2446 }
2447 //**********************************************************************************************************************
2448
2449 int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
2450         try {
2451                 
2452                 ReadMatrix* read = new ReadColumnMatrix(distFileName);  
2453                 read->setCutoff(cutoff);
2454                 
2455                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
2456                 clusterNameMap->readMap();
2457                 read->read(clusterNameMap);
2458         
2459                 ListVector* list = read->getListVector();
2460                 SparseMatrix* matrix = read->getMatrix();
2461                 
2462                 delete read; 
2463                 delete clusterNameMap; 
2464         
2465                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
2466                 
2467                 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
2468                 string tag = cluster->getTag();
2469                 
2470                 double clusterCutoff = cutoff;
2471                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
2472                         
2473                         if (m->control_pressed) { break; }
2474                         
2475                         cluster->update(clusterCutoff);
2476                 }
2477                 
2478                 list->setLabel(toString(cutoff));
2479                 
2480                 ofstream listFile;
2481                 m->openOutputFile(filename, listFile);
2482                 list->print(listFile);
2483                 listFile.close();
2484                 
2485                 delete matrix;  delete cluster; delete rabund; delete list;
2486         
2487                 return 0;
2488         }
2489         catch(exception& e) {
2490                 m->errorOut(e, "ShhherCommand", "cluster");
2491                 exit(1);        
2492         }               
2493 }
2494 /**************************************************************************************************/
2495
2496 int ShhherCommand::getOTUData(int numSeqs, string fileName,  vector<int>& otuData,
2497                                vector<int>& cumNumSeqs,
2498                                vector<int>& nSeqsPerOTU,
2499                                vector<vector<int> >& aaP,       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2500                                vector<vector<int> >& aaI,       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2501                                vector<int>& seqNumber,          //tMaster->anP:         the sequence id number sorted by OTU
2502                                vector<int>& seqIndex,
2503                                map<string, int>& nameMap){
2504         try {
2505         
2506                 ifstream listFile;
2507                 m->openInputFile(fileName, listFile);
2508                 string label;
2509         int numOTUs;
2510                 
2511                 listFile >> label >> numOTUs;
2512         
2513                 otuData.assign(numSeqs, 0);
2514                 cumNumSeqs.assign(numOTUs, 0);
2515                 nSeqsPerOTU.assign(numOTUs, 0);
2516                 aaP.clear();aaP.resize(numOTUs);
2517                 
2518                 seqNumber.clear();
2519                 aaI.clear();
2520                 seqIndex.clear();
2521                 
2522                 string singleOTU = "";
2523                 
2524                 for(int i=0;i<numOTUs;i++){
2525                         
2526                         if (m->control_pressed) { break; }
2527             
2528                         listFile >> singleOTU;
2529                         
2530                         istringstream otuString(singleOTU);
2531             
2532                         while(otuString){
2533                                 
2534                                 string seqName = "";
2535                                 
2536                                 for(int j=0;j<singleOTU.length();j++){
2537                                         char letter = otuString.get();
2538                                         
2539                                         if(letter != ','){
2540                                                 seqName += letter;
2541                                         }
2542                                         else{
2543                                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2544                                                 int index = nmIt->second;
2545                                                 
2546                                                 nameMap.erase(nmIt);
2547                                                 
2548                                                 otuData[index] = i;
2549                                                 nSeqsPerOTU[i]++;
2550                                                 aaP[i].push_back(index);
2551                                                 seqName = "";
2552                                         }
2553                                 }
2554                                 
2555                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2556                 
2557                                 int index = nmIt->second;
2558                                 nameMap.erase(nmIt);
2559                 
2560                                 otuData[index] = i;
2561                                 nSeqsPerOTU[i]++;
2562                                 aaP[i].push_back(index);        
2563                                 
2564                                 otuString.get();
2565                         }
2566                         
2567                         sort(aaP[i].begin(), aaP[i].end());
2568                         for(int j=0;j<nSeqsPerOTU[i];j++){
2569                                 seqNumber.push_back(aaP[i][j]);
2570                         }
2571                         for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
2572                                 aaP[i].push_back(0);
2573                         }
2574                         
2575                         
2576                 }
2577                 
2578                 for(int i=1;i<numOTUs;i++){
2579                         cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
2580                 }
2581                 aaI = aaP;
2582                 seqIndex = seqNumber;
2583                 
2584                 listFile.close();       
2585         
2586         return numOTUs;
2587                 
2588         }
2589         catch(exception& e) {
2590                 m->errorOut(e, "ShhherCommand", "getOTUData");
2591                 exit(1);        
2592         }               
2593 }
2594 /**************************************************************************************************/
2595
2596 int ShhherCommand::calcCentroidsDriver(int numOTUs, 
2597                                           vector<int>& cumNumSeqs,
2598                                           vector<int>& nSeqsPerOTU,
2599                                           vector<int>& seqIndex,
2600                                           vector<short>& change,                //did the centroid sequence change? 0 = no; 1 = yes
2601                                           vector<int>& centroids,               //the representative flowgram for each cluster m
2602                                           vector<double>& singleTau,    //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2603                                           vector<int>& mapSeqToUnique,
2604                                           vector<short>& uniqueFlowgrams,
2605                                           vector<short>& flowDataIntI,
2606                                           vector<int>& lengths,
2607                                           int numFlowCells,
2608                                           vector<int>& seqNumber){                          
2609         
2610         //this function gets the most likely homopolymer length at a flow position for a group of sequences
2611         //within an otu
2612         
2613         try{
2614                 
2615                 for(int i=0;i<numOTUs;i++){
2616                         
2617                         if (m->control_pressed) { break; }
2618                         
2619                         double count = 0;
2620                         int position = 0;
2621                         int minFlowGram = 100000000;
2622                         double minFlowValue = 1e8;
2623                         change[i] = 0; //FALSE
2624                         
2625                         for(int j=0;j<nSeqsPerOTU[i];j++){
2626                                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
2627                         }
2628             
2629                         if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
2630                                 vector<double> adF(nSeqsPerOTU[i]);
2631                                 vector<int> anL(nSeqsPerOTU[i]);
2632                                 
2633                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2634                                         int index = cumNumSeqs[i] + j;
2635                                         int nI = seqIndex[index];
2636                                         int nIU = mapSeqToUnique[nI];
2637                                         
2638                                         int k;
2639                                         for(k=0;k<position;k++){
2640                                                 if(nIU == anL[k]){
2641                                                         break;
2642                                                 }
2643                                         }
2644                                         if(k == position){
2645                                                 anL[position] = nIU;
2646                                                 adF[position] = 0.0000;
2647                                                 position++;
2648                                         }                                               
2649                                 }
2650                                 
2651                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2652                                         int index = cumNumSeqs[i] + j;
2653                                         int nI = seqIndex[index];
2654                                         
2655                                         double tauValue = singleTau[seqNumber[index]];
2656                                         
2657                                         for(int k=0;k<position;k++){
2658                                                 double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
2659                                                 adF[k] += dist * tauValue;
2660                                         }
2661                                 }
2662                                 
2663                                 for(int j=0;j<position;j++){
2664                                         if(adF[j] < minFlowValue){
2665                                                 minFlowGram = j;
2666                                                 minFlowValue = adF[j];
2667                                         }
2668                                 }
2669                                 
2670                                 if(centroids[i] != anL[minFlowGram]){
2671                                         change[i] = 1;
2672                                         centroids[i] = anL[minFlowGram];
2673                                 }
2674                         }
2675                         else if(centroids[i] != -1){
2676                                 change[i] = 1;
2677                                 centroids[i] = -1;                      
2678                         }
2679                 }
2680         
2681         return 0;
2682         }
2683         catch(exception& e) {
2684                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
2685                 exit(1);        
2686         }               
2687 }
2688 /**************************************************************************************************/
2689
2690 double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
2691                                         vector<short>& flowDataIntI, int numFlowCells){
2692         try{
2693                 
2694                 int flowAValue = cent * numFlowCells;
2695                 int flowBValue = flow * numFlowCells;
2696                 
2697                 double dist = 0;
2698         
2699                 for(int i=0;i<length;i++){
2700                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
2701                         flowAValue++;
2702                         flowBValue++;
2703                 }
2704                 
2705                 return dist / (double)length;
2706         }
2707         catch(exception& e) {
2708                 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
2709                 exit(1);        
2710         }               
2711 }
2712 /**************************************************************************************************/
2713
2714 double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
2715         try{
2716                 
2717                 double maxChange = 0;
2718                 
2719                 for(int i=0;i<numOTUs;i++){
2720                         
2721                         if (m->control_pressed) { break; }
2722                         
2723                         double difference = weight[i];
2724                         weight[i] = 0;
2725                         
2726                         for(int j=0;j<nSeqsPerOTU[i];j++){
2727                                 int index = cumNumSeqs[i] + j;
2728                                 double tauValue = singleTau[seqNumber[index]];
2729                                 weight[i] += tauValue;
2730                         }
2731                         
2732                         difference = fabs(weight[i] - difference);
2733                         if(difference > maxChange){     maxChange = difference; }
2734                 }
2735                 return maxChange;
2736         }
2737         catch(exception& e) {
2738                 m->errorOut(e, "ShhherCommand", "getNewWeights");
2739                 exit(1);        
2740         }               
2741 }
2742
2743 /**************************************************************************************************/
2744
2745 double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
2746         
2747         try{
2748                 
2749                 vector<long double> P(numSeqs, 0);
2750                 int effNumOTUs = 0;
2751                 
2752                 for(int i=0;i<numOTUs;i++){
2753                         if(weight[i] > MIN_WEIGHT){
2754                                 effNumOTUs++;
2755                         }
2756                 }
2757                 
2758                 string hold;
2759                 for(int i=0;i<numOTUs;i++){
2760                         
2761                         if (m->control_pressed) { break; }
2762                         
2763                         for(int j=0;j<nSeqsPerOTU[i];j++){
2764                                 int index = cumNumSeqs[i] + j;
2765                                 int nI = seqIndex[index];
2766                                 double singleDist = dist[seqNumber[index]];
2767                                 
2768                                 P[nI] += weight[i] * exp(-singleDist * sigma);
2769                         }
2770                 }
2771                 double nLL = 0.00;
2772                 for(int i=0;i<numSeqs;i++){
2773                         if(P[i] == 0){  P[i] = DBL_EPSILON;     }
2774             
2775                         nLL += -log(P[i]);
2776                 }
2777                 
2778                 nLL = nLL -(double)numSeqs * log(sigma);
2779         
2780                 return nLL; 
2781         }
2782         catch(exception& e) {
2783                 m->errorOut(e, "ShhherCommand", "getNewWeights");
2784                 exit(1);        
2785         }               
2786 }
2787
2788 /**************************************************************************************************/
2789
2790 int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
2791         try{
2792                 vector<int> unique(numOTUs, 1);
2793                 
2794                 for(int i=0;i<numOTUs;i++){
2795                         if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
2796                                 unique[i] = -1;
2797                         }
2798                 }
2799                 
2800                 for(int i=0;i<numOTUs;i++){
2801                         
2802                         if (m->control_pressed) { break; }
2803                         
2804                         if(unique[i] == 1){
2805                                 for(int j=i+1;j<numOTUs;j++){
2806                                         if(unique[j] == 1){
2807                                                 
2808                                                 if(centroids[j] == centroids[i]){
2809                                                         unique[j] = 0;
2810                                                         centroids[j] = -1;
2811                                                         
2812                                                         weight[i] += weight[j];
2813                                                         weight[j] = 0.0;
2814                                                 }
2815                                         }
2816                                 }
2817                         }
2818                 }
2819         
2820         return 0;
2821         }
2822         catch(exception& e) {
2823                 m->errorOut(e, "ShhherCommand", "checkCentroids");
2824                 exit(1);        
2825         }               
2826 }
2827 /**************************************************************************************************/
2828
2829 void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<double>& dist, 
2830                                      vector<double>& weight, vector<short>& change, vector<int>& centroids,
2831                                      vector<vector<int> >& aaP, vector<double>& singleTau, vector<vector<int> >& aaI,   
2832                                      vector<int>& seqNumber, vector<int>& seqIndex,
2833                                      vector<short>& uniqueFlowgrams,
2834                                      vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
2835         
2836         try{
2837                 
2838                 int total = 0;
2839                 vector<double> newTau(numOTUs,0);
2840                 vector<double> norms(numSeqs, 0);
2841                 nSeqsPerOTU.assign(numOTUs, 0);
2842         
2843                 for(int i=0;i<numSeqs;i++){
2844                         
2845                         if (m->control_pressed) { break; }
2846                         
2847                         int indexOffset = i * numOTUs;
2848             
2849                         double offset = 1e8;
2850                         
2851                         for(int j=0;j<numOTUs;j++){
2852                 
2853                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
2854                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
2855                                 }
2856                 
2857                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
2858                                         offset = dist[indexOffset + j];
2859                                 }
2860                         }
2861             
2862                         for(int j=0;j<numOTUs;j++){
2863                                 if(weight[j] > MIN_WEIGHT){
2864                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
2865                                         norms[i] += newTau[j];
2866                                 }
2867                                 else{
2868                                         newTau[j] = 0.0;
2869                                 }
2870                         }
2871             
2872                         for(int j=0;j<numOTUs;j++){
2873                                 newTau[j] /= norms[i];
2874                         }
2875             
2876                         for(int j=0;j<numOTUs;j++){
2877                                 if(newTau[j] > MIN_TAU){
2878                                         
2879                                         int oldTotal = total;
2880                                         
2881                                         total++;
2882                                         
2883                                         singleTau.resize(total, 0);
2884                                         seqNumber.resize(total, 0);
2885                                         seqIndex.resize(total, 0);
2886                                         
2887                                         singleTau[oldTotal] = newTau[j];
2888                                         
2889                                         aaP[j][nSeqsPerOTU[j]] = oldTotal;
2890                                         aaI[j][nSeqsPerOTU[j]] = i;
2891                                         nSeqsPerOTU[j]++;
2892                                 }
2893                         }
2894             
2895                 }
2896         
2897         }
2898         catch(exception& e) {
2899                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
2900                 exit(1);        
2901         }               
2902 }
2903 /**************************************************************************************************/
2904
2905 int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
2906         try {
2907                 int index = 0;
2908                 for(int i=0;i<numOTUs;i++){
2909                         
2910                         if (m->control_pressed) { return 0; }
2911                         
2912                         cumNumSeqs[i] = index;
2913                         for(int j=0;j<nSeqsPerOTU[i];j++){
2914                                 seqNumber[index] = aaP[i][j];
2915                                 seqIndex[index] = aaI[i][j];
2916                                 
2917                                 index++;
2918                         }
2919                 }
2920         
2921         return 0; 
2922         }
2923         catch(exception& e) {
2924                 m->errorOut(e, "ShhherCommand", "fill");
2925                 exit(1);        
2926         }               
2927 }
2928 /**************************************************************************************************/
2929
2930 void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
2931                             vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
2932         
2933         try {
2934                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
2935                 
2936                 for(int i=0;i<numOTUs;i++){
2937                         
2938                         if (m->control_pressed) { break; }
2939                         
2940                         for(int j=0;j<nSeqsPerOTU[i];j++){
2941                                 int index = cumNumSeqs[i] + j;
2942                                 double tauValue = singleTau[seqNumber[index]];
2943                                 int sIndex = seqIndex[index];
2944                                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
2945                         }
2946                 }
2947                 
2948                 for(int i=0;i<numSeqs;i++){
2949                         double maxTau = -1.0000;
2950                         int maxOTU = -1;
2951                         for(int j=0;j<numOTUs;j++){
2952                                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
2953                                         maxTau = bigTauMatrix[i * numOTUs + j];
2954                                         maxOTU = j;
2955                                 }
2956                         }
2957                         
2958                         otuData[i] = maxOTU;
2959                 }
2960                 
2961                 nSeqsPerOTU.assign(numOTUs, 0);         
2962                 
2963                 for(int i=0;i<numSeqs;i++){
2964                         int index = otuData[i];
2965                         
2966                         singleTau[i] = 1.0000;
2967                         dist[i] = 0.0000;
2968                         
2969                         aaP[index][nSeqsPerOTU[index]] = i;
2970                         aaI[index][nSeqsPerOTU[index]] = i;
2971                         
2972                         nSeqsPerOTU[index]++;
2973                 }
2974         
2975                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);  
2976         }
2977         catch(exception& e) {
2978                 m->errorOut(e, "ShhherCommand", "setOTUs");
2979                 exit(1);        
2980         }               
2981 }
2982 /**************************************************************************************************/
2983
2984 void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string filename, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
2985                                    vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
2986                                    vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
2987         
2988         try {
2989                 string thisOutputDir = outputDir;
2990                 if (outputDir == "") {  thisOutputDir += m->hasPath(filename);  }
2991                 string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.qual";
2992         
2993                 ofstream qualityFile;
2994                 m->openOutputFile(qualityFileName, qualityFile);
2995         
2996                 qualityFile.setf(ios::fixed, ios::floatfield);
2997                 qualityFile.setf(ios::showpoint);
2998                 qualityFile << setprecision(6);
2999                 
3000                 vector<vector<int> > qualities(numOTUs);
3001                 vector<double> pr(HOMOPS, 0);
3002                 
3003                 
3004                 for(int i=0;i<numOTUs;i++){
3005                         
3006                         if (m->control_pressed) { break; }
3007                         
3008                         int index = 0;
3009                         int base = 0;
3010                         
3011                         if(nSeqsPerOTU[i] > 0){
3012                                 qualities[i].assign(1024, -1);
3013                                 
3014                                 while(index < numFlowCells){
3015                                         double maxPrValue = 1e8;
3016                                         short maxPrIndex = -1;
3017                                         double count = 0.0000;
3018                                         
3019                                         pr.assign(HOMOPS, 0);
3020                                         
3021                                         for(int j=0;j<nSeqsPerOTU[i];j++){
3022                                                 int lIndex = cumNumSeqs[i] + j;
3023                                                 double tauValue = singleTau[seqNumber[lIndex]];
3024                                                 int sequenceIndex = aaI[i][j];
3025                                                 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
3026                                                 
3027                                                 count += tauValue;
3028                                                 
3029                                                 for(int s=0;s<HOMOPS;s++){
3030                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
3031                                                 }
3032                                         }
3033                                         
3034                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
3035                                         maxPrValue = pr[maxPrIndex];
3036                                         
3037                                         if(count > MIN_COUNT){
3038                                                 double U = 0.0000;
3039                                                 double norm = 0.0000;
3040                                                 
3041                                                 for(int s=0;s<HOMOPS;s++){
3042                                                         norm += exp(-(pr[s] - maxPrValue));
3043                                                 }
3044                                                 
3045                                                 for(int s=1;s<=maxPrIndex;s++){
3046                                                         int value = 0;
3047                                                         double temp = 0.0000;
3048                                                         
3049                                                         U += exp(-(pr[s-1]-maxPrValue))/norm;
3050                                                         
3051                                                         if(U>0.00){
3052                                                                 temp = log10(U);
3053                                                         }
3054                                                         else{
3055                                                                 temp = -10.1;
3056                                                         }
3057                                                         temp = floor(-10 * temp);
3058                                                         value = (int)floor(temp);
3059                                                         if(value > 100){        value = 100;    }
3060                                                         
3061                                                         qualities[i][base] = (int)value;
3062                                                         base++;
3063                                                 }
3064                                         }
3065                                         
3066                                         index++;
3067                                 }
3068                         }
3069                         
3070                         
3071                         if(otuCounts[i] > 0){
3072                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
3073                         
3074                                 int j=4;        //need to get past the first four bases
3075                                 while(qualities[i][j] != -1){
3076                     qualityFile << qualities[i][j] << ' ';
3077                     if (j > qualities[i].size()) { break; }
3078                     j++;
3079                                 }
3080                                 qualityFile << endl;
3081                         }
3082                 }
3083                 qualityFile.close();
3084                 outputNames.push_back(qualityFileName);
3085         
3086         }
3087         catch(exception& e) {
3088                 m->errorOut(e, "ShhherCommand", "writeQualities");
3089                 exit(1);        
3090         }               
3091 }
3092
3093 /**************************************************************************************************/
3094
3095 void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string filename, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
3096         try {
3097                 string thisOutputDir = outputDir;
3098                 if (outputDir == "") {  thisOutputDir += m->hasPath(filename);  }
3099                 string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.fasta";
3100                 ofstream fastaFile;
3101                 m->openOutputFile(fastaFileName, fastaFile);
3102                 
3103                 vector<string> names(numOTUs, "");
3104                 
3105                 for(int i=0;i<numOTUs;i++){
3106                         
3107                         if (m->control_pressed) { break; }
3108                         
3109                         int index = centroids[i];
3110                         
3111                         if(otuCounts[i] > 0){
3112                                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
3113                                 
3114                                 string newSeq = "";
3115                                 
3116                                 for(int j=0;j<numFlowCells;j++){
3117                                         
3118                                         char base = flowOrder[j % 4];
3119                                         for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
3120                                                 newSeq += base;
3121                                         }
3122                                 }
3123                                 
3124                                 fastaFile << newSeq.substr(4) << endl;
3125                         }
3126                 }
3127                 fastaFile.close();
3128         
3129                 outputNames.push_back(fastaFileName);
3130         
3131                 if(thisCompositeFASTAFileName != ""){
3132                         m->appendFiles(fastaFileName, thisCompositeFASTAFileName);
3133                 }
3134         }
3135         catch(exception& e) {
3136                 m->errorOut(e, "ShhherCommand", "writeSequences");
3137                 exit(1);        
3138         }               
3139 }
3140
3141 /**************************************************************************************************/
3142
3143 void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string filename, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
3144         try {
3145                 string thisOutputDir = outputDir;
3146                 if (outputDir == "") {  thisOutputDir += m->hasPath(filename);  }
3147                 string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.names";
3148                 ofstream nameFile;
3149                 m->openOutputFile(nameFileName, nameFile);
3150                 
3151                 for(int i=0;i<numOTUs;i++){
3152                         
3153                         if (m->control_pressed) { break; }
3154                         
3155                         if(otuCounts[i] > 0){
3156                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
3157                                 
3158                                 for(int j=1;j<nSeqsPerOTU[i];j++){
3159                                         nameFile << ',' << seqNameVector[aaI[i][j]];
3160                                 }
3161                                 
3162                                 nameFile << endl;
3163                         }
3164                 }
3165                 nameFile.close();
3166                 outputNames.push_back(nameFileName);
3167                 
3168                 
3169                 if(thisCompositeNamesFileName != ""){
3170                         m->appendFiles(nameFileName, thisCompositeNamesFileName);
3171                 }               
3172         }
3173         catch(exception& e) {
3174                 m->errorOut(e, "ShhherCommand", "writeNames");
3175                 exit(1);        
3176         }               
3177 }
3178
3179 /**************************************************************************************************/
3180
3181 void ShhherCommand::writeGroups(string filename, int numSeqs, vector<string>& seqNameVector){
3182         try {
3183                 string thisOutputDir = outputDir;
3184                 if (outputDir == "") {  thisOutputDir += m->hasPath(filename);  }
3185                 string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(filename));
3186                 string groupFileName = fileRoot + "shhh.groups";
3187                 ofstream groupFile;
3188                 m->openOutputFile(groupFileName, groupFile);
3189                 
3190                 for(int i=0;i<numSeqs;i++){
3191                         if (m->control_pressed) { break; }
3192                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
3193                 }
3194                 groupFile.close();
3195                 outputNames.push_back(groupFileName);
3196         
3197         }
3198         catch(exception& e) {
3199                 m->errorOut(e, "ShhherCommand", "writeGroups");
3200                 exit(1);        
3201         }               
3202 }
3203
3204 /**************************************************************************************************/
3205
3206 void ShhherCommand::writeClusters(string filename, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
3207         try {
3208                 string thisOutputDir = outputDir;
3209                 if (outputDir == "") {  thisOutputDir += m->hasPath(filename);  }
3210                 string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.counts";
3211                 ofstream otuCountsFile;
3212                 m->openOutputFile(otuCountsFileName, otuCountsFile);
3213                 
3214                 string bases = flowOrder;
3215                 
3216                 for(int i=0;i<numOTUs;i++){
3217                         
3218                         if (m->control_pressed) {
3219                                 break;
3220                         }
3221                         //output the translated version of the centroid sequence for the otu
3222                         if(otuCounts[i] > 0){
3223                                 int index = centroids[i];
3224                                 
3225                                 otuCountsFile << "ideal\t";
3226                                 for(int j=8;j<numFlowCells;j++){
3227                                         char base = bases[j % 4];
3228                                         for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
3229                                                 otuCountsFile << base;
3230                                         }
3231                                 }
3232                                 otuCountsFile << endl;
3233                                 
3234                                 for(int j=0;j<nSeqsPerOTU[i];j++){
3235                                         int sequence = aaI[i][j];
3236                                         otuCountsFile << seqNameVector[sequence] << '\t';
3237                                         
3238                                         string newSeq = "";
3239                                         
3240                                         for(int k=0;k<lengths[sequence];k++){
3241                                                 char base = bases[k % 4];
3242                                                 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
3243                         
3244                                                 for(int s=0;s<freq;s++){
3245                                                         newSeq += base;
3246                                                         //otuCountsFile << base;
3247                                                 }
3248                                         }
3249                                         otuCountsFile << newSeq.substr(4) << endl;
3250                                 }
3251                                 otuCountsFile << endl;
3252                         }
3253                 }
3254                 otuCountsFile.close();
3255                 outputNames.push_back(otuCountsFileName);
3256         
3257         }
3258         catch(exception& e) {
3259                 m->errorOut(e, "ShhherCommand", "writeClusters");
3260                 exit(1);        
3261         }               
3262 }
3263
3264 /**************************************************************************************************/
3265
3266 void ShhherCommand::getSingleLookUp(){
3267         try{
3268                 //      these are the -log probabilities that a signal corresponds to a particular homopolymer length
3269                 singleLookUp.assign(HOMOPS * NUMBINS, 0);
3270                 
3271                 int index = 0;
3272                 ifstream lookUpFile;
3273                 m->openInputFile(lookupFileName, lookUpFile);
3274                 
3275                 for(int i=0;i<HOMOPS;i++){
3276                         
3277                         if (m->control_pressed) { break; }
3278                         
3279                         float logFracFreq;
3280                         lookUpFile >> logFracFreq;
3281                         
3282                         for(int j=0;j<NUMBINS;j++)      {
3283                                 lookUpFile >> singleLookUp[index];
3284                                 index++;
3285                         }
3286                 }       
3287                 lookUpFile.close();
3288         }
3289         catch(exception& e) {
3290                 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
3291                 exit(1);
3292         }
3293 }
3294
3295 /**************************************************************************************************/
3296
3297 void ShhherCommand::getJointLookUp(){
3298         try{
3299                 
3300                 //      the most likely joint probability (-log) that two intenities have the same polymer length
3301                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
3302                 
3303                 for(int i=0;i<NUMBINS;i++){
3304                         
3305                         if (m->control_pressed) { break; }
3306                         
3307                         for(int j=0;j<NUMBINS;j++){             
3308                                 
3309                                 double minSum = 100000000;
3310                                 
3311                                 for(int k=0;k<HOMOPS;k++){
3312                                         double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
3313                                         
3314                                         if(sum < minSum)        {       minSum = sum;           }
3315                                 }       
3316                                 jointLookUp[i * NUMBINS + j] = minSum;
3317                         }
3318                 }
3319         }
3320         catch(exception& e) {
3321                 m->errorOut(e, "ShhherCommand", "getJointLookUp");
3322                 exit(1);
3323         }
3324 }
3325
3326 /**************************************************************************************************/
3327
3328 double ShhherCommand::getProbIntensity(int intIntensity){                          
3329         try{
3330
3331                 double minNegLogProb = 100000000; 
3332
3333                 
3334                 for(int i=0;i<HOMOPS;i++){//loop signal strength
3335                         
3336                         if (m->control_pressed) { break; }
3337                         
3338                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
3339                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
3340                 }
3341                 
3342                 return minNegLogProb;
3343         }
3344         catch(exception& e) {
3345                 m->errorOut(e, "ShhherCommand", "getProbIntensity");
3346                 exit(1);
3347         }
3348 }
3349
3350
3351
3352