]> git.donarmstrong.com Git - mothur.git/blob - shhhercommand.cpp
Merge remote-tracking branch 'mothur/master'
[mothur.git] / shhhercommand.cpp
1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "shhhercommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> ShhherCommand::setParameters(){  
14         try {
15                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pflow);
16                 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pfile);
17                 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plookup);
18                 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff);
19                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
20                 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter);
21                 CommandParameter psigma("sigma", "Number", "", "60", "", "", "",false,false); parameters.push_back(psigma);
22                 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "",false,false); parameters.push_back(pmindelta);
23                 CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
24                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
25                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
26                 
27                 vector<string> myArray;
28                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
29                 return myArray;
30         }
31         catch(exception& e) {
32                 m->errorOut(e, "ShhherCommand", "setParameters");
33                 exit(1);
34         }
35 }
36 //**********************************************************************************************************************
37 string ShhherCommand::getHelpString(){  
38         try {
39                 string helpString = "";
40                 helpString += "The shhh.flows command reads a file containing flowgrams and creates a file of corrected sequences.\n";
41                 return helpString;
42         }
43         catch(exception& e) {
44                 m->errorOut(e, "ShhherCommand", "getHelpString");
45                 exit(1);
46         }
47 }
48 //**********************************************************************************************************************
49
50 ShhherCommand::ShhherCommand(){ 
51         try {
52                 abort = true; calledHelp = true;
53                 setParameters();
54                 
55                 //initialize outputTypes
56 //              vector<string> tempOutNames;
57 //              outputTypes["pn.dist"] = tempOutNames;
58
59         }
60         catch(exception& e) {
61                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
62                 exit(1);
63         }
64 }
65
66 //**********************************************************************************************************************
67
68 ShhherCommand::ShhherCommand(string option) {
69         try {
70         
71 #ifdef USE_MPI
72                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
73                 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
74         
75                 if(pid == 0){
76 #endif
77                 abort = false; calledHelp = false;   
78                 
79                 //allow user to run help
80                 if(option == "help") { help(); abort = true; calledHelp = true; }
81                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
82                 
83                 else {
84                         vector<string> myArray = setParameters();
85                         
86                         OptionParser parser(option);
87                         map<string,string> parameters = parser.getParameters();
88                         
89                         ValidParameters validParameter;
90                         map<string,string>::iterator it;
91                         
92                         //check to make sure all parameters are valid for command
93                         for (it = parameters.begin(); it != parameters.end(); it++) { 
94                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
95                         }
96                         
97                         //initialize outputTypes
98                         vector<string> tempOutNames;
99 //                      outputTypes["pn.dist"] = tempOutNames;
100                         //                      outputTypes["fasta"] = tempOutNames;
101                         
102                         //if the user changes the input directory command factory will send this info to us in the output parameter 
103                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
104                         if (inputDir == "not found"){   inputDir = "";          }
105                         else {
106                                 string path;
107                                 it = parameters.find("flow");
108                                 //user has given a template file
109                                 if(it != parameters.end()){ 
110                                         path = m->hasPath(it->second);
111                                         //if the user has not given a path then, add inputdir. else leave path alone.
112                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
113                                 }
114                                 
115                                 it = parameters.find("lookup");
116                                 //user has given a template file
117                                 if(it != parameters.end()){ 
118                                         path = m->hasPath(it->second);
119                                         //if the user has not given a path then, add inputdir. else leave path alone.
120                                         if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
121                                 }
122
123                                 it = parameters.find("file");
124                                 //user has given a template file
125                                 if(it != parameters.end()){ 
126                                         path = m->hasPath(it->second);
127                                         //if the user has not given a path then, add inputdir. else leave path alone.
128                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
129                                 }
130                         }
131             
132             //if the user changes the output directory command factory will send this info to us in the output parameter 
133                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
134             
135                         //check for required parameters
136                         flowFileName = validParameter.validFile(parameters, "flow", true);
137                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
138                         if (flowFileName == "not found" && flowFilesFileName == "not found") {
139                                 m->mothurOut("values for either flow or file must be provided for the shhh.seqs command.");
140                                 m->mothurOutEndLine();
141                                 abort = true; 
142                         }
143                         else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
144                         
145                         if(flowFileName != "not found"){
146                                 compositeFASTAFileName = "";    
147                                 compositeNamesFileName = "";    
148                         }
149                         else{
150                                 ofstream temp;
151                 
152                 string thisoutputDir = m->hasPath(flowFilesFileName); //if user entered a file with a path then preserve it
153                 
154                                 //flow.files = 9 character offset
155                                 compositeFASTAFileName = outputDir + flowFilesFileName.substr(0, flowFilesFileName.length()-10) + "shhh.fasta";
156                                 m->openOutputFile(compositeFASTAFileName, temp);
157                                 temp.close();
158                                 
159                                 compositeNamesFileName = outputDir + flowFilesFileName.substr(0, flowFilesFileName.length()-10) + "shhh.names";
160                                 m->openOutputFile(compositeNamesFileName, temp);
161                                 temp.close();
162                         }
163             
164             if(flowFilesFileName != "not found"){
165                 string fName;
166                 
167                 ifstream flowFilesFile;
168                 m->openInputFile(flowFilesFileName, flowFilesFile);
169                 while(flowFilesFile){
170                     fName = m->getline(flowFilesFile);
171                     
172                     //test if file is valid
173                     ifstream in;
174                     int ableToOpen = m->openInputFile(fName, in, "noerror");
175                     in.close(); 
176                     if (ableToOpen == 1) {
177                         if (inputDir != "") { //default path is set
178                             string tryPath = inputDir + fName;
179                             m->mothurOut("Unable to open " + fName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
180                             ifstream in2;
181                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
182                             in2.close();
183                             fName = tryPath;
184                         }
185                     }
186                     
187                     if (ableToOpen == 1) {
188                         if (m->getDefaultPath() != "") { //default path is set
189                             string tryPath = m->getDefaultPath() + m->getSimpleName(fName);
190                             m->mothurOut("Unable to open " + fName + ". Trying default " + tryPath); m->mothurOutEndLine();
191                             ifstream in2;
192                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
193                             in2.close();
194                             fName = tryPath;
195                         }
196                     }
197                     
198                     //if you can't open it its not in current working directory or inputDir, try mothur excutable location
199                     if (ableToOpen == 1) {
200                         string exepath = m->argv;
201                         string tempPath = exepath;
202                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
203                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
204                         
205                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(fName);
206                         m->mothurOut("Unable to open " + fName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
207                         ifstream in2;
208                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
209                         in2.close();
210                         fName = tryPath;
211                     }
212                     
213                     if (ableToOpen == 1) {  m->mothurOut("Unable to open " + fName + ". Disregarding. "); m->mothurOutEndLine();  }
214                     else { flowFileVector.push_back(fName); }
215                     m->gobble(flowFilesFile);
216                 }
217                 flowFilesFile.close();
218                 if (flowFileVector.size() == 0) {  m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
219             }
220             else{
221                 outputDir += m->hasPath(flowFileName);
222                 flowFileVector.push_back(flowFileName);
223             }
224                 
225                         //check for optional parameter and set defaults
226                         // ...at some point should added some additional type checking...
227                         string temp;
228                         temp = validParameter.validFile(parameters, "lookup", true);
229                         if (temp == "not found")        {       
230                                 lookupFileName = "LookUp_Titanium.pat"; 
231                                 
232                                 int ableToOpen;
233                                 ifstream in;
234                                 ableToOpen = m->openInputFile(lookupFileName, in, "noerror");
235                                 in.close();     
236                                 
237                                 //if you can't open it, try input location
238                                 if (ableToOpen == 1) {
239                                         if (inputDir != "") { //default path is set
240                                                 string tryPath = inputDir + lookupFileName;
241                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
242                                                 ifstream in2;
243                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
244                                                 in2.close();
245                                                 lookupFileName = tryPath;
246                                         }
247                                 }
248                                 
249                                 //if you can't open it, try default location
250                                 if (ableToOpen == 1) {
251                                         if (m->getDefaultPath() != "") { //default path is set
252                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName);
253                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine();
254                                                 ifstream in2;
255                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
256                                                 in2.close();
257                                                 lookupFileName = tryPath;
258                                         }
259                                 }
260                                 
261                                 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
262                                 if (ableToOpen == 1) {
263                                         string exepath = m->argv;
264                                         string tempPath = exepath;
265                                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
266                                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
267                                         
268                                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
269                                         m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
270                                         ifstream in2;
271                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
272                                         in2.close();
273                                         lookupFileName = tryPath;
274                                 }
275                                 
276                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
277                         }
278                         else if(temp == "not open")     {       
279                                 
280                                 lookupFileName = validParameter.validFile(parameters, "lookup", false);
281                                 
282                                 //if you can't open it its not inputDir, try mothur excutable location
283                                 string exepath = m->argv;
284                                 string tempPath = exepath;
285                                 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
286                                 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
287                                         
288                                 string tryPath = m->getFullPathName(exepath) + lookupFileName;
289                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
290                                 ifstream in2;
291                                 int ableToOpen = m->openInputFile(tryPath, in2, "noerror");
292                                 in2.close();
293                                 lookupFileName = tryPath;
294                                 
295                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
296                         }else                                           {       lookupFileName = temp;  }
297                         
298                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
299                         m->setProcessors(temp);
300                         m->mothurConvert(temp, processors);
301
302                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
303                         m->mothurConvert(temp, cutoff); 
304                         
305                         temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
306                         m->mothurConvert(temp, minDelta); 
307
308                         temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
309                         m->mothurConvert(temp, maxIters); 
310
311                         temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
312                         m->mothurConvert(temp, sigma); 
313                         
314                         flowOrder = validParameter.validFile(parameters, "order", false);
315                         if (flowOrder == "not found"){ flowOrder = "TACG";              }
316                         else if(flowOrder.length() != 4){
317                                 m->mothurOut("The value of the order option must be four bases long\n");
318                         }
319                         
320                 }
321 #ifdef USE_MPI
322                 }                               
323 #endif
324         }
325         catch(exception& e) {
326                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
327                 exit(1);
328         }
329 }
330 //**********************************************************************************************************************
331 #ifdef USE_MPI
332 int ShhherCommand::execute(){
333         try {
334                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
335                 
336                 int tag = 1976;
337                 MPI_Status status; 
338                         
339                 if(pid == 0){
340
341                         for(int i=1;i<ncpus;i++){
342                                 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
343                         }
344                         if(abort == 1){ return 0;       }
345
346                         processors = ncpus;
347                         
348                         m->mothurOut("\nGetting preliminary data...\n");
349                         getSingleLookUp();      if (m->control_pressed) { return 0; }
350                         getJointLookUp();       if (m->control_pressed) { return 0; }
351                         
352             vector<string> flowFileVector;
353                         if(flowFilesFileName != "not found"){
354                                 string fName;
355                 
356                                 ifstream flowFilesFile;
357                                 m->openInputFile(flowFilesFileName, flowFilesFile);
358                                 while(flowFilesFile){
359                                         fName = m->getline(flowFilesFile);
360                                         flowFileVector.push_back(fName);
361                                         m->gobble(flowFilesFile);
362                                 }
363                         }
364                         else{
365                                 flowFileVector.push_back(flowFileName);
366                         }
367             
368                         int numFiles = flowFileVector.size();
369
370                         for(int i=1;i<ncpus;i++){
371                                 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
372                         }
373                         
374                         for(int i=0;i<numFiles;i++){
375                                 
376                                 if (m->control_pressed) { break; }
377                                 
378                                 double begClock = clock();
379                                 unsigned long long begTime = time(NULL);
380
381                                 flowFileName = flowFileVector[i];
382                                 
383                                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
384                                 m->mothurOut("Reading flowgrams...\n");
385                                 getFlowData();
386                                 
387                                 if (m->control_pressed) { break; }
388
389                                 m->mothurOut("Identifying unique flowgrams...\n");
390                                 getUniques();
391                                 
392                                 if (m->control_pressed) { break; }
393
394                                 m->mothurOut("Calculating distances between flowgrams...\n");
395                                 char fileName[1024];
396                                 strcpy(fileName, flowFileName.c_str());
397
398                                 for(int i=1;i<ncpus;i++){
399                                         MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
400
401                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
402                                         MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
403                                         MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
404                                         MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
405                                         MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
406                                         MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
407                                         MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
408                                         MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
409                                         MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
410                                         MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
411                                 }                               
412                                                         
413                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
414                                 
415                                 if (m->control_pressed) { break; }
416                                 
417                                 int done;
418                                 for(int i=1;i<ncpus;i++){
419                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
420                                         
421                                         m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
422                                         m->mothurRemove((distFileName + ".temp." + toString(i)));
423                                 }
424
425                                 string namesFileName = createNamesFile();
426                                 
427                                 if (m->control_pressed) { break; }
428                                 
429                                 m->mothurOut("\nClustering flowgrams...\n");
430                                 string listFileName = cluster(distFileName, namesFileName);
431                                 
432                                 if (m->control_pressed) { break; }
433                                 
434                 
435                 
436                                 getOTUData(listFileName);
437
438                                 m->mothurRemove(distFileName);
439                                 m->mothurRemove(namesFileName);
440                                 m->mothurRemove(listFileName);
441                                 
442                                 if (m->control_pressed) { break; }
443                                 
444                                 initPyroCluster();
445
446                                 if (m->control_pressed) { break; }
447                                 
448             
449                                 for(int i=1;i<ncpus;i++){
450                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
451                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
452                                         MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
453                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
454                                 }
455                                 
456                                 if (m->control_pressed) { break; }
457                                 
458                                 double maxDelta = 0;
459                                 int iter = 0;
460                                 
461                                 int numOTUsOnCPU = numOTUs / ncpus;
462                                 int numSeqsOnCPU = numSeqs / ncpus;
463                                 m->mothurOut("\nDenoising flowgrams...\n");
464                                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
465                                 
466                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
467                                         
468                                         double cycClock = clock();
469                                         unsigned long long cycTime = time(NULL);
470                                         fill();
471                                         
472                                         if (m->control_pressed) { break; }
473
474                                         int total = singleTau.size();
475                                         for(int i=1;i<ncpus;i++){
476                                                 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
477                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
478                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
479                                                 
480                                                 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
481                                                 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
482                                                 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
483                                                 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
484                                                 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
485                                         }
486                                         
487                                         calcCentroidsDriver(0, numOTUsOnCPU);
488                                         
489                                         for(int i=1;i<ncpus;i++){
490                                                 int otuStart = i * numOTUs / ncpus;
491                                                 int otuStop = (i + 1) * numOTUs / ncpus;
492                                                 
493                                                 vector<int> tempCentroids(numOTUs, 0);
494                                                 vector<short> tempChange(numOTUs, 0);
495                                                 
496                                                 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
497                                                 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
498                                                 
499                                                 for(int j=otuStart;j<otuStop;j++){
500                                                         centroids[j] = tempCentroids[j];
501                                                         change[j] = tempChange[j];
502                                                 }
503                                         }
504                                                                         
505                                         maxDelta = getNewWeights(); if (m->control_pressed) { break; }
506                                         double nLL = getLikelihood(); if (m->control_pressed) { break; }
507                                         checkCentroids(); if (m->control_pressed) { break; }
508                                         
509                                         for(int i=1;i<ncpus;i++){
510                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
511                                                 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
512                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
513                                         }
514                                         
515                                         calcNewDistancesParent(0, numSeqsOnCPU);
516
517                                         total = singleTau.size();
518
519                                         for(int i=1;i<ncpus;i++){
520                                                 int childTotal;
521                                                 int seqStart = i * numSeqs / ncpus;
522                                                 int seqStop = (i + 1) * numSeqs / ncpus;
523                                                 
524                                                 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
525
526                                                 vector<int> childSeqIndex(childTotal, 0);
527                                                 vector<double> childSingleTau(childTotal, 0);
528                                                 vector<double> childDist(numSeqs * numOTUs, 0);
529                                                 vector<int> otuIndex(childTotal, 0);
530                                                 
531                                                 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
532                                                 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
533                                                 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
534                                                 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
535
536                                                 int oldTotal = total;
537                                                 total += childTotal;
538                                                 singleTau.resize(total, 0);
539                                                 seqIndex.resize(total, 0);
540                                                 seqNumber.resize(total, 0);
541                                                 
542                                                 int childIndex = 0;
543                                                 
544                                                 for(int j=oldTotal;j<total;j++){
545                                                         int otuI = otuIndex[childIndex];
546                                                         int seqI = childSeqIndex[childIndex];
547                                                         
548                                                         singleTau[j] = childSingleTau[childIndex];
549                                                         
550                                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
551                                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
552                                                         nSeqsPerOTU[otuI]++;
553                                                         childIndex++;
554                                                 }
555                                                 
556                                                 int index = seqStart * numOTUs;
557                                                 for(int j=seqStart;j<seqStop;j++){
558                                                         for(int k=0;k<numOTUs;k++){
559                                                                 dist[index] = childDist[index];
560                                                                 index++;
561                                                         }
562                                                 }                                       
563                                         }
564                                         
565                                         iter++;
566                                         
567                                         m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');                  
568
569                                         if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
570                                                 int live = 1;
571                                                 for(int i=1;i<ncpus;i++){
572                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
573                                                 }
574                                         }
575                                         else{
576                                                 int live = 0;
577                                                 for(int i=1;i<ncpus;i++){
578                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
579                                                 }
580                                         }
581                                         
582                                 }       
583                                 
584                                 if (m->control_pressed) { break; }
585                                 
586                                 m->mothurOut("\nFinalizing...\n");
587                                 fill();
588                                 
589                                 if (m->control_pressed) { break; }
590                                 
591                                 setOTUs();
592                                 
593                                 vector<int> otuCounts(numOTUs, 0);
594                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
595                                 calcCentroidsDriver(0, numOTUs);
596                                 
597                                 if (m->control_pressed) { break; }
598                                 
599                                 writeQualities(otuCounts);      if (m->control_pressed) { break; }
600                                 writeSequences(otuCounts);      if (m->control_pressed) { break; }
601                                 writeNames(otuCounts);          if (m->control_pressed) { break; }
602                                 writeClusters(otuCounts);       if (m->control_pressed) { break; }
603                                 writeGroups();                          if (m->control_pressed) { break; }
604                                 
605                                                                  
606                                 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');                 
607                         }
608                 }
609                 else{
610                         int abort = 1;
611
612                         MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
613                         if(abort){      return 0;       }
614
615                         int numFiles;
616                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
617
618                         for(int i=0;i<numFiles;i++){
619                                 
620                                 if (m->control_pressed) { break; }
621                                 
622                                 //Now into the pyrodist part
623                                 bool live = 1;
624
625                                 char fileName[1024];
626                                 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
627                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
628                                 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
629                                 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
630                                 
631                                 flowDataIntI.resize(numSeqs * numFlowCells);
632                                 flowDataPrI.resize(numSeqs * numFlowCells);
633                                 mapUniqueToSeq.resize(numSeqs);
634                                 mapSeqToUnique.resize(numSeqs);
635                                 lengths.resize(numSeqs);
636                                 jointLookUp.resize(NUMBINS * NUMBINS);
637
638                                 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
639                                 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
640                                 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
641                                 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
642                                 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
643                                 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
644                                 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
645
646                                 flowFileName = string(fileName);
647                                 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
648                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
649                                 
650                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
651                                 
652                                 if (m->control_pressed) { break; }
653                                 
654                                 int done = 1;
655                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
656
657                                 //Now into the pyronoise part
658                                 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
659                                 
660                                 singleLookUp.resize(HOMOPS * NUMBINS);
661                                 uniqueFlowgrams.resize(numUniques * numFlowCells);
662                                 weight.resize(numOTUs);
663                                 centroids.resize(numOTUs);
664                                 change.resize(numOTUs);
665                                 dist.assign(numOTUs * numSeqs, 0);
666                                 nSeqsPerOTU.resize(numOTUs);
667                                 cumNumSeqs.resize(numOTUs);
668
669                                 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
670                                 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
671                                 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
672                                 
673                                 int startOTU = pid * numOTUs / ncpus;
674                                 int endOTU = (pid + 1) * numOTUs / ncpus;
675
676                                 int startSeq = pid * numSeqs / ncpus;
677                                 int endSeq = (pid + 1) * numSeqs /ncpus;
678                                 
679                                 int total;
680
681                                 while(live){
682                                         
683                                         if (m->control_pressed) { break; }
684                                         
685                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
686                                         singleTau.assign(total, 0.0000);
687                                         seqNumber.assign(total, 0);
688                                         seqIndex.assign(total, 0);
689
690                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
691                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
692                                         MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
693                                         MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
694                                         MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
695                                         MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
696                                         MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
697
698                                         calcCentroidsDriver(startOTU, endOTU);
699
700                                         MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
701                                         MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
702
703                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
704                                         MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
705                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
706
707                                         vector<int> otuIndex(total, 0);
708                                         calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
709                                         total = otuIndex.size();
710                                         
711                                         MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
712                                         MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
713                                         MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
714                                         MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
715                                         MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
716                                 
717                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
718                                 }
719                         }
720                 }
721                 
722                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
723                 
724                 MPI_Barrier(MPI_COMM_WORLD);
725
726                 
727                 if(compositeFASTAFileName != ""){
728                         outputNames.push_back(compositeFASTAFileName);
729                         outputNames.push_back(compositeNamesFileName);
730                 }
731
732                 m->mothurOutEndLine();
733                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
734                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
735                 m->mothurOutEndLine();
736                 
737                 return 0;
738
739         }
740         catch(exception& e) {
741                 m->errorOut(e, "ShhherCommand", "execute");
742                 exit(1);
743         }
744 }
745 /**************************************************************************************************/
746 string ShhherCommand::createNamesFile(){
747         try{
748                 
749                 vector<string> duplicateNames(numUniques, "");
750                 for(int i=0;i<numSeqs;i++){
751                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
752                 }
753                 
754                 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
755                 
756                 ofstream nameFile;
757                 m->openOutputFile(nameFileName, nameFile);
758                 
759                 for(int i=0;i<numUniques;i++){
760                         
761                         if (m->control_pressed) { break; }
762                         
763             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
764                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
765                 }
766                 
767                 nameFile.close();
768                 return  nameFileName;
769         }
770         catch(exception& e) {
771                 m->errorOut(e, "ShhherCommand", "createNamesFile");
772                 exit(1);
773         }
774 }
775 /**************************************************************************************************/
776
777 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
778         try{            
779                 ostringstream outStream;
780                 outStream.setf(ios::fixed, ios::floatfield);
781                 outStream.setf(ios::dec, ios::basefield);
782                 outStream.setf(ios::showpoint);
783                 outStream.precision(6);
784                 
785                 int begTime = time(NULL);
786                 double begClock = clock();
787                 
788                 for(int i=startSeq;i<stopSeq;i++){
789                         
790                         if (m->control_pressed) { break; }
791                         
792                         for(int j=0;j<i;j++){
793                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
794                                 
795                                 if(flowDistance < 1e-6){
796                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
797                                 }
798                                 else if(flowDistance <= cutoff){
799                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
800                                 }
801                         }
802                         if(i % 100 == 0){
803                                 m->mothurOut(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
804                         }
805                 }
806                 
807                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
808                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
809                 
810                 if (m->control_pressed) { return fDistFileName; }
811                 
812                 m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
813
814                 ofstream distFile(fDistFileName.c_str());
815                 distFile << outStream.str();            
816                 distFile.close();
817                 
818                 return fDistFileName;
819         }
820         catch(exception& e) {
821                 m->errorOut(e, "ShhherCommand", "flowDistMPI");
822                 exit(1);
823         }
824 }
825 /**************************************************************************************************/
826  
827 void ShhherCommand::getOTUData(string listFileName){
828     try {
829         
830         ifstream listFile;
831         m->openInputFile(listFileName, listFile);
832         string label;
833         
834         listFile >> label >> numOTUs;
835         
836         otuData.assign(numSeqs, 0);
837         cumNumSeqs.assign(numOTUs, 0);
838         nSeqsPerOTU.assign(numOTUs, 0);
839         aaP.clear();aaP.resize(numOTUs);
840         
841         seqNumber.clear();
842         aaI.clear();
843         seqIndex.clear();
844         
845         string singleOTU = "";
846         
847         for(int i=0;i<numOTUs;i++){
848             
849             if (m->control_pressed) { break; }
850             
851             listFile >> singleOTU;
852             
853             istringstream otuString(singleOTU);
854             
855             while(otuString){
856                 
857                 string seqName = "";
858                 
859                 for(int j=0;j<singleOTU.length();j++){
860                     char letter = otuString.get();
861                     
862                     if(letter != ','){
863                         seqName += letter;
864                     }
865                     else{
866                         map<string,int>::iterator nmIt = nameMap.find(seqName);
867                         int index = nmIt->second;
868                         
869                         nameMap.erase(nmIt);
870                         
871                         otuData[index] = i;
872                         nSeqsPerOTU[i]++;
873                         aaP[i].push_back(index);
874                         seqName = "";
875                     }
876                 }
877                 
878                 map<string,int>::iterator nmIt = nameMap.find(seqName);
879                 
880                 int index = nmIt->second;
881                 nameMap.erase(nmIt);
882                 
883                 otuData[index] = i;
884                 nSeqsPerOTU[i]++;
885                 aaP[i].push_back(index);        
886                 
887                 otuString.get();
888             }
889             
890             sort(aaP[i].begin(), aaP[i].end());
891             for(int j=0;j<nSeqsPerOTU[i];j++){
892                 seqNumber.push_back(aaP[i][j]);
893             }
894             for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
895                 aaP[i].push_back(0);
896             }
897             
898             
899         }
900         
901         for(int i=1;i<numOTUs;i++){
902             cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
903         }
904         aaI = aaP;
905         seqIndex = seqNumber;
906         
907         listFile.close();       
908         
909     }
910     catch(exception& e) {
911         m->errorOut(e, "ShhherCommand", "getOTUData");
912         exit(1);        
913     }           
914 }
915
916 /**************************************************************************************************/
917
918 void ShhherCommand::initPyroCluster(){                          
919     try{
920         if (numOTUs < processors) { processors = 1; }
921         
922         dist.assign(numSeqs * numOTUs, 0);
923         change.assign(numOTUs, 1);
924         centroids.assign(numOTUs, -1);
925         weight.assign(numOTUs, 0);
926         singleTau.assign(numSeqs, 1.0);
927         
928         nSeqsBreaks.assign(processors+1, 0);
929         nOTUsBreaks.assign(processors+1, 0);
930         
931         nSeqsBreaks[0] = 0;
932         for(int i=0;i<processors;i++){
933             nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
934             nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
935         }
936         nSeqsBreaks[processors] = numSeqs;
937         nOTUsBreaks[processors] = numOTUs;
938     }
939     catch(exception& e) {
940         m->errorOut(e, "ShhherCommand", "initPyroCluster");
941         exit(1);        
942     }
943 }
944
945 /**************************************************************************************************/
946
947 void ShhherCommand::fill(){
948     try {
949         int index = 0;
950         for(int i=0;i<numOTUs;i++){
951             
952             if (m->control_pressed) { break; }
953             
954             cumNumSeqs[i] = index;
955             for(int j=0;j<nSeqsPerOTU[i];j++){
956                 seqNumber[index] = aaP[i][j];
957                 seqIndex[index] = aaI[i][j];
958                 
959                 index++;
960             }
961         }
962     }
963     catch(exception& e) {
964         m->errorOut(e, "ShhherCommand", "fill");
965         exit(1);        
966     }           
967 }
968
969 /**************************************************************************************************/
970  
971 void ShhherCommand::getFlowData(){
972     try{
973         ifstream flowFile;
974         m->openInputFile(flowFileName, flowFile);
975         
976         string seqName;
977         seqNameVector.clear();
978         lengths.clear();
979         flowDataIntI.clear();
980         nameMap.clear();
981         
982         
983         int currentNumFlowCells;
984         
985         float intensity;
986         
987         flowFile >> numFlowCells;
988         int index = 0;//pcluster
989         while(!flowFile.eof()){
990             
991             if (m->control_pressed) { break; }
992             
993             flowFile >> seqName >> currentNumFlowCells;
994             lengths.push_back(currentNumFlowCells);
995             
996             seqNameVector.push_back(seqName);
997             nameMap[seqName] = index++;//pcluster
998             
999             for(int i=0;i<numFlowCells;i++){
1000                 flowFile >> intensity;
1001                 if(intensity > 9.99)    {       intensity = 9.99;       }
1002                 int intI = int(100 * intensity + 0.0001);
1003                 flowDataIntI.push_back(intI);
1004             }
1005             m->gobble(flowFile);
1006         }
1007         flowFile.close();
1008         
1009         numSeqs = seqNameVector.size();         
1010         
1011         for(int i=0;i<numSeqs;i++){
1012             
1013             if (m->control_pressed) { break; }
1014             
1015             int iNumFlowCells = i * numFlowCells;
1016             for(int j=lengths[i];j<numFlowCells;j++){
1017                 flowDataIntI[iNumFlowCells + j] = 0;
1018             }
1019         }
1020         
1021     }
1022     catch(exception& e) {
1023         m->errorOut(e, "ShhherCommand", "getFlowData");
1024         exit(1);
1025     }
1026 }
1027 /**************************************************************************************************/
1028 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1029         
1030         try{
1031                 vector<double> newTau(numOTUs,0);
1032                 vector<double> norms(numSeqs, 0);
1033                 otuIndex.clear();
1034                 seqIndex.clear();
1035                 singleTau.clear();
1036                 
1037                 for(int i=startSeq;i<stopSeq;i++){
1038                         
1039                         if (m->control_pressed) { break; }
1040                         
1041                         double offset = 1e8;
1042                         int indexOffset = i * numOTUs;
1043                         
1044                         for(int j=0;j<numOTUs;j++){
1045                                 
1046                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1047                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1048                                 }
1049                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1050                                         offset = dist[indexOffset + j];
1051                                 }
1052                         }
1053                         
1054                         for(int j=0;j<numOTUs;j++){
1055                                 if(weight[j] > MIN_WEIGHT){
1056                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1057                                         norms[i] += newTau[j];
1058                                 }
1059                                 else{
1060                                         newTau[j] = 0.0;
1061                                 }
1062                         }
1063                         
1064                         for(int j=0;j<numOTUs;j++){
1065                 
1066                                 newTau[j] /= norms[i];
1067                                 
1068                                 if(newTau[j] > MIN_TAU){
1069                                         otuIndex.push_back(j);
1070                                         seqIndex.push_back(i);
1071                                         singleTau.push_back(newTau[j]);
1072                                 }
1073                         }
1074                         
1075                 }
1076         }
1077         catch(exception& e) {
1078                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1079                 exit(1);        
1080         }               
1081 }
1082
1083 /**************************************************************************************************/
1084
1085 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1086     
1087     try{
1088         
1089         int total = 0;
1090         vector<double> newTau(numOTUs,0);
1091         vector<double> norms(numSeqs, 0);
1092         nSeqsPerOTU.assign(numOTUs, 0);
1093         
1094         for(int i=startSeq;i<stopSeq;i++){
1095             
1096             if (m->control_pressed) { break; }
1097             
1098             int indexOffset = i * numOTUs;
1099             
1100             double offset = 1e8;
1101             
1102             for(int j=0;j<numOTUs;j++){
1103                 
1104                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1105                     dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1106                 }
1107                 
1108                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1109                     offset = dist[indexOffset + j];
1110                 }
1111             }
1112             
1113             for(int j=0;j<numOTUs;j++){
1114                 if(weight[j] > MIN_WEIGHT){
1115                     newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1116                     norms[i] += newTau[j];
1117                 }
1118                 else{
1119                     newTau[j] = 0.0;
1120                 }
1121             }
1122             
1123             for(int j=0;j<numOTUs;j++){
1124                 newTau[j] /= norms[i];
1125             }
1126             
1127             for(int j=0;j<numOTUs;j++){
1128                 if(newTau[j] > MIN_TAU){
1129                     
1130                     int oldTotal = total;
1131                     
1132                     total++;
1133                     
1134                     singleTau.resize(total, 0);
1135                     seqNumber.resize(total, 0);
1136                     seqIndex.resize(total, 0);
1137                     
1138                     singleTau[oldTotal] = newTau[j];
1139                     
1140                     aaP[j][nSeqsPerOTU[j]] = oldTotal;
1141                     aaI[j][nSeqsPerOTU[j]] = i;
1142                     nSeqsPerOTU[j]++;
1143                 }
1144             }
1145             
1146         }
1147         
1148     }
1149     catch(exception& e) {
1150         m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1151         exit(1);        
1152     }           
1153 }
1154
1155 /**************************************************************************************************/
1156
1157 void ShhherCommand::setOTUs(){
1158     
1159     try {
1160         vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1161         
1162         for(int i=0;i<numOTUs;i++){
1163             
1164             if (m->control_pressed) { break; }
1165             
1166             for(int j=0;j<nSeqsPerOTU[i];j++){
1167                 int index = cumNumSeqs[i] + j;
1168                 double tauValue = singleTau[seqNumber[index]];
1169                 int sIndex = seqIndex[index];
1170                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
1171             }
1172         }
1173         
1174         for(int i=0;i<numSeqs;i++){
1175             double maxTau = -1.0000;
1176             int maxOTU = -1;
1177             for(int j=0;j<numOTUs;j++){
1178                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1179                     maxTau = bigTauMatrix[i * numOTUs + j];
1180                     maxOTU = j;
1181                 }
1182             }
1183             
1184             otuData[i] = maxOTU;
1185         }
1186         
1187         nSeqsPerOTU.assign(numOTUs, 0);         
1188         
1189         for(int i=0;i<numSeqs;i++){
1190             int index = otuData[i];
1191             
1192             singleTau[i] = 1.0000;
1193             dist[i] = 0.0000;
1194             
1195             aaP[index][nSeqsPerOTU[index]] = i;
1196             aaI[index][nSeqsPerOTU[index]] = i;
1197             
1198             nSeqsPerOTU[index]++;
1199         }
1200         fill(); 
1201     }
1202     catch(exception& e) {
1203         m->errorOut(e, "ShhherCommand", "setOTUs");
1204         exit(1);        
1205     }           
1206 }
1207
1208 /**************************************************************************************************/
1209  
1210 void ShhherCommand::getUniques(){
1211     try{
1212         
1213         
1214         numUniques = 0;
1215         uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
1216         uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
1217         uniqueLengths.assign(numSeqs, 0);
1218         mapSeqToUnique.assign(numSeqs, -1);
1219         mapUniqueToSeq.assign(numSeqs, -1);
1220         
1221         vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
1222         
1223         for(int i=0;i<numSeqs;i++){
1224             
1225             if (m->control_pressed) { break; }
1226             
1227             int index = 0;
1228             
1229             vector<short> current(numFlowCells);
1230             for(int j=0;j<numFlowCells;j++){
1231                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
1232             }
1233             
1234             for(int j=0;j<numUniques;j++){
1235                 int offset = j * numFlowCells;
1236                 bool toEnd = 1;
1237                 
1238                 int shorterLength;
1239                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
1240                 else                                                            {       shorterLength = uniqueLengths[j];       }
1241                 
1242                 for(int k=0;k<shorterLength;k++){
1243                     if(current[k] != uniqueFlowgrams[offset + k]){
1244                         toEnd = 0;
1245                         break;
1246                     }
1247                 }
1248                 
1249                 if(toEnd){
1250                     mapSeqToUnique[i] = j;
1251                     uniqueCount[j]++;
1252                     index = j;
1253                     if(lengths[i] > uniqueLengths[j])   {       uniqueLengths[j] = lengths[i];  }
1254                     break;
1255                 }
1256                 index++;
1257             }
1258             
1259             if(index == numUniques){
1260                 uniqueLengths[numUniques] = lengths[i];
1261                 uniqueCount[numUniques] = 1;
1262                 mapSeqToUnique[i] = numUniques;//anMap
1263                 mapUniqueToSeq[numUniques] = i;//anF
1264                 
1265                 for(int k=0;k<numFlowCells;k++){
1266                     uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
1267                     uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
1268                 }
1269                 
1270                 numUniques++;
1271             }
1272         }
1273         uniqueFlowDataIntI.resize(numFlowCells * numUniques);
1274         uniqueLengths.resize(numUniques);       
1275         
1276         flowDataPrI.resize(numSeqs * numFlowCells, 0);
1277         for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
1278     }
1279     catch(exception& e) {
1280         m->errorOut(e, "ShhherCommand", "getUniques");
1281         exit(1);
1282     }
1283 }
1284
1285 /**************************************************************************************************/
1286
1287 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
1288     try{
1289         int minLength = lengths[mapSeqToUnique[seqA]];
1290         if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
1291         
1292         int ANumFlowCells = seqA * numFlowCells;
1293         int BNumFlowCells = seqB * numFlowCells;
1294         
1295         float dist = 0;
1296         
1297         for(int i=0;i<minLength;i++){
1298             
1299             if (m->control_pressed) { break; }
1300             
1301             int flowAIntI = flowDataIntI[ANumFlowCells + i];
1302             float flowAPrI = flowDataPrI[ANumFlowCells + i];
1303             
1304             int flowBIntI = flowDataIntI[BNumFlowCells + i];
1305             float flowBPrI = flowDataPrI[BNumFlowCells + i];
1306             dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
1307         }
1308         
1309         dist /= (float) minLength;
1310         return dist;
1311     }
1312     catch(exception& e) {
1313         m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
1314         exit(1);
1315     }
1316 }
1317
1318 //**********************************************************************************************************************/
1319
1320 string ShhherCommand::cluster(string distFileName, string namesFileName){
1321     try {
1322         
1323         ReadMatrix* read = new ReadColumnMatrix(distFileName);  
1324         read->setCutoff(cutoff);
1325         
1326         NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1327         clusterNameMap->readMap();
1328         read->read(clusterNameMap);
1329         
1330         ListVector* list = read->getListVector();
1331         SparseMatrix* matrix = read->getMatrix();
1332         
1333         delete read; 
1334         delete clusterNameMap; 
1335         
1336         RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1337         
1338         Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
1339         string tag = cluster->getTag();
1340         
1341         double clusterCutoff = cutoff;
1342         while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1343             
1344             if (m->control_pressed) { break; }
1345             
1346             cluster->update(clusterCutoff);
1347         }
1348         
1349         list->setLabel(toString(cutoff));
1350         
1351         string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
1352         ofstream listFile;
1353         m->openOutputFile(listFileName, listFile);
1354         list->print(listFile);
1355         listFile.close();
1356         
1357         delete matrix;  delete cluster; delete rabund; delete list;
1358         
1359         return listFileName;
1360     }
1361     catch(exception& e) {
1362         m->errorOut(e, "ShhherCommand", "cluster");
1363         exit(1);        
1364     }           
1365 }
1366
1367 /**************************************************************************************************/
1368
1369 void ShhherCommand::calcCentroidsDriver(int start, int finish){                          
1370     
1371     //this function gets the most likely homopolymer length at a flow position for a group of sequences
1372     //within an otu
1373     
1374     try{
1375         
1376         for(int i=start;i<finish;i++){
1377             
1378             if (m->control_pressed) { break; }
1379             
1380             double count = 0;
1381             int position = 0;
1382             int minFlowGram = 100000000;
1383             double minFlowValue = 1e8;
1384             change[i] = 0; //FALSE
1385             
1386             for(int j=0;j<nSeqsPerOTU[i];j++){
1387                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1388             }
1389             
1390             if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1391                 vector<double> adF(nSeqsPerOTU[i]);
1392                 vector<int> anL(nSeqsPerOTU[i]);
1393                 
1394                 for(int j=0;j<nSeqsPerOTU[i];j++){
1395                     int index = cumNumSeqs[i] + j;
1396                     int nI = seqIndex[index];
1397                     int nIU = mapSeqToUnique[nI];
1398                     
1399                     int k;
1400                     for(k=0;k<position;k++){
1401                         if(nIU == anL[k]){
1402                             break;
1403                         }
1404                     }
1405                     if(k == position){
1406                         anL[position] = nIU;
1407                         adF[position] = 0.0000;
1408                         position++;
1409                     }                                           
1410                 }
1411                 
1412                 for(int j=0;j<nSeqsPerOTU[i];j++){
1413                     int index = cumNumSeqs[i] + j;
1414                     int nI = seqIndex[index];
1415                     
1416                     double tauValue = singleTau[seqNumber[index]];
1417                     
1418                     for(int k=0;k<position;k++){
1419                         double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1420                         adF[k] += dist * tauValue;
1421                     }
1422                 }
1423                 
1424                 for(int j=0;j<position;j++){
1425                     if(adF[j] < minFlowValue){
1426                         minFlowGram = j;
1427                         minFlowValue = adF[j];
1428                     }
1429                 }
1430                 
1431                 if(centroids[i] != anL[minFlowGram]){
1432                     change[i] = 1;
1433                     centroids[i] = anL[minFlowGram];
1434                 }
1435             }
1436             else if(centroids[i] != -1){
1437                 change[i] = 1;
1438                 centroids[i] = -1;                      
1439             }
1440         }
1441     }
1442     catch(exception& e) {
1443         m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1444         exit(1);        
1445     }           
1446 }
1447
1448 /**************************************************************************************************/
1449
1450 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1451     try{
1452         
1453         int flowAValue = cent * numFlowCells;
1454         int flowBValue = flow * numFlowCells;
1455         
1456         double dist = 0;
1457         
1458         for(int i=0;i<length;i++){
1459             dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1460             flowAValue++;
1461             flowBValue++;
1462         }
1463         
1464         return dist / (double)length;
1465     }
1466     catch(exception& e) {
1467         m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1468         exit(1);        
1469     }           
1470 }
1471
1472 /**************************************************************************************************/
1473
1474 double ShhherCommand::getNewWeights(){
1475     try{
1476         
1477         double maxChange = 0;
1478         
1479         for(int i=0;i<numOTUs;i++){
1480             
1481             if (m->control_pressed) { break; }
1482             
1483             double difference = weight[i];
1484             weight[i] = 0;
1485             
1486             for(int j=0;j<nSeqsPerOTU[i];j++){
1487                 int index = cumNumSeqs[i] + j;
1488                 double tauValue = singleTau[seqNumber[index]];
1489                 weight[i] += tauValue;
1490             }
1491             
1492             difference = fabs(weight[i] - difference);
1493             if(difference > maxChange){ maxChange = difference; }
1494         }
1495         return maxChange;
1496     }
1497     catch(exception& e) {
1498         m->errorOut(e, "ShhherCommand", "getNewWeights");
1499         exit(1);        
1500     }           
1501 }
1502  
1503  /**************************************************************************************************/
1504  
1505 double ShhherCommand::getLikelihood(){
1506     
1507     try{
1508         
1509         vector<long double> P(numSeqs, 0);
1510         int effNumOTUs = 0;
1511         
1512         for(int i=0;i<numOTUs;i++){
1513             if(weight[i] > MIN_WEIGHT){
1514                 effNumOTUs++;
1515             }
1516         }
1517         
1518         string hold;
1519         for(int i=0;i<numOTUs;i++){
1520             
1521             if (m->control_pressed) { break; }
1522             
1523             for(int j=0;j<nSeqsPerOTU[i];j++){
1524                 int index = cumNumSeqs[i] + j;
1525                 int nI = seqIndex[index];
1526                 double singleDist = dist[seqNumber[index]];
1527                 
1528                 P[nI] += weight[i] * exp(-singleDist * sigma);
1529             }
1530         }
1531         double nLL = 0.00;
1532         for(int i=0;i<numSeqs;i++){
1533             if(P[i] == 0){      P[i] = DBL_EPSILON;     }
1534             
1535             nLL += -log(P[i]);
1536         }
1537         
1538         nLL = nLL -(double)numSeqs * log(sigma);
1539         
1540         return nLL; 
1541     }
1542     catch(exception& e) {
1543         m->errorOut(e, "ShhherCommand", "getNewWeights");
1544         exit(1);        
1545     }           
1546 }
1547
1548 /**************************************************************************************************/
1549
1550 void ShhherCommand::checkCentroids(){
1551     try{
1552         vector<int> unique(numOTUs, 1);
1553         
1554         for(int i=0;i<numOTUs;i++){
1555             if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1556                 unique[i] = -1;
1557             }
1558         }
1559         
1560         for(int i=0;i<numOTUs;i++){
1561             
1562             if (m->control_pressed) { break; }
1563             
1564             if(unique[i] == 1){
1565                 for(int j=i+1;j<numOTUs;j++){
1566                     if(unique[j] == 1){
1567                         
1568                         if(centroids[j] == centroids[i]){
1569                             unique[j] = 0;
1570                             centroids[j] = -1;
1571                             
1572                             weight[i] += weight[j];
1573                             weight[j] = 0.0;
1574                         }
1575                     }
1576                 }
1577             }
1578         }
1579     }
1580     catch(exception& e) {
1581         m->errorOut(e, "ShhherCommand", "checkCentroids");
1582         exit(1);        
1583     }           
1584 }
1585  /**************************************************************************************************/
1586
1587
1588  
1589 void ShhherCommand::writeQualities(vector<int> otuCounts){
1590     
1591     try {
1592         string thisOutputDir = outputDir;
1593         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1594         string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.qual";
1595         
1596         ofstream qualityFile;
1597         m->openOutputFile(qualityFileName, qualityFile);
1598         
1599         qualityFile.setf(ios::fixed, ios::floatfield);
1600         qualityFile.setf(ios::showpoint);
1601         qualityFile << setprecision(6);
1602         
1603         vector<vector<int> > qualities(numOTUs);
1604         vector<double> pr(HOMOPS, 0);
1605         
1606         
1607         for(int i=0;i<numOTUs;i++){
1608             
1609             if (m->control_pressed) { break; }
1610             
1611             int index = 0;
1612             int base = 0;
1613             
1614             if(nSeqsPerOTU[i] > 0){
1615                 qualities[i].assign(1024, -1);
1616                 
1617                 while(index < numFlowCells){
1618                     double maxPrValue = 1e8;
1619                     short maxPrIndex = -1;
1620                     double count = 0.0000;
1621                     
1622                     pr.assign(HOMOPS, 0);
1623                     
1624                     for(int j=0;j<nSeqsPerOTU[i];j++){
1625                         int lIndex = cumNumSeqs[i] + j;
1626                         double tauValue = singleTau[seqNumber[lIndex]];
1627                         int sequenceIndex = aaI[i][j];
1628                         short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1629                         
1630                         count += tauValue;
1631                         
1632                         for(int s=0;s<HOMOPS;s++){
1633                             pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1634                         }
1635                     }
1636                     
1637                     maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1638                     maxPrValue = pr[maxPrIndex];
1639                     
1640                     if(count > MIN_COUNT){
1641                         double U = 0.0000;
1642                         double norm = 0.0000;
1643                         
1644                         for(int s=0;s<HOMOPS;s++){
1645                             norm += exp(-(pr[s] - maxPrValue));
1646                         }
1647                         
1648                         for(int s=1;s<=maxPrIndex;s++){
1649                             int value = 0;
1650                             double temp = 0.0000;
1651                             
1652                             U += exp(-(pr[s-1]-maxPrValue))/norm;
1653                             
1654                             if(U>0.00){
1655                                 temp = log10(U);
1656                             }
1657                             else{
1658                                 temp = -10.1;
1659                             }
1660                             temp = floor(-10 * temp);
1661                             value = (int)floor(temp);
1662                             if(value > 100){    value = 100;    }
1663                             
1664                             qualities[i][base] = (int)value;
1665                             base++;
1666                         }
1667                     }
1668                     
1669                     index++;
1670                 }
1671             }
1672             
1673             
1674             if(otuCounts[i] > 0){
1675                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1676                 
1677                 int j=4;        //need to get past the first four bases
1678                 while(qualities[i][j] != -1){
1679                     qualityFile << qualities[i][j] << ' ';
1680                     j++;
1681                 }
1682                 qualityFile << endl;
1683             }
1684         }
1685         qualityFile.close();
1686         outputNames.push_back(qualityFileName);
1687         
1688     }
1689     catch(exception& e) {
1690         m->errorOut(e, "ShhherCommand", "writeQualities");
1691         exit(1);        
1692     }           
1693 }
1694
1695 /**************************************************************************************************/
1696
1697 void ShhherCommand::writeSequences(vector<int> otuCounts){
1698     try {
1699         string thisOutputDir = outputDir;
1700         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1701         string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.fasta";
1702         ofstream fastaFile;
1703         m->openOutputFile(fastaFileName, fastaFile);
1704         
1705         vector<string> names(numOTUs, "");
1706         
1707         for(int i=0;i<numOTUs;i++){
1708             
1709             if (m->control_pressed) { break; }
1710             
1711             int index = centroids[i];
1712             
1713             if(otuCounts[i] > 0){
1714                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1715                 
1716                 string newSeq = "";
1717                 
1718                 for(int j=0;j<numFlowCells;j++){
1719                     
1720                     char base = flowOrder[j % 4];
1721                     for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1722                         newSeq += base;
1723                     }
1724                 }
1725                 
1726                 fastaFile << newSeq.substr(4) << endl;
1727             }
1728         }
1729         fastaFile.close();
1730         
1731         outputNames.push_back(fastaFileName);
1732         
1733         if(compositeFASTAFileName != ""){
1734             m->appendFiles(fastaFileName, compositeFASTAFileName);
1735         }
1736     }
1737     catch(exception& e) {
1738         m->errorOut(e, "ShhherCommand", "writeSequences");
1739         exit(1);        
1740     }           
1741 }
1742
1743 /**************************************************************************************************/
1744
1745 void ShhherCommand::writeNames(vector<int> otuCounts){
1746     try {
1747         string thisOutputDir = outputDir;
1748         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1749         string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.names";
1750         ofstream nameFile;
1751         m->openOutputFile(nameFileName, nameFile);
1752         
1753         for(int i=0;i<numOTUs;i++){
1754             
1755             if (m->control_pressed) { break; }
1756             
1757             if(otuCounts[i] > 0){
1758                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1759                 
1760                 for(int j=1;j<nSeqsPerOTU[i];j++){
1761                     nameFile << ',' << seqNameVector[aaI[i][j]];
1762                 }
1763                 
1764                 nameFile << endl;
1765             }
1766         }
1767         nameFile.close();
1768         outputNames.push_back(nameFileName);
1769         
1770         
1771         if(compositeNamesFileName != ""){
1772             m->appendFiles(nameFileName, uchimecompositeNamesFileName);
1773         }               
1774     }
1775     catch(exception& e) {
1776         m->errorOut(e, "ShhherCommand", "writeNames");
1777         exit(1);        
1778     }           
1779 }
1780
1781 /**************************************************************************************************/
1782
1783 void ShhherCommand::writeGroups(){
1784     try {
1785         string thisOutputDir = outputDir;
1786         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1787         string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1788         string groupFileName = fileRoot + "shhh.groups";
1789         ofstream groupFile;
1790         m->openOutputFile(groupFileName, groupFile);
1791         
1792         for(int i=0;i<numSeqs;i++){
1793             if (m->control_pressed) { break; }
1794             groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
1795         }
1796         groupFile.close();
1797         outputNames.push_back(groupFileName);
1798         
1799     }
1800     catch(exception& e) {
1801         m->errorOut(e, "ShhherCommand", "writeGroups");
1802         exit(1);        
1803     }           
1804 }
1805
1806 /**************************************************************************************************/
1807
1808 void ShhherCommand::writeClusters(vector<int> otuCounts){
1809     try {
1810         string thisOutputDir = outputDir;
1811         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1812         string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.counts";
1813         ofstream otuCountsFile;
1814         m->openOutputFile(otuCountsFileName, otuCountsFile);
1815         
1816         string bases = flowOrder;
1817         
1818         for(int i=0;i<numOTUs;i++){
1819             
1820             if (m->control_pressed) {
1821                 break;
1822             }
1823             //output the translated version of the centroid sequence for the otu
1824             if(otuCounts[i] > 0){
1825                 int index = centroids[i];
1826                 
1827                 otuCountsFile << "ideal\t";
1828                 for(int j=8;j<numFlowCells;j++){
1829                     char base = bases[j % 4];
1830                     for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1831                         otuCountsFile << base;
1832                     }
1833                 }
1834                 otuCountsFile << endl;
1835                 
1836                 for(int j=0;j<nSeqsPerOTU[i];j++){
1837                     int sequence = aaI[i][j];
1838                     otuCountsFile << seqNameVector[sequence] << '\t';
1839                     
1840                     string newSeq = "";
1841                     
1842                     for(int k=0;k<lengths[sequence];k++){
1843                         char base = bases[k % 4];
1844                         int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1845                         
1846                         for(int s=0;s<freq;s++){
1847                             newSeq += base;
1848                             //otuCountsFile << base;
1849                         }
1850                     }
1851                     otuCountsFile << newSeq.substr(4) << endl;
1852                 }
1853                 otuCountsFile << endl;
1854             }
1855         }
1856         otuCountsFile.close();
1857         outputNames.push_back(otuCountsFileName);
1858         
1859     }
1860     catch(exception& e) {
1861         m->errorOut(e, "ShhherCommand", "writeClusters");
1862         exit(1);        
1863     }           
1864 }
1865
1866 #else
1867 //**********************************************************************************************************************
1868
1869 int ShhherCommand::execute(){
1870         try {
1871                 if (abort == true) { return 0; }
1872                 
1873                 getSingleLookUp();      if (m->control_pressed) { return 0; }
1874                 getJointLookUp();       if (m->control_pressed) { return 0; }
1875                 
1876         int numFiles = flowFileVector.size();
1877                 
1878         if (numFiles < processors) { processors = numFiles; }
1879         
1880 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1881         if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size()); }
1882         else { createProcesses(flowFileVector); } //each processor processes one file
1883 #else
1884         driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size());
1885 #endif
1886         
1887                 if(compositeFASTAFileName != ""){
1888                         outputNames.push_back(compositeFASTAFileName);
1889                         outputNames.push_back(compositeNamesFileName);
1890                 }
1891
1892                 m->mothurOutEndLine();
1893                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
1894                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
1895                 m->mothurOutEndLine();
1896                 
1897                 return 0;
1898         }
1899         catch(exception& e) {
1900                 m->errorOut(e, "ShhherCommand", "execute");
1901                 exit(1);
1902         }
1903 }
1904 #endif
1905 /**************************************************************************************************/
1906
1907 int ShhherCommand::createProcesses(vector<string> filenames){
1908     try {
1909         vector<int> processIDS;
1910                 int process = 1;
1911                 int num = 0;
1912                 
1913                 //sanity check
1914                 if (filenames.size() < processors) { processors = filenames.size(); }
1915                 
1916                 //divide the groups between the processors
1917                 vector<linePair> lines;
1918                 int numFilesPerProcessor = filenames.size() / processors;
1919                 for (int i = 0; i < processors; i++) {
1920                         int startIndex =  i * numFilesPerProcessor;
1921                         int endIndex = (i+1) * numFilesPerProcessor;
1922                         if(i == (processors - 1)){      endIndex = filenames.size();    }
1923                         lines.push_back(linePair(startIndex, endIndex));
1924                 }
1925                 
1926         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)          
1927                 
1928                 //loop through and create all the processes you want
1929                 while (process != processors) {
1930                         int pid = fork();
1931                         
1932                         if (pid > 0) {
1933                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1934                                 process++;
1935                         }else if (pid == 0){
1936                                 num = driver(filenames, compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName  + toString(getpid()) + ".temp", lines[process].start, lines[process].end);
1937                                 exit(0);
1938                         }else { 
1939                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
1940                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1941                                 exit(0);
1942                         }
1943                 }
1944                 
1945                 //do my part
1946                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[0].start, lines[0].end);
1947                 
1948                 //force parent to wait until all the processes are done
1949                 for (int i=0;i<processIDS.size();i++) { 
1950                         int temp = processIDS[i];
1951                         wait(&temp);
1952                 }
1953         
1954         #else
1955         
1956         //////////////////////////////////////////////////////////////////////////////////////////////////////
1957         
1958         /////////////////////// NOT WORKING, ACCESS VIOLATION ON READ OF FLOWGRAMS IN THREAD /////////////////
1959         
1960         //////////////////////////////////////////////////////////////////////////////////////////////////////
1961                 //Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct. 
1962                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
1963                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1964                 
1965                 vector<shhhFlowsData*> pDataArray; 
1966                 DWORD   dwThreadIdArray[processors-1];
1967                 HANDLE  hThreadArray[processors-1]; 
1968                 
1969                 //Create processor worker threads.
1970                 for( int i=0; i<processors-1; i++ ){
1971                         // Allocate memory for thread data.
1972                         string extension = "";
1973                         if (i != 0) { extension = toString(i) + ".temp"; }
1974                         
1975             shhhFlowsData* tempFlow = new shhhFlowsData(filenames, (compositeFASTAFileName + extension), (compositeNamesFileName + extension), outputDir, flowOrder, jointLookUp, singleLookUp, m, lines[i].start, lines[i].end, cutoff, sigma, minDelta, maxIters, i);
1976                         pDataArray.push_back(tempFlow);
1977                         processIDS.push_back(i);
1978             
1979                         hThreadArray[i] = CreateThread(NULL, 0, ShhhFlowsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
1980                 }
1981                 
1982                 //using the main process as a worker saves time and memory
1983                 //do my part
1984                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[processors-1].start, lines[processors-1].end);
1985                 
1986                 //Wait until all threads have terminated.
1987                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1988                 
1989                 //Close all thread handles and free memory allocations.
1990                 for(int i=0; i < pDataArray.size(); i++){
1991                         for(int j=0; j < pDataArray[i]->outputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); }
1992                         CloseHandle(hThreadArray[i]);
1993                         delete pDataArray[i];
1994                 }
1995                 
1996         #endif
1997         
1998         for (int i=0;i<processIDS.size();i++) { 
1999             if (compositeFASTAFileName != "") {
2000                 m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
2001                 m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
2002                 m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp"));
2003                 m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp"));
2004             }
2005         }
2006         
2007         return 0;
2008         
2009     }
2010         catch(exception& e) {
2011                 m->errorOut(e, "ShhherCommand", "createProcesses");
2012                 exit(1);
2013         }
2014 }
2015 /**************************************************************************************************/
2016
2017 int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName, int start, int end){
2018     try {
2019         
2020         for(int i=start;i<end;i++){
2021                         
2022                         if (m->control_pressed) { break; }
2023                         
2024                         string flowFileName = filenames[i];
2025             
2026                         m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
2027                         m->mothurOut("Reading flowgrams...\n");
2028                         
2029             vector<string> seqNameVector;
2030             vector<int> lengths;
2031             vector<short> flowDataIntI;
2032             vector<double> flowDataPrI;
2033             map<string, int> nameMap;
2034             vector<short> uniqueFlowgrams;
2035             vector<int> uniqueCount;
2036             vector<int> mapSeqToUnique;
2037             vector<int> mapUniqueToSeq;
2038             vector<int> uniqueLengths;
2039             int numFlowCells;
2040             
2041             int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
2042                         
2043                         if (m->control_pressed) { break; }
2044                         
2045                         m->mothurOut("Identifying unique flowgrams...\n");
2046                         int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
2047                         
2048                         if (m->control_pressed) { break; }
2049                         
2050                         m->mothurOut("Calculating distances between flowgrams...\n");
2051             string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
2052             unsigned long long begTime = time(NULL);
2053             double begClock = clock();
2054         
2055             flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);     
2056             
2057             m->mothurOutEndLine();
2058             m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
2059
2060             
2061                         string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
2062                         createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
2063                         
2064                         if (m->control_pressed) { break; }
2065                         
2066                         m->mothurOut("\nClustering flowgrams...\n");
2067             string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
2068                         cluster(listFileName, distFileName, namesFileName);
2069                         
2070                         if (m->control_pressed) { break; }
2071             
2072             vector<int> otuData;
2073             vector<int> cumNumSeqs;
2074             vector<int> nSeqsPerOTU;
2075             vector<vector<int> > aaP;   //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2076             vector<vector<int> > aaI;   //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2077             vector<int> seqNumber;              //tMaster->anP:         the sequence id number sorted by OTU
2078             vector<int> seqIndex;               //tMaster->anI;         the index that corresponds to seqNumber
2079
2080                         
2081                         int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
2082                         
2083                         if (m->control_pressed) { break; }
2084                         
2085                         m->mothurRemove(distFileName);
2086                         m->mothurRemove(namesFileName);
2087                         m->mothurRemove(listFileName);
2088                         
2089             vector<double> dist;                //adDist - distance of sequences to centroids
2090             vector<short> change;               //did the centroid sequence change? 0 = no; 1 = yes
2091             vector<int> centroids;              //the representative flowgram for each cluster m
2092             vector<double> weight;
2093             vector<double> singleTau;   //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2094             vector<int> nSeqsBreaks;
2095             vector<int> nOTUsBreaks;
2096             
2097                         dist.assign(numSeqs * numOTUs, 0);
2098             change.assign(numOTUs, 1);
2099             centroids.assign(numOTUs, -1);
2100             weight.assign(numOTUs, 0);
2101             singleTau.assign(numSeqs, 1.0);
2102             
2103             nSeqsBreaks.assign(2, 0);
2104             nOTUsBreaks.assign(2, 0);
2105             
2106             nSeqsBreaks[0] = 0;
2107             nSeqsBreaks[1] = numSeqs;
2108             nOTUsBreaks[1] = numOTUs;
2109                         
2110                         if (m->control_pressed) { break; }
2111                         
2112                         double maxDelta = 0;
2113                         int iter = 0;
2114                         
2115                         begClock = clock();
2116                         begTime = time(NULL);
2117             
2118                         m->mothurOut("\nDenoising flowgrams...\n");
2119                         m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
2120                         
2121                         while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
2122                                 
2123                                 if (m->control_pressed) { break; }
2124                                 
2125                                 double cycClock = clock();
2126                                 unsigned long long cycTime = time(NULL);
2127                                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2128                                 
2129                                 if (m->control_pressed) { break; }
2130                 
2131                                 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2132                                 
2133                                 if (m->control_pressed) { break; }
2134                 
2135                                 maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);  
2136                 
2137                 if (m->control_pressed) { break; }
2138                 
2139                                 double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight); 
2140                 
2141                 if (m->control_pressed) { break; }
2142                 
2143                                 checkCentroids(numOTUs, centroids, weight);
2144                                 
2145                                 if (m->control_pressed) { break; }
2146                                 
2147                                 calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU,  dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
2148                                 
2149                                 if (m->control_pressed) { break; }
2150                                 
2151                                 iter++;
2152                                 
2153                                 m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
2154                 
2155                         }       
2156                         
2157                         if (m->control_pressed) { break; }
2158                         
2159                         m->mothurOut("\nFinalizing...\n");
2160                         fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2161                         
2162                         if (m->control_pressed) { break; }
2163                         
2164                         setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
2165                         
2166                         if (m->control_pressed) { break; }
2167                         
2168                         vector<int> otuCounts(numOTUs, 0);
2169                         for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
2170                         
2171                         calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber); 
2172             
2173             if (m->control_pressed) { break; }
2174             
2175                         writeQualities(numOTUs, numFlowCells, flowFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
2176                         writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, flowFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
2177                         writeNames(thisCompositeNamesFileName, numOTUs, flowFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);                              if (m->control_pressed) { break; }
2178                         writeClusters(flowFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);                       if (m->control_pressed) { break; }
2179                         writeGroups(flowFileName, numSeqs, seqNameVector);                                              if (m->control_pressed) { break; }
2180                         
2181                         m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
2182                 }
2183                 
2184         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
2185         
2186         return 0;
2187         
2188     }catch(exception& e) {
2189             m->errorOut(e, "ShhherCommand", "driver");
2190             exit(1);
2191     }
2192 }
2193
2194 /**************************************************************************************************/
2195 int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
2196         try{
2197        
2198                 ifstream flowFile;
2199        
2200                 m->openInputFile(filename, flowFile);
2201                 
2202                 string seqName;
2203                 int currentNumFlowCells;
2204                 float intensity;
2205         thisSeqNameVector.clear();
2206                 thisLengths.clear();
2207                 thisFlowDataIntI.clear();
2208                 thisNameMap.clear();
2209                 
2210                 flowFile >> numFlowCells;
2211                 int index = 0;//pcluster
2212                 while(!flowFile.eof()){
2213                         
2214                         if (m->control_pressed) { break; }
2215                         
2216                         flowFile >> seqName >> currentNumFlowCells;
2217                         thisLengths.push_back(currentNumFlowCells);
2218            
2219                         thisSeqNameVector.push_back(seqName);
2220                         thisNameMap[seqName] = index++;//pcluster
2221
2222                         for(int i=0;i<numFlowCells;i++){
2223                                 flowFile >> intensity;
2224                                 if(intensity > 9.99)    {       intensity = 9.99;       }
2225                                 int intI = int(100 * intensity + 0.0001);
2226                                 thisFlowDataIntI.push_back(intI);
2227                         }
2228                         m->gobble(flowFile);
2229                 }
2230                 flowFile.close();
2231                 
2232                 int numSeqs = thisSeqNameVector.size();         
2233                 
2234                 for(int i=0;i<numSeqs;i++){
2235                         
2236                         if (m->control_pressed) { break; }
2237                         
2238                         int iNumFlowCells = i * numFlowCells;
2239                         for(int j=thisLengths[i];j<numFlowCells;j++){
2240                                 thisFlowDataIntI[iNumFlowCells + j] = 0;
2241                         }
2242                 }
2243         
2244         return numSeqs;
2245                 
2246         }
2247         catch(exception& e) {
2248                 m->errorOut(e, "ShhherCommand", "getFlowData");
2249                 exit(1);
2250         }
2251 }
2252 /**************************************************************************************************/
2253
2254 int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2255         try{            
2256         
2257                 ostringstream outStream;
2258                 outStream.setf(ios::fixed, ios::floatfield);
2259                 outStream.setf(ios::dec, ios::basefield);
2260                 outStream.setf(ios::showpoint);
2261                 outStream.precision(6);
2262                 
2263                 int begTime = time(NULL);
2264                 double begClock = clock();
2265         
2266                 for(int i=0;i<stopSeq;i++){
2267                         
2268                         if (m->control_pressed) { break; }
2269                         
2270                         for(int j=0;j<i;j++){
2271                                 float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2272                 
2273                                 if(flowDistance < 1e-6){
2274                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
2275                                 }
2276                                 else if(flowDistance <= cutoff){
2277                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
2278                                 }
2279                         }
2280                         if(i % 100 == 0){
2281                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
2282                                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2283                                 m->mothurOutEndLine();
2284                         }
2285                 }
2286                 
2287                 ofstream distFile(distFileName.c_str());
2288                 distFile << outStream.str();            
2289                 distFile.close();
2290                 
2291                 if (m->control_pressed) {}
2292                 else {
2293                         m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
2294                         m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2295                         m->mothurOutEndLine();
2296                 }
2297         
2298         return 0;
2299         }
2300         catch(exception& e) {
2301                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
2302                 exit(1);
2303         }
2304 }
2305 /**************************************************************************************************/
2306
2307 float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2308         try{
2309                 int minLength = lengths[mapSeqToUnique[seqA]];
2310                 if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
2311                 
2312                 int ANumFlowCells = seqA * numFlowCells;
2313                 int BNumFlowCells = seqB * numFlowCells;
2314                 
2315                 float dist = 0;
2316                 
2317                 for(int i=0;i<minLength;i++){
2318                         
2319                         if (m->control_pressed) { break; }
2320                         
2321                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
2322                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
2323                         
2324                         int flowBIntI = flowDataIntI[BNumFlowCells + i];
2325                         float flowBPrI = flowDataPrI[BNumFlowCells + i];
2326                         dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
2327                 }
2328                 
2329                 dist /= (float) minLength;
2330                 return dist;
2331         }
2332         catch(exception& e) {
2333                 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
2334                 exit(1);
2335         }
2336 }
2337
2338 /**************************************************************************************************/
2339
2340 int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2341         try{
2342                 int numUniques = 0;
2343                 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
2344                 uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
2345                 uniqueLengths.assign(numSeqs, 0);
2346                 mapSeqToUnique.assign(numSeqs, -1);
2347                 mapUniqueToSeq.assign(numSeqs, -1);
2348                 
2349                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
2350                 
2351                 for(int i=0;i<numSeqs;i++){
2352                         
2353                         if (m->control_pressed) { break; }
2354                         
2355                         int index = 0;
2356                         
2357                         vector<short> current(numFlowCells);
2358                         for(int j=0;j<numFlowCells;j++){
2359                                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
2360                         }
2361             
2362                         for(int j=0;j<numUniques;j++){
2363                                 int offset = j * numFlowCells;
2364                                 bool toEnd = 1;
2365                                 
2366                                 int shorterLength;
2367                                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
2368                                 else                                                            {       shorterLength = uniqueLengths[j];       }
2369                 
2370                                 for(int k=0;k<shorterLength;k++){
2371                                         if(current[k] != uniqueFlowgrams[offset + k]){
2372                                                 toEnd = 0;
2373                                                 break;
2374                                         }
2375                                 }
2376                                 
2377                                 if(toEnd){
2378                                         mapSeqToUnique[i] = j;
2379                                         uniqueCount[j]++;
2380                                         index = j;
2381                                         if(lengths[i] > uniqueLengths[j])       {       uniqueLengths[j] = lengths[i];  }
2382                                         break;
2383                                 }
2384                                 index++;
2385                         }
2386                         
2387                         if(index == numUniques){
2388                                 uniqueLengths[numUniques] = lengths[i];
2389                                 uniqueCount[numUniques] = 1;
2390                                 mapSeqToUnique[i] = numUniques;//anMap
2391                                 mapUniqueToSeq[numUniques] = i;//anF
2392                                 
2393                                 for(int k=0;k<numFlowCells;k++){
2394                                         uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
2395                                         uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
2396                                 }
2397                                 
2398                                 numUniques++;
2399                         }
2400                 }
2401                 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
2402                 uniqueLengths.resize(numUniques);       
2403                 uniqueFlowgrams.resize(numFlowCells * numUniques);
2404         
2405                 flowDataPrI.resize(numSeqs * numFlowCells, 0);
2406                 for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
2407         
2408         return numUniques;
2409         }
2410         catch(exception& e) {
2411                 m->errorOut(e, "ShhherCommand", "getUniques");
2412                 exit(1);
2413         }
2414 }
2415 /**************************************************************************************************/
2416 int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
2417         try{
2418                 
2419                 vector<string> duplicateNames(numUniques, "");
2420                 for(int i=0;i<numSeqs;i++){
2421                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
2422                 }
2423                 
2424                 ofstream nameFile;
2425                 m->openOutputFile(filename, nameFile);
2426                 
2427                 for(int i=0;i<numUniques;i++){
2428                         
2429                         if (m->control_pressed) { break; }
2430                         
2431             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2432                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2433                 }
2434                 
2435                 nameFile.close();
2436         
2437                 return 0;
2438         }
2439         catch(exception& e) {
2440                 m->errorOut(e, "ShhherCommand", "createNamesFile");
2441                 exit(1);
2442         }
2443 }
2444 //**********************************************************************************************************************
2445
2446 int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
2447         try {
2448                 
2449                 ReadMatrix* read = new ReadColumnMatrix(distFileName);  
2450                 read->setCutoff(cutoff);
2451                 
2452                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
2453                 clusterNameMap->readMap();
2454                 read->read(clusterNameMap);
2455         
2456                 ListVector* list = read->getListVector();
2457                 SparseMatrix* matrix = read->getMatrix();
2458                 
2459                 delete read; 
2460                 delete clusterNameMap; 
2461         
2462                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
2463                 
2464                 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
2465                 string tag = cluster->getTag();
2466                 
2467                 double clusterCutoff = cutoff;
2468                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
2469                         
2470                         if (m->control_pressed) { break; }
2471                         
2472                         cluster->update(clusterCutoff);
2473                 }
2474                 
2475                 list->setLabel(toString(cutoff));
2476                 
2477                 ofstream listFile;
2478                 m->openOutputFile(filename, listFile);
2479                 list->print(listFile);
2480                 listFile.close();
2481                 
2482                 delete matrix;  delete cluster; delete rabund; delete list;
2483         
2484                 return 0;
2485         }
2486         catch(exception& e) {
2487                 m->errorOut(e, "ShhherCommand", "cluster");
2488                 exit(1);        
2489         }               
2490 }
2491 /**************************************************************************************************/
2492
2493 int ShhherCommand::getOTUData(int numSeqs, string fileName,  vector<int>& otuData,
2494                                vector<int>& cumNumSeqs,
2495                                vector<int>& nSeqsPerOTU,
2496                                vector<vector<int> >& aaP,       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2497                                vector<vector<int> >& aaI,       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2498                                vector<int>& seqNumber,          //tMaster->anP:         the sequence id number sorted by OTU
2499                                vector<int>& seqIndex,
2500                                map<string, int>& nameMap){
2501         try {
2502         
2503                 ifstream listFile;
2504                 m->openInputFile(fileName, listFile);
2505                 string label;
2506         int numOTUs;
2507                 
2508                 listFile >> label >> numOTUs;
2509         
2510                 otuData.assign(numSeqs, 0);
2511                 cumNumSeqs.assign(numOTUs, 0);
2512                 nSeqsPerOTU.assign(numOTUs, 0);
2513                 aaP.clear();aaP.resize(numOTUs);
2514                 
2515                 seqNumber.clear();
2516                 aaI.clear();
2517                 seqIndex.clear();
2518                 
2519                 string singleOTU = "";
2520                 
2521                 for(int i=0;i<numOTUs;i++){
2522                         
2523                         if (m->control_pressed) { break; }
2524             
2525                         listFile >> singleOTU;
2526                         
2527                         istringstream otuString(singleOTU);
2528             
2529                         while(otuString){
2530                                 
2531                                 string seqName = "";
2532                                 
2533                                 for(int j=0;j<singleOTU.length();j++){
2534                                         char letter = otuString.get();
2535                                         
2536                                         if(letter != ','){
2537                                                 seqName += letter;
2538                                         }
2539                                         else{
2540                                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2541                                                 int index = nmIt->second;
2542                                                 
2543                                                 nameMap.erase(nmIt);
2544                                                 
2545                                                 otuData[index] = i;
2546                                                 nSeqsPerOTU[i]++;
2547                                                 aaP[i].push_back(index);
2548                                                 seqName = "";
2549                                         }
2550                                 }
2551                                 
2552                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2553                 
2554                                 int index = nmIt->second;
2555                                 nameMap.erase(nmIt);
2556                 
2557                                 otuData[index] = i;
2558                                 nSeqsPerOTU[i]++;
2559                                 aaP[i].push_back(index);        
2560                                 
2561                                 otuString.get();
2562                         }
2563                         
2564                         sort(aaP[i].begin(), aaP[i].end());
2565                         for(int j=0;j<nSeqsPerOTU[i];j++){
2566                                 seqNumber.push_back(aaP[i][j]);
2567                         }
2568                         for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
2569                                 aaP[i].push_back(0);
2570                         }
2571                         
2572                         
2573                 }
2574                 
2575                 for(int i=1;i<numOTUs;i++){
2576                         cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
2577                 }
2578                 aaI = aaP;
2579                 seqIndex = seqNumber;
2580                 
2581                 listFile.close();       
2582         
2583         return numOTUs;
2584                 
2585         }
2586         catch(exception& e) {
2587                 m->errorOut(e, "ShhherCommand", "getOTUData");
2588                 exit(1);        
2589         }               
2590 }
2591 /**************************************************************************************************/
2592
2593 int ShhherCommand::calcCentroidsDriver(int numOTUs, 
2594                                           vector<int>& cumNumSeqs,
2595                                           vector<int>& nSeqsPerOTU,
2596                                           vector<int>& seqIndex,
2597                                           vector<short>& change,                //did the centroid sequence change? 0 = no; 1 = yes
2598                                           vector<int>& centroids,               //the representative flowgram for each cluster m
2599                                           vector<double>& singleTau,    //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2600                                           vector<int>& mapSeqToUnique,
2601                                           vector<short>& uniqueFlowgrams,
2602                                           vector<short>& flowDataIntI,
2603                                           vector<int>& lengths,
2604                                           int numFlowCells,
2605                                           vector<int>& seqNumber){                          
2606         
2607         //this function gets the most likely homopolymer length at a flow position for a group of sequences
2608         //within an otu
2609         
2610         try{
2611                 
2612                 for(int i=0;i<numOTUs;i++){
2613                         
2614                         if (m->control_pressed) { break; }
2615                         
2616                         double count = 0;
2617                         int position = 0;
2618                         int minFlowGram = 100000000;
2619                         double minFlowValue = 1e8;
2620                         change[i] = 0; //FALSE
2621                         
2622                         for(int j=0;j<nSeqsPerOTU[i];j++){
2623                                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
2624                         }
2625             
2626                         if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
2627                                 vector<double> adF(nSeqsPerOTU[i]);
2628                                 vector<int> anL(nSeqsPerOTU[i]);
2629                                 
2630                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2631                                         int index = cumNumSeqs[i] + j;
2632                                         int nI = seqIndex[index];
2633                                         int nIU = mapSeqToUnique[nI];
2634                                         
2635                                         int k;
2636                                         for(k=0;k<position;k++){
2637                                                 if(nIU == anL[k]){
2638                                                         break;
2639                                                 }
2640                                         }
2641                                         if(k == position){
2642                                                 anL[position] = nIU;
2643                                                 adF[position] = 0.0000;
2644                                                 position++;
2645                                         }                                               
2646                                 }
2647                                 
2648                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2649                                         int index = cumNumSeqs[i] + j;
2650                                         int nI = seqIndex[index];
2651                                         
2652                                         double tauValue = singleTau[seqNumber[index]];
2653                                         
2654                                         for(int k=0;k<position;k++){
2655                                                 double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
2656                                                 adF[k] += dist * tauValue;
2657                                         }
2658                                 }
2659                                 
2660                                 for(int j=0;j<position;j++){
2661                                         if(adF[j] < minFlowValue){
2662                                                 minFlowGram = j;
2663                                                 minFlowValue = adF[j];
2664                                         }
2665                                 }
2666                                 
2667                                 if(centroids[i] != anL[minFlowGram]){
2668                                         change[i] = 1;
2669                                         centroids[i] = anL[minFlowGram];
2670                                 }
2671                         }
2672                         else if(centroids[i] != -1){
2673                                 change[i] = 1;
2674                                 centroids[i] = -1;                      
2675                         }
2676                 }
2677         
2678         return 0;
2679         }
2680         catch(exception& e) {
2681                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
2682                 exit(1);        
2683         }               
2684 }
2685 /**************************************************************************************************/
2686
2687 double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
2688                                         vector<short>& flowDataIntI, int numFlowCells){
2689         try{
2690                 
2691                 int flowAValue = cent * numFlowCells;
2692                 int flowBValue = flow * numFlowCells;
2693                 
2694                 double dist = 0;
2695         
2696                 for(int i=0;i<length;i++){
2697                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
2698                         flowAValue++;
2699                         flowBValue++;
2700                 }
2701                 
2702                 return dist / (double)length;
2703         }
2704         catch(exception& e) {
2705                 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
2706                 exit(1);        
2707         }               
2708 }
2709 /**************************************************************************************************/
2710
2711 double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
2712         try{
2713                 
2714                 double maxChange = 0;
2715                 
2716                 for(int i=0;i<numOTUs;i++){
2717                         
2718                         if (m->control_pressed) { break; }
2719                         
2720                         double difference = weight[i];
2721                         weight[i] = 0;
2722                         
2723                         for(int j=0;j<nSeqsPerOTU[i];j++){
2724                                 int index = cumNumSeqs[i] + j;
2725                                 double tauValue = singleTau[seqNumber[index]];
2726                                 weight[i] += tauValue;
2727                         }
2728                         
2729                         difference = fabs(weight[i] - difference);
2730                         if(difference > maxChange){     maxChange = difference; }
2731                 }
2732                 return maxChange;
2733         }
2734         catch(exception& e) {
2735                 m->errorOut(e, "ShhherCommand", "getNewWeights");
2736                 exit(1);        
2737         }               
2738 }
2739
2740 /**************************************************************************************************/
2741
2742 double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
2743         
2744         try{
2745                 
2746                 vector<long double> P(numSeqs, 0);
2747                 int effNumOTUs = 0;
2748                 
2749                 for(int i=0;i<numOTUs;i++){
2750                         if(weight[i] > MIN_WEIGHT){
2751                                 effNumOTUs++;
2752                         }
2753                 }
2754                 
2755                 string hold;
2756                 for(int i=0;i<numOTUs;i++){
2757                         
2758                         if (m->control_pressed) { break; }
2759                         
2760                         for(int j=0;j<nSeqsPerOTU[i];j++){
2761                                 int index = cumNumSeqs[i] + j;
2762                                 int nI = seqIndex[index];
2763                                 double singleDist = dist[seqNumber[index]];
2764                                 
2765                                 P[nI] += weight[i] * exp(-singleDist * sigma);
2766                         }
2767                 }
2768                 double nLL = 0.00;
2769                 for(int i=0;i<numSeqs;i++){
2770                         if(P[i] == 0){  P[i] = DBL_EPSILON;     }
2771             
2772                         nLL += -log(P[i]);
2773                 }
2774                 
2775                 nLL = nLL -(double)numSeqs * log(sigma);
2776         
2777                 return nLL; 
2778         }
2779         catch(exception& e) {
2780                 m->errorOut(e, "ShhherCommand", "getNewWeights");
2781                 exit(1);        
2782         }               
2783 }
2784
2785 /**************************************************************************************************/
2786
2787 int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
2788         try{
2789                 vector<int> unique(numOTUs, 1);
2790                 
2791                 for(int i=0;i<numOTUs;i++){
2792                         if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
2793                                 unique[i] = -1;
2794                         }
2795                 }
2796                 
2797                 for(int i=0;i<numOTUs;i++){
2798                         
2799                         if (m->control_pressed) { break; }
2800                         
2801                         if(unique[i] == 1){
2802                                 for(int j=i+1;j<numOTUs;j++){
2803                                         if(unique[j] == 1){
2804                                                 
2805                                                 if(centroids[j] == centroids[i]){
2806                                                         unique[j] = 0;
2807                                                         centroids[j] = -1;
2808                                                         
2809                                                         weight[i] += weight[j];
2810                                                         weight[j] = 0.0;
2811                                                 }
2812                                         }
2813                                 }
2814                         }
2815                 }
2816         
2817         return 0;
2818         }
2819         catch(exception& e) {
2820                 m->errorOut(e, "ShhherCommand", "checkCentroids");
2821                 exit(1);        
2822         }               
2823 }
2824 /**************************************************************************************************/
2825
2826 void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<double>& dist, 
2827                                      vector<double>& weight, vector<short>& change, vector<int>& centroids,
2828                                      vector<vector<int> >& aaP, vector<double>& singleTau, vector<vector<int> >& aaI,   
2829                                      vector<int>& seqNumber, vector<int>& seqIndex,
2830                                      vector<short>& uniqueFlowgrams,
2831                                      vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
2832         
2833         try{
2834                 
2835                 int total = 0;
2836                 vector<double> newTau(numOTUs,0);
2837                 vector<double> norms(numSeqs, 0);
2838                 nSeqsPerOTU.assign(numOTUs, 0);
2839         
2840                 for(int i=0;i<numSeqs;i++){
2841                         
2842                         if (m->control_pressed) { break; }
2843                         
2844                         int indexOffset = i * numOTUs;
2845             
2846                         double offset = 1e8;
2847                         
2848                         for(int j=0;j<numOTUs;j++){
2849                 
2850                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
2851                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
2852                                 }
2853                 
2854                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
2855                                         offset = dist[indexOffset + j];
2856                                 }
2857                         }
2858             
2859                         for(int j=0;j<numOTUs;j++){
2860                                 if(weight[j] > MIN_WEIGHT){
2861                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
2862                                         norms[i] += newTau[j];
2863                                 }
2864                                 else{
2865                                         newTau[j] = 0.0;
2866                                 }
2867                         }
2868             
2869                         for(int j=0;j<numOTUs;j++){
2870                                 newTau[j] /= norms[i];
2871                         }
2872             
2873                         for(int j=0;j<numOTUs;j++){
2874                                 if(newTau[j] > MIN_TAU){
2875                                         
2876                                         int oldTotal = total;
2877                                         
2878                                         total++;
2879                                         
2880                                         singleTau.resize(total, 0);
2881                                         seqNumber.resize(total, 0);
2882                                         seqIndex.resize(total, 0);
2883                                         
2884                                         singleTau[oldTotal] = newTau[j];
2885                                         
2886                                         aaP[j][nSeqsPerOTU[j]] = oldTotal;
2887                                         aaI[j][nSeqsPerOTU[j]] = i;
2888                                         nSeqsPerOTU[j]++;
2889                                 }
2890                         }
2891             
2892                 }
2893         
2894         }
2895         catch(exception& e) {
2896                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
2897                 exit(1);        
2898         }               
2899 }
2900 /**************************************************************************************************/
2901
2902 int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
2903         try {
2904                 int index = 0;
2905                 for(int i=0;i<numOTUs;i++){
2906                         
2907                         if (m->control_pressed) { return 0; }
2908                         
2909                         cumNumSeqs[i] = index;
2910                         for(int j=0;j<nSeqsPerOTU[i];j++){
2911                                 seqNumber[index] = aaP[i][j];
2912                                 seqIndex[index] = aaI[i][j];
2913                                 
2914                                 index++;
2915                         }
2916                 }
2917         
2918         return 0; 
2919         }
2920         catch(exception& e) {
2921                 m->errorOut(e, "ShhherCommand", "fill");
2922                 exit(1);        
2923         }               
2924 }
2925 /**************************************************************************************************/
2926
2927 void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
2928                             vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
2929         
2930         try {
2931                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
2932                 
2933                 for(int i=0;i<numOTUs;i++){
2934                         
2935                         if (m->control_pressed) { break; }
2936                         
2937                         for(int j=0;j<nSeqsPerOTU[i];j++){
2938                                 int index = cumNumSeqs[i] + j;
2939                                 double tauValue = singleTau[seqNumber[index]];
2940                                 int sIndex = seqIndex[index];
2941                                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
2942                         }
2943                 }
2944                 
2945                 for(int i=0;i<numSeqs;i++){
2946                         double maxTau = -1.0000;
2947                         int maxOTU = -1;
2948                         for(int j=0;j<numOTUs;j++){
2949                                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
2950                                         maxTau = bigTauMatrix[i * numOTUs + j];
2951                                         maxOTU = j;
2952                                 }
2953                         }
2954                         
2955                         otuData[i] = maxOTU;
2956                 }
2957                 
2958                 nSeqsPerOTU.assign(numOTUs, 0);         
2959                 
2960                 for(int i=0;i<numSeqs;i++){
2961                         int index = otuData[i];
2962                         
2963                         singleTau[i] = 1.0000;
2964                         dist[i] = 0.0000;
2965                         
2966                         aaP[index][nSeqsPerOTU[index]] = i;
2967                         aaI[index][nSeqsPerOTU[index]] = i;
2968                         
2969                         nSeqsPerOTU[index]++;
2970                 }
2971         
2972                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);  
2973         }
2974         catch(exception& e) {
2975                 m->errorOut(e, "ShhherCommand", "setOTUs");
2976                 exit(1);        
2977         }               
2978 }
2979 /**************************************************************************************************/
2980
2981 void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string filename, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
2982                                    vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
2983                                    vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
2984         
2985         try {
2986                 string thisOutputDir = outputDir;
2987                 if (outputDir == "") {  thisOutputDir += m->hasPath(filename);  }
2988                 string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.qual";
2989         
2990                 ofstream qualityFile;
2991                 m->openOutputFile(qualityFileName, qualityFile);
2992         
2993                 qualityFile.setf(ios::fixed, ios::floatfield);
2994                 qualityFile.setf(ios::showpoint);
2995                 qualityFile << setprecision(6);
2996                 
2997                 vector<vector<int> > qualities(numOTUs);
2998                 vector<double> pr(HOMOPS, 0);
2999                 
3000                 
3001                 for(int i=0;i<numOTUs;i++){
3002                         
3003                         if (m->control_pressed) { break; }
3004                         
3005                         int index = 0;
3006                         int base = 0;
3007                         
3008                         if(nSeqsPerOTU[i] > 0){
3009                                 qualities[i].assign(1024, -1);
3010                                 
3011                                 while(index < numFlowCells){
3012                                         double maxPrValue = 1e8;
3013                                         short maxPrIndex = -1;
3014                                         double count = 0.0000;
3015                                         
3016                                         pr.assign(HOMOPS, 0);
3017                                         
3018                                         for(int j=0;j<nSeqsPerOTU[i];j++){
3019                                                 int lIndex = cumNumSeqs[i] + j;
3020                                                 double tauValue = singleTau[seqNumber[lIndex]];
3021                                                 int sequenceIndex = aaI[i][j];
3022                                                 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
3023                                                 
3024                                                 count += tauValue;
3025                                                 
3026                                                 for(int s=0;s<HOMOPS;s++){
3027                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
3028                                                 }
3029                                         }
3030                                         
3031                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
3032                                         maxPrValue = pr[maxPrIndex];
3033                                         
3034                                         if(count > MIN_COUNT){
3035                                                 double U = 0.0000;
3036                                                 double norm = 0.0000;
3037                                                 
3038                                                 for(int s=0;s<HOMOPS;s++){
3039                                                         norm += exp(-(pr[s] - maxPrValue));
3040                                                 }
3041                                                 
3042                                                 for(int s=1;s<=maxPrIndex;s++){
3043                                                         int value = 0;
3044                                                         double temp = 0.0000;
3045                                                         
3046                                                         U += exp(-(pr[s-1]-maxPrValue))/norm;
3047                                                         
3048                                                         if(U>0.00){
3049                                                                 temp = log10(U);
3050                                                         }
3051                                                         else{
3052                                                                 temp = -10.1;
3053                                                         }
3054                                                         temp = floor(-10 * temp);
3055                                                         value = (int)floor(temp);
3056                                                         if(value > 100){        value = 100;    }
3057                                                         
3058                                                         qualities[i][base] = (int)value;
3059                                                         base++;
3060                                                 }
3061                                         }
3062                                         
3063                                         index++;
3064                                 }
3065                         }
3066                         
3067                         
3068                         if(otuCounts[i] > 0){
3069                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
3070                         
3071                                 int j=4;        //need to get past the first four bases
3072                                 while(qualities[i][j] != -1){
3073                     qualityFile << qualities[i][j] << ' ';
3074                     if (j > qualities[i].size()) { break; }
3075                     j++;
3076                                 }
3077                                 qualityFile << endl;
3078                         }
3079                 }
3080                 qualityFile.close();
3081                 outputNames.push_back(qualityFileName);
3082         
3083         }
3084         catch(exception& e) {
3085                 m->errorOut(e, "ShhherCommand", "writeQualities");
3086                 exit(1);        
3087         }               
3088 }
3089
3090 /**************************************************************************************************/
3091
3092 void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string filename, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
3093         try {
3094                 string thisOutputDir = outputDir;
3095                 if (outputDir == "") {  thisOutputDir += m->hasPath(filename);  }
3096                 string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.fasta";
3097                 ofstream fastaFile;
3098                 m->openOutputFile(fastaFileName, fastaFile);
3099                 
3100                 vector<string> names(numOTUs, "");
3101                 
3102                 for(int i=0;i<numOTUs;i++){
3103                         
3104                         if (m->control_pressed) { break; }
3105                         
3106                         int index = centroids[i];
3107                         
3108                         if(otuCounts[i] > 0){
3109                                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
3110                                 
3111                                 string newSeq = "";
3112                                 
3113                                 for(int j=0;j<numFlowCells;j++){
3114                                         
3115                                         char base = flowOrder[j % 4];
3116                                         for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
3117                                                 newSeq += base;
3118                                         }
3119                                 }
3120                                 
3121                                 fastaFile << newSeq.substr(4) << endl;
3122                         }
3123                 }
3124                 fastaFile.close();
3125         
3126                 outputNames.push_back(fastaFileName);
3127         
3128                 if(thisCompositeFASTAFileName != ""){
3129                         m->appendFiles(fastaFileName, thisCompositeFASTAFileName);
3130                 }
3131         }
3132         catch(exception& e) {
3133                 m->errorOut(e, "ShhherCommand", "writeSequences");
3134                 exit(1);        
3135         }               
3136 }
3137
3138 /**************************************************************************************************/
3139
3140 void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string filename, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
3141         try {
3142                 string thisOutputDir = outputDir;
3143                 if (outputDir == "") {  thisOutputDir += m->hasPath(filename);  }
3144                 string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.names";
3145                 ofstream nameFile;
3146                 m->openOutputFile(nameFileName, nameFile);
3147                 
3148                 for(int i=0;i<numOTUs;i++){
3149                         
3150                         if (m->control_pressed) { break; }
3151                         
3152                         if(otuCounts[i] > 0){
3153                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
3154                                 
3155                                 for(int j=1;j<nSeqsPerOTU[i];j++){
3156                                         nameFile << ',' << seqNameVector[aaI[i][j]];
3157                                 }
3158                                 
3159                                 nameFile << endl;
3160                         }
3161                 }
3162                 nameFile.close();
3163                 outputNames.push_back(nameFileName);
3164                 
3165                 
3166                 if(thisCompositeNamesFileName != ""){
3167                         m->appendFiles(nameFileName, thisCompositeNamesFileName);
3168                 }               
3169         }
3170         catch(exception& e) {
3171                 m->errorOut(e, "ShhherCommand", "writeNames");
3172                 exit(1);        
3173         }               
3174 }
3175
3176 /**************************************************************************************************/
3177
3178 void ShhherCommand::writeGroups(string filename, int numSeqs, vector<string>& seqNameVector){
3179         try {
3180                 string thisOutputDir = outputDir;
3181                 if (outputDir == "") {  thisOutputDir += m->hasPath(filename);  }
3182                 string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(filename));
3183                 string groupFileName = fileRoot + "shhh.groups";
3184                 ofstream groupFile;
3185                 m->openOutputFile(groupFileName, groupFile);
3186                 
3187                 for(int i=0;i<numSeqs;i++){
3188                         if (m->control_pressed) { break; }
3189                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
3190                 }
3191                 groupFile.close();
3192                 outputNames.push_back(groupFileName);
3193         
3194         }
3195         catch(exception& e) {
3196                 m->errorOut(e, "ShhherCommand", "writeGroups");
3197                 exit(1);        
3198         }               
3199 }
3200
3201 /**************************************************************************************************/
3202
3203 void ShhherCommand::writeClusters(string filename, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
3204         try {
3205                 string thisOutputDir = outputDir;
3206                 if (outputDir == "") {  thisOutputDir += m->hasPath(filename);  }
3207                 string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.counts";
3208                 ofstream otuCountsFile;
3209                 m->openOutputFile(otuCountsFileName, otuCountsFile);
3210                 
3211                 string bases = flowOrder;
3212                 
3213                 for(int i=0;i<numOTUs;i++){
3214                         
3215                         if (m->control_pressed) {
3216                                 break;
3217                         }
3218                         //output the translated version of the centroid sequence for the otu
3219                         if(otuCounts[i] > 0){
3220                                 int index = centroids[i];
3221                                 
3222                                 otuCountsFile << "ideal\t";
3223                                 for(int j=8;j<numFlowCells;j++){
3224                                         char base = bases[j % 4];
3225                                         for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
3226                                                 otuCountsFile << base;
3227                                         }
3228                                 }
3229                                 otuCountsFile << endl;
3230                                 
3231                                 for(int j=0;j<nSeqsPerOTU[i];j++){
3232                                         int sequence = aaI[i][j];
3233                                         otuCountsFile << seqNameVector[sequence] << '\t';
3234                                         
3235                                         string newSeq = "";
3236                                         
3237                                         for(int k=0;k<lengths[sequence];k++){
3238                                                 char base = bases[k % 4];
3239                                                 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
3240                         
3241                                                 for(int s=0;s<freq;s++){
3242                                                         newSeq += base;
3243                                                         //otuCountsFile << base;
3244                                                 }
3245                                         }
3246                                         otuCountsFile << newSeq.substr(4) << endl;
3247                                 }
3248                                 otuCountsFile << endl;
3249                         }
3250                 }
3251                 otuCountsFile.close();
3252                 outputNames.push_back(otuCountsFileName);
3253         
3254         }
3255         catch(exception& e) {
3256                 m->errorOut(e, "ShhherCommand", "writeClusters");
3257                 exit(1);        
3258         }               
3259 }
3260
3261 /**************************************************************************************************/
3262
3263 void ShhherCommand::getSingleLookUp(){
3264         try{
3265                 //      these are the -log probabilities that a signal corresponds to a particular homopolymer length
3266                 singleLookUp.assign(HOMOPS * NUMBINS, 0);
3267                 
3268                 int index = 0;
3269                 ifstream lookUpFile;
3270                 m->openInputFile(lookupFileName, lookUpFile);
3271                 
3272                 for(int i=0;i<HOMOPS;i++){
3273                         
3274                         if (m->control_pressed) { break; }
3275                         
3276                         float logFracFreq;
3277                         lookUpFile >> logFracFreq;
3278                         
3279                         for(int j=0;j<NUMBINS;j++)      {
3280                                 lookUpFile >> singleLookUp[index];
3281                                 index++;
3282                         }
3283                 }       
3284                 lookUpFile.close();
3285         }
3286         catch(exception& e) {
3287                 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
3288                 exit(1);
3289         }
3290 }
3291
3292 /**************************************************************************************************/
3293
3294 void ShhherCommand::getJointLookUp(){
3295         try{
3296                 
3297                 //      the most likely joint probability (-log) that two intenities have the same polymer length
3298                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
3299                 
3300                 for(int i=0;i<NUMBINS;i++){
3301                         
3302                         if (m->control_pressed) { break; }
3303                         
3304                         for(int j=0;j<NUMBINS;j++){             
3305                                 
3306                                 double minSum = 100000000;
3307                                 
3308                                 for(int k=0;k<HOMOPS;k++){
3309                                         double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
3310                                         
3311                                         if(sum < minSum)        {       minSum = sum;           }
3312                                 }       
3313                                 jointLookUp[i * NUMBINS + j] = minSum;
3314                         }
3315                 }
3316         }
3317         catch(exception& e) {
3318                 m->errorOut(e, "ShhherCommand", "getJointLookUp");
3319                 exit(1);
3320         }
3321 }
3322
3323 /**************************************************************************************************/
3324
3325 double ShhherCommand::getProbIntensity(int intIntensity){                          
3326         try{
3327
3328                 double minNegLogProb = 100000000; 
3329
3330                 
3331                 for(int i=0;i<HOMOPS;i++){//loop signal strength
3332                         
3333                         if (m->control_pressed) { break; }
3334                         
3335                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
3336                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
3337                 }
3338                 
3339                 return minNegLogProb;
3340         }
3341         catch(exception& e) {
3342                 m->errorOut(e, "ShhherCommand", "getProbIntensity");
3343                 exit(1);
3344         }
3345 }
3346
3347
3348
3349